[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# dotenv\n.env\n\n# virtualenv\n.venv\nvenv/\nENV/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n\n.DS_Store\n.idea\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n-   repo: https://github.com/Lucas-C/pre-commit-hooks.git\n    sha: v1.0.1\n    hooks:\n    -   id: remove-crlf\n        files: (?!.*third_party)^.*$ | (?!.*book)^.*$\n-   repo: https://github.com/PaddlePaddle/mirrors-yapf.git\n    sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37\n    hooks:\n    -   id: yapf\n        files: (.*\\.(py|bzl)|BUILD|.*\\.BUILD|WORKSPACE)$\n-   repo: https://github.com/pre-commit/pre-commit-hooks\n    sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0\n    hooks:\n    -   id: check-added-large-files\n    -   id: check-merge-conflict\n    -   id: check-symlinks\n    -   id: detect-private-key\n        files: (?!.*third_party)^.*$ | (?!.*book)^.*$\n    -   id: end-of-file-fixer\n-   repo: local\n    hooks:\n    -   id: clang-format-with-version-check\n        name: clang-format\n        description: Format files with ClangFormat.\n        entry: bash ./codestyle/clang_format.hook -i\n        language: system\n        files: \\.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$\n-   repo: local\n    hooks:\n    -   id: cpplint-cpp-source\n        name: cpplint\n        description: Check C++ code style using cpplint.py.\n        entry: bash ./codestyle/cpplint_pre_commit.hook\n        language: system\n        files: \\.(c|cc|cxx|cpp|cu|h|hpp|hxx)$\n-   repo: local\n    hooks:\n    -   id: pylint-doc-string\n        name: pylint\n        description: Check python docstring style using docstring_checker.\n        entry: bash ./codestyle/pylint_pre_commit.hook\n        language: system\n        files: \\.(py)$\n-   repo: local\n    hooks:\n    -   id: copyright_checker\n        name: copyright_checker\n        entry: python ./codestyle/copyright.hook\n        language: system\n        files: \\.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$\n        exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$\n"
  },
  {
    "path": "Dockerfile",
    "content": "ARG BASE_IMAGE=registry.baidubce.com/paddlepaddle/paddle:2.4.1-gpu-cuda11.2-cudnn8.2-trt8.0\n\nFROM $BASE_IMAGE\n\nWORKDIR /paddle\n\nRUN python -m pip install paddlepaddle-gpu==0.0.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html\n\n# RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetx/develop/requirements.txt && python -m pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple\nCOPY requirements.txt /paddle\n\nRUN python -m pip install -r requirements.txt #-i https://mirror.baidu.com/pypi/simple\n\nENV LD_LIBRARY_PATH=/usr/lib64/:${LD_LIBRARY_PATH}\n\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved\n\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative 
Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License."
  },
  {
    "path": "README.md",
    "content": "<p align=\"center\">\n  <img src=\"./paddlefleetx-logo.png\" align=\"middle\"  width=\"350\" />\n</p>\n\n------------------------------------------------------------------------------------------\n\n<p align=\"center\">\n    <a href=\"./LICENSE\"><img src=\"https://img.shields.io/badge/license-Apache%202-dfd.svg\"></a>\n    <a href=\"https://github.com/PaddlePaddle/PaddleFleetX/releases\"><img src=\"https://img.shields.io/github/v/release/PaddlePaddle/PaddleFleetX?color=ffa\"></a>\n    <a href=\"\"><img src=\"https://img.shields.io/badge/python-3.7+-aff.svg\"></a>\n    <a href=\"https://github.com/PaddlePaddle/PaddleFleetX/graphs/contributors\"><img src=\"https://img.shields.io/github/contributors/PaddlePaddle/PaddleFleetX?color=9ea\"></a>\n    <a href=\"https://github.com/PaddlePaddle/PaddleFleetX/issues\"><img src=\"https://img.shields.io/github/issues/PaddlePaddle/PaddleFleetX?color=9cc\"></a>\n    <a href=\"https://github.com/PaddlePaddle/PaddleFleetX/stargazers\"><img src=\"https://img.shields.io/github/stars/PaddlePaddle/PaddleFleetX?color=ccf\"></a>\n</p>\n\n## 简介\n\nPaddleFleetX是基于飞桨深度学习框架开发的大模型套件，旨在提供高性能、灵活易用的大模型全流程应用能力，在**开发**、**训练**、**精调**、**压缩**、**推理**、**部署**六大环节提供端到端全流程优化。\n\n<p align=\"center\">\n  <img width=\"1000\" alt=\"飞桨大模型套件\" src=\"https://github.com/PaddlePaddle/PaddleFleetX/assets/1371212/ab5e87cc-df52-48cb-9968-8951d3b164ba\">\n</p>\n\n## 特色介绍\n\n### 大模型开发：动静统一开发模式，4D混合并行策略灵活配置\n\n<p align=\"center\">\n  <img width=\"771\" alt=\"大模型开发\" src=\"https://github.com/PaddlePaddle/PaddleFleetX/assets/1371212/95d1c0e8-df92-489b-8472-0a8b438cbfcf\">\n</p>\n\n基于飞桨动静统一的开发模式，大模型套件全面使用动态图开发，在Generate API中可自动完成算子融合具备静态图的调试性能。全场景统一训练器Trainer可以轻松完成4D混合并行的配置，在预训练与精调环节皆可使用。\n\n### 大模型训练：发挥基础计算潜能、全面提升分布式效率\n\n飞桨针对大模型训练，对数据读取、混合精度计算策略、高性能算子库、并行策略自动寻优、流水线调度的整个全流程实现优化，助力文心大模型训练速度提升3倍。\n\n<p align=\"center\">  \n  <img width=\"1000\" alt=\"飞桨支持大模型训练\" 
src=\"https://github.com/PaddlePaddle/PaddleFleetX/assets/1371212/3874440d-0b0c-4730-bbcb-f9b87900d75f\">\n</p>\n\n\n\n### 大模型精调：主流精调算法实现性能全面领先\n\n提供了主流的精调算法，包括SFT、Prefix-Tuning、LoRA三种主流的精调算法，有效降低了大模型训练的资源门槛。统一的训练器Trainer实现了预训练加速技术在精调场景的复用，并通过变长数据流优化大幅提升精调性能。\n\n<p align=\"center\">\n  <img width=\"800\" alt=\"大模型精调\" src=\"https://github.com/PaddlePaddle/PaddleFleetX/assets/1371212/0dad24ae-0549-4166-8426-b0a471a82450\">\n</p>\n\n\n### 大模型压缩：自研量化压缩算法实现无损量化\n\n飞桨自研的Shift-SmoothQuant算法相比SmoothQuant算法可以实现更平滑的激活分布，有效提升量化后模型的精度和生成结果的稳定性。通过PaddleSlim的大模型压缩工具，我们在 C-Eval 和 NL2SQL 两个数据集上对主流开源大模型可以实现无损量化。更多技术介绍与使用说明可以参考[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)。\n\n<p align=\"center\">\n  <img width=\"350\" alt=\"模型压缩\" src=\"https://github.com/PaddlePaddle/PaddleFleetX/assets/1371212/8b8334d6-dc1a-4ab8-a2f6-dbbece6f0e1e\">\n</p>\n<p align=\"center\">\n  <img width=\"798\" alt=\"模型压缩\" src=\"https://github.com/PaddlePaddle/PaddleFleetX/assets/1371212/badb3f10-314a-4259-8179-08f940197352\">\n</p>\n\n### 大模型推理：针对大模型场景特性匹配最优量化推理方案\n\nPaddle Inference针对大模型Prompt阶段与Token Generation阶段的计算特性的不同，在通用场景提供静态量化，在访存受限场景提供混合量化与低比特的推理方案。\n\n<p align=\"center\">\n  <img width=\"1000\" alt=\"飞桨支撑大模型推理\" src=\"https://github.com/PaddlePaddle/PaddleFleetX/assets/1371212/6bf2a373-a550-4359-9285-6fa4337e550d\">\n</p>\n\n<p align=\"center\">\n  <img width=\"400\" alt=\"推理引擎\" src=\"https://github.com/PaddlePaddle/PaddleFleetX/assets/1371212/8d9ab6f9-fc63-4485-bcf2-f9791b1de273\">\n</p>\n\n\n### 大模型部署：实时感知负载动态插入请求，最大化硬件利用率\n\n由于大模型生成场景解码阶段耗时较长，且不同Query下生成长度不一，为了最大化服务吞吐，我们在FastDeploy服务框架结合推理引擎实现了动态插入技术，可实时感知服务负载，动态插入用户请求最大化推理硬件利用率。\n\n<p align=\"center\">\n  <img width=\"350\" alt=\"大模型服务部署\" src=\"https://github.com/PaddlePaddle/PaddleFleetX/assets/1371212/d2e38f78-9088-4b1a-a9bd-1018385b5b86\">\n</p>\n\n\n## PaddleFleetX 应用案例\n\n### 
大语言模型\n\n基于PaddleFleetX的核心能力，我们在PaddleNLP中提供了丰富的大语言模型全流程开发与应用示例，更多详细使用说明可以参考[PaddleNLP大语言模型](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm)。\n\n### 跨模态大模型\n\n除了大语言模型外，PaddleFleetX还提供跨模态大模型的开发与训练，如多模态预训练、文生图扩散模型等，覆盖图片、文本、视频和音频等模态，更多详细使用说明可以参考[PaddleMIX](https://github.com/PaddlePaddle/PaddleMIX)。\n\n### 生物计算大模型\n\n在生物计算领域，基于飞桨4D并行策略与高性能优化，我们在PaddleHelix中提供众多业界领先的生物计算预训练模型，更多详细使用说明可以参考[PaddleHelix](https://github.com/PaddlePaddle/PaddleHelix)。\n\n\n## Citation\n\n```\n@misc{paddlefleetx,\n    title={PaddleFleetX: An Easy-to-use and High-Performance One-stop Tool for Deep Learning},\n    author={PaddleFleetX Contributors},\n    howpublished = {\\url{https://github.com/PaddlePaddle/PaddleFleetX}},\n    year={2022}\n}\n```\n\n## License\n\nPaddleFleetX 基于 [Apache 2.0 license](./LICENSE) 许可发布。\n"
  },
  {
    "path": "benchmarks/README.md",
    "content": ""
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C1/ernie_bs16_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\n\nmodel=ernie\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C1/ernie_bs16_fp32_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=16\nfp_item=fp32\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\n\nmodel=ernie\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C8/ernie_bs16_fp16_DP2-MP2-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=2\nmp_degree=2\npp_degree=2\nbs_item=16\nfp_item=fp16\nrun_mode=DP2-MP2-PP2\ndevice_num=N1C8\n\nmodel=ernie\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C8/ernie_bs16_fp32_DP2-MP2-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=2\nmp_degree=2\npp_degree=2\nbs_item=16\nfp_item=fp32\nrun_mode=DP2-MP2-PP2\ndevice_num=N1C8\n\nmodel=ernie\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp16_DP1-MP8-PP4.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=1\nmp_degree=8\npp_degree=4\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP8-PP4\ndevice_num=N4C32\n\nmodel=ernie\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp16_DP2-MP8-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=2\nmp_degree=8\npp_degree=2\nbs_item=16\nfp_item=fp16\nrun_mode=DP2-MP8-PP2\ndevice_num=N4C32\n\nmodel=ernie\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp16_DP4-MP8-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=4\nmp_degree=8\npp_degree=1\nbs_item=16\nfp_item=fp16\nrun_mode=DP4-MP8-PP1\ndevice_num=N4C32\n\nmodel=ernie\nmicro_bs=4\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp32_DP1-MP8-PP4.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=1\nmp_degree=8\npp_degree=4\nbs_item=16\nfp_item=fp32\nrun_mode=DP1-MP8-PP4\ndevice_num=N4C32\n\nmodel=ernie\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp32_DP2-MP8-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=2\nmp_degree=8\npp_degree=2\nbs_item=16\nfp_item=fp32\nrun_mode=DP2-MP8-PP2\ndevice_num=N4C32\n\nmodel=ernie\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp32_DP4-MP8-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=ernie\ndp_degree=4\nmp_degree=8\npp_degree=1\nbs_item=16\nfp_item=fp32\nrun_mode=DP4-MP8-PP1\ndevice_num=N4C32\n\nmodel=ernie\nmicro_bs=4\n\ncd ./benchmarks\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get data\ncd ../\nrm -rf dataset/ernie\nmkdir -p dataset/ernie\nwget -O dataset/ernie/cluecorpussmall_14g_1207_ids_part0 https://paddlefleetx.bj.bcebos.com/model/nlp/ernie/cluecorpussmall_14g_1207_ids_part0\nwget -O dataset/ernie/cluecorpussmall_14g_1207_ids_part1 https://paddlefleetx.bj.bcebos.com/model/nlp/ernie/cluecorpussmall_14g_1207_ids_part1\ncat dataset/ernie/cluecorpussmall_14g_1207_ids_part* > dataset/ernie/cluecorpussmall_14g_1207_ids.npy\nwget -O dataset/ernie/cluecorpussmall_14g_1207_idx.npz https://paddlefleetx.bj.bcebos.com/model/nlp/ernie/cluecorpussmall_14g_1207_idx.npz\n"
  },
  {
    "path": "benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    dp_degree=${3:-\"1\"}             # (必选) dp数据并行度\n    mp_degree=${4:-\"1\"}             # (必选) mp数据并行度\n    pp_degree=${5:-\"1\"}             # (必选) pp数据并行度\n    micro_batch_size=${6:-\"2\"}      # (必选) micro_batch_size\n    global_batch_size=${7:-\"16\"}    # （必选）global_batch_size\n    run_mode=${8:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${9:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"tokens/s\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"ips:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=\"loss:\"        # (可选)解析日志，筛选出收敛数据所在行的关键字 如：convergence_key=\"loss:\"\n    max_iter=${10:-500}                      # 
（可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    use_sharding=${11:-\"false\"}               # （可选) 是否使用Sharding\n    num_workers=0                  # (可选)\n    base_batch_size=$global_batch_size\n    use_recompute=${12:-\"False\"}    # (可选)是否打开recompute\n    sharding_stage=${13:-\"1\"}       # (可选)sharding case\n    sharding_offload=${14:-\"False\"} # (可选)\n    eval_freq=${15:-\"1000000\"}         # (可选)\n    sharding_degree=${16:-\"1\"}      # (可选)\n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`\n    num_attention_heads=16 
#\"gpt2-medium-en\"\n    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #\"gpt2-small-en\"\n    num_layers=24 #\"gpt2-medium-en\"\n    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #\"gpt2-small-en\"\n    use_pure_fp16=False # fp32\n    if [ \"fp16\" = ${fp_item} ]; then use_pure_fp16=True; fi\n    train_cmd=\"-o Global.seed=1234 \\\n               -o Global.local_batch_size=${local_batch_size} \\\n               -o Global.micro_batch_size=${micro_batch_size} \\\n               -o Engine.max_steps=${max_iter} \\\n               -o Engine.eval_freq=${eval_freq} \\\n               -o Engine.mix_precision.enable=${use_pure_fp16} \\\n               -o Engine.save_load.save_steps=100000 \\\n               -o Model.hidden_size=1024 \\\n               -o Model.num_hidden_layers=${num_layers} \\\n               -o Model.num_attention_heads=${num_attention_heads} \\\n               -o Model.use_recompute=${use_recompute} \\\n               -o Data.Train.dataset.input_dir=./dataset/ernie \\\n               -o Data.Eval.dataset.input_dir=./dataset/ernie \\\n               -o Distributed.dp_degree=${dp_degree} \\\n               -o Distributed.mp_degree=${mp_degree} \\\n               -o Distributed.pp_degree=${pp_degree} \\\n               -o Distributed.sharding.sharding_degree=${sharding_degree} \\\n               -o Distributed.sharding.sharding_stage=${sharding_stage} \\\n               -o Distributed.sharding.sharding_offload=${sharding_offload} \\\n               -o Optimizer.lr.max_lr=1e-4 \\\n               -o Optimizer.lr.min_lr=1e-5 \"\n\n    if [ ${PADDLE_TRAINER_ID} ]\n    then\n        PADDLE_RANK_OPTION=\" --rank ${PADDLE_TRAINER_ID}\"\n    else\n        PADDLE_RANK_OPTION=\"\"\n    fi\n    # 以下为通用执行命令，无特殊可不用修改\n    # hybrid_parallelism case\n    case ${run_mode} in\n    DP1-MP1-PP1) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0 
${PADDLE_RANK_OPTION} \\\n            tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    DP2-MP1-PP1) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1 ${PADDLE_RANK_OPTION}\\\n            tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    DP2-MP2-PP2|DP2-MP8-PP2|DP4-MP8-PP1|DP1-MP8-PP4) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\\\n            tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    *) echo \"choose run_mode \"; exit 1;\n    esac\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n    if [[ ${model_item} =~ \"CE\" ]];then # CE精度-不限制执行时间\n        timeout 240m ${train_cmd} > ${log_file} 2>&1\n    else\n        timeout 15m ${train_cmd} > ${log_file} 2>&1\n    fi\n    if [ $? -ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id} ${log_file}\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/data_parallel/N1C8/gpt_1024_bs64_fp16_DP8-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_1024\ndp_degree=8\nmp_degree=1\npp_degree=1\nbs_item=64\nfp_item=fp16\nrun_mode=DP8-MP1-PP1\ndevice_num=N1C8\nyaml_path=./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${yaml_path} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/data_parallel/N1C8/gpt_1024_flash_bs64_fp16_DP8-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_1024_flash\ndp_degree=8\nmp_degree=1\npp_degree=1\nbs_item=64\nfp_item=fp16\nrun_mode=DP8-MP1-PP1\ndevice_num=N1C8\nyaml_path=./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${yaml_path} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/data_parallel/N1C8/gpt_2048_bs64_fp16_DP8-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_2048\ndp_degree=8\nmp_degree=1\npp_degree=1\nbs_item=64\nfp_item=fp16\nrun_mode=DP8-MP1-PP1\ndevice_num=N1C8\nyaml_path=./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${yaml_path} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get data\ncd ../\nrm -rf data\nmkdir data\nwget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    dp_degree=${3:-\"1\"}             # (必选) dp数据并行度\n    mp_degree=${4:-\"1\"}             # (必选) mp数据并行度\n    pp_degree=${5:-\"1\"}             # (必选) pp数据并行度\n    micro_batch_size=${6:-\"2\"}      # (必选) micro_batch_size\n    global_batch_size=${7:-\"16\"}    # （必选）global_batch_size\n    run_mode=${8:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${9:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"tokens/s\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"ips:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=\"loss:\"        # (可选)解析日志，筛选出收敛数据所在行的关键字 如：convergence_key=\"loss:\"\n    
yaml_path=${10:-\"./pretrain/configs/pretrain_gpt_345M_single_card.yaml\"}\n    max_iter=${11:-500}                      # （可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    num_workers=0                  # (可选)\n    base_batch_size=$global_batch_size\n    eval_freq=${12:-\"1000\"}         # (可选)模型评估间隔\n    use_recompute=${13:-\"False\"}    # (可选)是否打开recompute\n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    if [ ${model_item} = \"gpt_1024_flash\" ];then\n        args=\"-o Model.use_flash_attn=True\"\n    else\n        args=\"\"\n    fi\n\n    train_cmd=\"-c ${yaml_path} ${args} \\\n               -o 
Engine.max_steps=${max_iter} \\\n               -o Engine.eval_freq=${eval_freq} \\\n               -o Engine.save_load.save_steps=100000 \\\n               -o Distributed.dp_degree=${dp_degree} \\\n               \"\n\n    if [ ${PADDLE_TRAINER_ID} ]\n    then\n        PADDLE_RANK_OPTION=\" --rank ${PADDLE_TRAINER_ID}\"\n    else\n        PADDLE_RANK_OPTION=\"\"\n    fi\n    # 以下为通用执行命令，无特殊可不用修改\n    case ${run_mode} in\n    DP8-MP1-PP1) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\\\n            tools/train.py \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    *) echo \"choose run_mode \"; exit 1;\n    esac\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n    if [[ ${model_item} =~ \"CE\" ]];then # CE精度-不限制执行时间\n        ${train_cmd} > ${log_file} 2>&1\n    else\n        timeout 15m ${train_cmd} > ${log_file} 2>&1\n    fi\n    if [ $? -ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id} ${log_file}\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_CoLA_bs32_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=CE_gpt_finetune_CoLA\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=32\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nconvergence_key=mcc:\ndataset=CoLA\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${convergence_key} ${dataset} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_MRPC_acc_bs32_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=CE_gpt_finetune_MRPC_acc\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=32\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nconvergence_key=acc:\ndataset=MRPC\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${convergence_key} ${dataset} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_MRPC_f1_bs32_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=CE_gpt_finetune_MRPC_f1\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=32\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nconvergence_key=f1:\ndataset=MRPC\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${convergence_key} ${dataset} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_QNLI_bs32_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=CE_gpt_finetune_QNLI\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=32\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nconvergence_key=acc:\ndataset=QNLI\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${convergence_key} ${dataset} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_RTE_bs32_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=CE_gpt_finetune_RTE\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=32\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nconvergence_key=acc:\ndataset=RTE\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${convergence_key} ${dataset} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_SST2_bs32_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=CE_gpt_finetune_SST2\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=32\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nconvergence_key=acc:\ndataset=SST2\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${convergence_key} ${dataset} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_STSB_pearson_bs32_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=CE_gpt_finetune_STSB_pearson\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=32\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nconvergence_key=pearson:\ndataset=STSB\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${convergence_key} ${dataset} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_STSB_spearman_bs32_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=CE_gpt_finetune_STSB_spearman\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=32\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nconvergence_key=spearman:\ndataset=STSB\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${convergence_key} ${dataset} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_WNLI_bs32_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=CE_gpt_finetune_WNLI\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=32\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nconvergence_key=acc:\ndataset=WNLI\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh\n# run\nsed -i \"s/num_train_epochs=5/num_train_epochs=20/g\" ../projects/gpt/finetune_gpt_345M_single_card.sh\nbash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${convergence_key} ${dataset} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get ckpt\ncd ../\nrm -rf ckpt\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    dp_degree=${3:-\"1\"}             # (必选) dp数据并行度\n    mp_degree=${4:-\"1\"}             # (必选) mp数据并行度\n    pp_degree=${5:-\"1\"}             # (必选) pp数据并行度\n    micro_batch_size=${6:-\"2\"}      # (必选) micro_batch_size\n    global_batch_size=${7:-\"16\"}    # （必选）global_batch_size\n    run_mode=${8:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${9:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"steps/s\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"ips:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=${10:-\"loss:\"}        # (可选)解析日志，筛选出收敛数据所在行的关键字 如：convergence_key=\"loss:\"\n    dataset=${11:-\"CoLA\"}                 # 数据集\n    
max_iter=${12:-500}                      # （可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    base_batch_size=$global_batch_size\n    sharding_degree=${13-\"1\"}      # (可选)\n    sharding_stage=${14:-\"1\"}       # (可选)sharding case\n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    # if [ ${model_item} = \"gpt3_moe\" ];then\n    #     static_scripts=\"../examples/language_model/gpt-moe/dygraph/\"\n    # else\n    #     echo \"not supported model item: ${model_item}\"; exit 1;\n    # fi\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    # data_path=\"./data/\"\n\n\n    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`\n\n    
train_cmd=\"${dataset}\"\n\n\n    # 以下为通用执行命令，无特殊可不用修改\n\n    # hybrid_parallelism case\n    case ${run_mode} in\n    DP1-MP1-PP1) echo \"run run_mode: DP1-MP1-PP1\"\n        train_cmd=\"bash projects/gpt/finetune_gpt_345M_single_card.sh \\\n            ${train_cmd}\"\n        ;;\n    *) echo \"choose run_mode \"; exit 1;\n    esac\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n\n    workerlog_id=0\n    timeout 40m ${train_cmd} > ${log_file} 2>&1\n    if [ $? -ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id} ${log_file}\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C1/gpt_bs16_fp16_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C1/gpt_bs16_fp32_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=16\nfp_item=fp32\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\n\nmodel=gpt\nmicro_bs=${bs_item}\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C4/gpt_bs16_fp16_DP1-MP1-PP4.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=1\npp_degree=4\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP1-PP4\ndevice_num=N1C4\n\nmodel=gpt\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C4/gpt_bs16_fp16_DP1-MP4-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=4\npp_degree=1\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP4-PP1\ndevice_num=N1C4\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP1-PP8.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=1\npp_degree=8\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP1-PP8\ndevice_num=N1C8\n\nmodel=gpt\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP2-PP4.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=2\npp_degree=4\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP2-PP4\ndevice_num=N1C8\n\nmodel=gpt\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP4-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=4\npp_degree=2\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP4-PP2\ndevice_num=N1C8\n\nmodel=gpt\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP8-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=8\npp_degree=1\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP8-PP1\ndevice_num=N1C8\n\nmodel=gpt\nmicro_bs=16\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP2-MP2-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=2\nmp_degree=2\npp_degree=2\nbs_item=16\nfp_item=fp16\nrun_mode=DP2-MP2-PP2\ndevice_num=N1C8\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp32_DP2-MP2-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=2\nmp_degree=2\npp_degree=2\nbs_item=16\nfp_item=fp32\nrun_mode=DP2-MP2-PP2\ndevice_num=N1C8\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs64_fp16_DP8-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=8\nmp_degree=1\npp_degree=1\nbs_item=64\nfp_item=fp16\nrun_mode=DP8-MP1-PP1\ndevice_num=N1C8\nmax_iter=500\nuse_recompute=True\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${max_iter} ${use_recompute} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs64_fp32_DP8-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=8\nmp_degree=1\npp_degree=1\nbs_item=64\nfp_item=fp32\nrun_mode=DP8-MP1-PP1\ndevice_num=N1C8\nmax_iter=500\nuse_recompute=True\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${max_iter} ${use_recompute} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_recompute_bs16_fp16_DP2-MP2-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_recompute\ndp_degree=2\nmp_degree=2\npp_degree=2\nbs_item=16\nfp_item=fp16\nrun_mode=DP2-MP2-PP2\ndevice_num=N1C8\nmax_iter=500\nuse_recompute=True\n\nmodel=gpt\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${max_iter} ${use_recompute} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_recompute_bs16_fp32_DP2-MP2-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_recompute\ndp_degree=2\nmp_degree=2\npp_degree=2\nbs_item=16\nfp_item=fp32\nrun_mode=DP2-MP2-PP2\ndevice_num=N1C8\nmax_iter=500\nuse_recompute=True\n\nmodel=gpt\nmicro_bs=2\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${max_iter} ${use_recompute} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp16_DP1-MP8-PP4.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=8\npp_degree=4\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP8-PP4\ndevice_num=N4C32\n\nmodel=gpt\nmicro_bs=4\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp16_DP2-MP8-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=2\nmp_degree=8\npp_degree=2\nbs_item=16\nfp_item=fp16\nrun_mode=DP2-MP8-PP2\ndevice_num=N4C32\n\nmodel=gpt\nmicro_bs=4\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp16_DP4-MP8-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=4\nmp_degree=8\npp_degree=1\nbs_item=16\nfp_item=fp16\nrun_mode=DP4-MP8-PP1\ndevice_num=N4C32\n\nmodel=gpt\nmicro_bs=4\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp32_DP1-MP8-PP4.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=1\nmp_degree=8\npp_degree=4\nbs_item=16\nfp_item=fp32\nrun_mode=DP1-MP8-PP4\ndevice_num=N4C32\n\nmodel=gpt\nmicro_bs=4\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp32_DP2-MP8-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=2\nmp_degree=8\npp_degree=2\nbs_item=16\nfp_item=fp32\nrun_mode=DP2-MP8-PP2\ndevice_num=N4C32\n\nmodel=gpt\nmicro_bs=4\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp32_DP4-MP8-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt\ndp_degree=4\nmp_degree=8\npp_degree=1\nbs_item=16\nfp_item=fp32\nrun_mode=DP4-MP8-PP1\ndevice_num=N4C32\n\nmodel=gpt\nmicro_bs=4\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get data\ncd ../\nrm -rf data\nmkdir data\nwget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    dp_degree=${3:-\"1\"}             # (必选) dp数据并行度\n    mp_degree=${4:-\"1\"}             # (必选) mp数据并行度\n    pp_degree=${5:-\"1\"}             # (必选) pp数据并行度\n    micro_batch_size=${6:-\"2\"}      # (必选) micro_batch_size\n    global_batch_size=${7:-\"16\"}    # （必选）global_batch_size\n    run_mode=${8:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${9:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"tokens/s\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"ips:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=\"loss:\"        # (可选)解析日志，筛选出收敛数据所在行的关键字 如：convergence_key=\"loss:\"\n    max_iter=${10:-500}                      # 
（可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    num_workers=0                  # (可选)\n    base_batch_size=$global_batch_size\n    use_recompute=${11:-\"False\"}    # (可选)是否打开recompute\n    eval_freq=${12:-\"1000\"}         # (可选)模型评估间隔\n    sharding_degree=${13:-\"1\"}      # (可选)分组切分并行维度\n    sharding_stage=${14:-\"1\"}       # (可选)切分策略；1表示仅切分优化器状态，2表示再切分梯度，3表示再切分前向参数\n    sharding_offload=${15:-\"False\"} # (可选)CPU offload策略\n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`\n    num_attention_heads=16 #\"gpt2-medium-en\"\n    if 
[ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #\"gpt2-small-en\"\n    num_layers=24 #\"gpt2-medium-en\"\n    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #\"gpt2-small-en\"\n    use_pure_fp16=False\n    if [ \"fp16\" = ${fp_item} ]; then use_pure_fp16=True; fi\n    train_cmd=\"-o Global.seed=1234 \\\n               -o Global.local_batch_size=${local_batch_size} \\\n               -o Global.micro_batch_size=${micro_batch_size} \\\n               -o Engine.max_steps=${max_iter} \\\n               -o Engine.eval_freq=${eval_freq} \\\n               -o Engine.mix_precision.enable=${use_pure_fp16} \\\n               -o Engine.save_load.save_steps=100000 \\\n               -o Model.hidden_size=1024 \\\n               -o Model.num_layers=${num_layers} \\\n               -o Model.num_attention_heads=${num_attention_heads} \\\n               -o Model.type_vocab_size=1 \\\n               -o Model.use_recompute=${use_recompute} \\\n               -o Distributed.dp_degree=${dp_degree} \\\n               -o Distributed.mp_degree=${mp_degree} \\\n               -o Distributed.pp_degree=${pp_degree} \\\n               -o Distributed.sharding.sharding_degree=${sharding_degree} \\\n               -o Distributed.sharding.sharding_stage=${sharding_stage} \\\n               -o Distributed.sharding.sharding_offload=${sharding_offload} \\\n               -o Optimizer.lr.max_lr=1e-4 \\\n               -o Optimizer.lr.min_lr=1e-5 \"\n\n    if [ ${PADDLE_TRAINER_ID} ]\n    then\n        PADDLE_RANK_OPTION=\" --rank ${PADDLE_TRAINER_ID}\"\n    else\n        PADDLE_RANK_OPTION=\"\"\n    fi\n    # 以下为通用执行命令，无特殊可不用修改\n    case ${run_mode} in\n    DP1-MP1-PP1) echo \"run run_mode: DP1-MP1-PP1\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION}\\\n              tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \\\n              ${train_cmd}\" \n        
workerlog_id=0\n        ;;\n    DP1-MP1-PP4|DP1-MP4-PP1) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3 ${PADDLE_RANK_OPTION}\\\n            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    DP8-MP1-PP1|DP1-MP8-PP1|DP1-MP1-PP8|DP1-MP2-PP4|DP1-MP4-PP2|DP2-MP2-PP2| \\\n    DP2-MP8-PP2|DP4-MP8-PP1|DP1-MP8-PP4) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\\\n            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    *) echo \"choose run_mode \"; exit 1;\n    esac\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n    if [[ ${model_item} =~ \"CE\" ]];then # CE精度-不限制执行时间\n        ${train_cmd} > ${log_file} 2>&1\n    else\n        timeout 15m ${train_cmd} > ${log_file} 2>&1\n    fi\n    if [ $? -ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id} ${log_file}\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N1C8/gpt_sp_False_bs8_fp16_DP1-MP8-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_sp_False\ndp_degree=1\nmp_degree=8\npp_degree=1\nbs_item=8\nfp_item=fp16\nrun_mode=DP1-MP8-PP1\ndevice_num=N1C8\nsequence_parallel=False\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${sequence_parallel} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N1C8/gpt_sp_True_bs8_fp16_DP1-MP8-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_sp_True\ndp_degree=1\nmp_degree=8\npp_degree=1\nbs_item=8\nfp_item=fp16\nrun_mode=DP1-MP8-PP1\ndevice_num=N1C8\nsequence_parallel=True\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${sequence_parallel} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N4C32/gpt_sp_False_bs16_fp16_DP2-MP8-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_sp_False\ndp_degree=2\nmp_degree=8\npp_degree=2\nbs_item=16\nfp_item=fp16\nrun_mode=DP2-MP8-PP2\ndevice_num=N4C32\nsequence_parallel=False\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${sequence_parallel} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N4C32/gpt_sp_True_bs16_fp16_DP2-MP8-PP2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_sp_True\ndp_degree=2\nmp_degree=8\npp_degree=2\nbs_item=16\nfp_item=fp16\nrun_mode=DP2-MP8-PP2\ndevice_num=N4C32\nsequence_parallel=True\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${sequence_parallel} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get data\ncd ../\nrm -rf data\nmkdir data\nwget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    dp_degree=${3:-\"1\"}             # (必选) dp数据并行度\n    mp_degree=${4:-\"1\"}             # (必选) mp数据并行度\n    pp_degree=${5:-\"1\"}             # (必选) pp数据并行度\n    micro_batch_size=${6:-\"2\"}      # (必选) micro_batch_size\n    global_batch_size=${7:-\"16\"}    # （必选）global_batch_size\n    run_mode=${8:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${9:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"tokens/s\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"ips:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=\"loss:\"        # (可选)解析日志，筛选出收敛数据所在行的关键字 如：convergence_key=\"loss:\"\n    sequence_parallel=${10:-\"False\"}    # (可选)是否打开sequence_parallel\n    
max_iter=${11:-1000}                      # （可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    eval_freq=${12:-\"1000\"}         # (可选)模型评估间隔\n    num_workers=0                  # (可选)\n    base_batch_size=$global_batch_size\n    use_recompute=${13:-\"True\"}    # (可选)是否打开recompute\n    sharding_degree=${14:-\"1\"}      # (可选)分组切分并行维度\n    sharding_stage=${15:-\"1\"}       # (可选)切分策略；1表示仅切分优化器状态，2表示再切分梯度，3表示再切分前向参数\n    sharding_offload=${16:-\"False\"} # (可选)CPU offload策略\n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`\n    
num_attention_heads=16 #\"gpt2-medium-en\"\n    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #\"gpt2-small-en\"\n    num_layers=24 #\"gpt2-medium-en\"\n    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #\"gpt2-small-en\"\n    use_pure_fp16=False\n    if [ \"fp16\" = ${fp_item} ]; then use_pure_fp16=True; fi\n    train_cmd=\"-o Engine.max_steps=${max_iter} \\\n               -o Engine.eval_iters=${eval_freq} \\\n               -o Distributed.dp_degree=${dp_degree} \\\n               -o Distributed.mp_degree=${mp_degree} \\\n               -o Distributed.pp_degree=${pp_degree} \\\n               -o Distributed.sharding.sharding_degree=${sharding_degree} \\\n               -o Distributed.sharding.sharding_stage=${sharding_stage} \\\n               -o Distributed.sharding.sharding_offload=${sharding_offload} \\\n               -o Model.sequence_parallel=${sequence_parallel}  \\\n               -o Distributed.sharding.reduce_overlap=False \\\n               -o Distributed.sharding.broadcast_overlap=False \\\n               -o Optimizer.tensor_fusion=False \"\n\n    if [ ${PADDLE_TRAINER_ID} ]\n    then\n        PADDLE_RANK_OPTION=\" --rank ${PADDLE_TRAINER_ID}\"\n    else\n        PADDLE_RANK_OPTION=\"\"\n    fi\n    # 以下为通用执行命令，无特殊可不用修改\n    case ${run_mode} in\n    DP1-MP1-PP1) echo \"run run_mode: DP1-MP1-PP1\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION}\\\n            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    DP1-MP8-PP1) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\\\n            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    DP2-MP8-PP2) 
echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\\\n            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    *) echo \"choose run_mode \"; exit 1;\n    esac\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n    if [[ ${model_item} =~ \"CE\" ]];then # CE精度-不限制执行时间\n        ${train_cmd} > ${log_file} 2>&1\n    else\n        timeout 60m ${train_cmd} > ${log_file} 2>&1\n    fi\n    if [ $? -ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id} ${log_file}\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sharding/N1C2/gpt_stage2_bs16_fp16_DP1-MP1-PP1-Sharding2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_stage2\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP1-PP1-Sharding2\ndevice_num=N1C2\nsharding_degree=2\nsharding_stage=2\nsharding_offload=True\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${sharding_degree} ${sharding_stage} ${sharding_offload} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sharding/N1C2/gpt_stage3_bs16_fp16_DP1-MP1-PP1-Sharding2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_stage3\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=16\nfp_item=fp16\nrun_mode=DP1-MP1-PP1-Sharding2\ndevice_num=N1C2\nsharding_degree=2\nsharding_stage=3\nsharding_offload=True\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${sharding_degree} ${sharding_stage} ${sharding_offload} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sharding/N1C2/gpt_stage3_bs16_fp32_DP1-MP1-PP1-Sharding2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_stage3\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=16\nfp_item=fp32\nrun_mode=DP1-MP1-PP1-Sharding2\ndevice_num=N1C2\nsharding_degree=2\nsharding_stage=3\nsharding_offload=True\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${sharding_degree} ${sharding_stage} ${sharding_offload} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sharding/N2C16/gpt_stage2_bs128_fp16_DP1-MP1-PP1-Sharding16.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_stage2\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=128\nfp_item=fp16\nrun_mode=DP1-MP1-PP1-Sharding16\ndevice_num=N2C16\nsharding_degree=16\nsharding_stage=2\nsharding_offload=True\nmax_iter=30\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${sharding_degree} ${sharding_stage} ${sharding_offload} ${max_iter} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get data\ncd ../\nrm -rf data\nmkdir data\nwget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    dp_degree=${3:-\"1\"}             # (必选) dp数据并行度\n    mp_degree=${4:-\"1\"}             # (必选) mp数据并行度\n    pp_degree=${5:-\"1\"}             # (必选) pp数据并行度\n    micro_batch_size=${6:-\"2\"}      # (必选) micro_batch_size\n    global_batch_size=${7:-\"16\"}    # （必选）global_batch_size\n    run_mode=${8:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${9:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"tokens/s\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"ips:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=\"loss:\"        # (可选)解析日志，筛选出收敛数据所在行的关键字 如：convergence_key=\"loss:\"\n    sharding_degree=${10:-\"1\"}      # (可选)分组切分并行维度\n    
sharding_stage=${11:-\"1\"}       # (可选)切分策略；1表示仅切分优化器状态，2表示再切分梯度，3表示再切分前向参数\n    sharding_offload=${12:-\"False\"} # (可选)CPU offload策略\n    max_iter=${13:-500}                      # （可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    eval_freq=${14:-\"1000\"}         # (可选)模型评估间隔\n    num_workers=0                  # (可选)\n    base_batch_size=$global_batch_size\n    use_recompute=${15:-\"True\"}    # (可选)是否打开recompute\n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`\n    use_pure_fp16=False\n    if [ \"fp16\" = ${fp_item} ]; then 
use_pure_fp16=True; fi\n    train_cmd=\"-o Global.local_batch_size=${local_batch_size} \\\n               -o Global.micro_batch_size=${micro_batch_size} \\\n               -o Engine.max_steps=${max_iter} \\\n               -o Engine.eval_freq=${eval_freq} \\\n               -o Engine.mix_precision.enable=${use_pure_fp16} \\\n               -o Engine.save_load.save_steps=100000 \\\n               -o Model.use_recompute=${use_recompute} \\\n               -o Distributed.dp_degree=${dp_degree} \\\n               -o Distributed.mp_degree=${mp_degree} \\\n               -o Distributed.pp_degree=${pp_degree} \\\n               -o Distributed.sharding.sharding_degree=${sharding_degree} \\\n               -o Distributed.sharding.sharding_stage=${sharding_stage} \\\n               -o Distributed.sharding.sharding_offload=${sharding_offload} \\\n                \"\n\n    if [ ${PADDLE_TRAINER_ID} ]\n    then\n        PADDLE_RANK_OPTION=\" --rank ${PADDLE_TRAINER_ID}\"\n    else\n        PADDLE_RANK_OPTION=\"\"\n    fi\n    # 以下为通用执行命令，无特殊可不用修改\n    case ${run_mode} in\n    DP1-MP1-PP1-Sharding2) echo \"run run_mode: DP1-MP1-PP1-Sharding2\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1 ${PADDLE_RANK_OPTION}\\\n            ./tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \\\n            -o Global.seed=1234 \\\n            -o Model.hidden_size=1024 \\\n            -o Model.num_layers=4 \\\n            -o Model.num_attention_heads=4 \\\n            -o Model.type_vocab_size=1 \\\n            -o Optimizer.lr.max_lr=1e-4 \\\n            -o Optimizer.lr.min_lr=1e-5 \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    DP1-MP1-PP1-Sharding16) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\\\n            ./tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml \\\n        
    -o Engine.logging_freq=1 \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    *) echo \"choose run_mode \"; exit 1;\n    esac\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n    if [[ ${model_item} =~ \"CE\" ]];then # CE精度-不限制执行时间\n        ${train_cmd} > ${log_file} 2>&1\n    else\n        timeout 70m ${train_cmd} > ${log_file} 2>&1\n    fi\n    if [ $? -ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id} ${log_file}\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/static/auto_parallel/N1C1/gpt_auto_recompute_bs8_fp32_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=gpt_auto_recompute\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=8\nfp_item=fp32\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nmax_iter=500\nuse_recompute=True\n\nmodel=gpt\nmicro_bs=8\n\ncd ./benchmarks\nbash ./test_tipc/gpt/static/auto_parallel/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/gpt/static/auto_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${max_iter} ${use_recompute} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/static/auto_parallel/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get data\ncd ../\nrm -rf data\nmkdir data\nwget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n"
  },
  {
    "path": "benchmarks/test_tipc/gpt/static/auto_parallel/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    dp_degree=${3:-\"1\"}             # (必选) dp数据并行度\n    mp_degree=${4:-\"1\"}             # (必选) mp模型并行度\n    pp_degree=${5:-\"1\"}             # (必选) pp流水线并行度\n    micro_batch_size=${6:-\"2\"}      # (必选) micro_batch_size\n    global_batch_size=${7:-\"16\"}    # （必选）global_batch_size\n    run_mode=${8:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${9:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"samples/s\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"ips:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=\"loss:\"        # (可选)解析日志，筛选出收敛数据所在行的关键字 如：convergence_key=\"loss:\"\n    max_iter=${10:-500}                      # 
（可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    num_workers=0                  # (可选)\n    base_batch_size=$global_batch_size\n    use_recompute=${11:-\"False\"}    # (可选)是否打开recompute\n    verbose=${12:-\"3\"}         # (可选)是否打印性能数据\n    logging_freq=${13:-\"100000\"} # (可选)loss打印频率\n    sharding_degree=${14:-\"1\"}      # (可选)\n    sharding_stage=${15:-\"1\"}       # (可选)sharding case\n    \n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`\n    num_attention_heads=16 #\"gpt2-medium-en\"\n    if [ ${mp_degree} -lt 8 -a 
${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #\"gpt2-small-en\"\n    num_layers=24 #\"gpt2-medium-en\"\n    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #\"gpt2-small-en\"\n    use_pure_fp16=False # fp32\n    if [ \"fp16\" = ${fp_item} ]; then use_pure_fp16=True; fi\n    train_cmd=\"-o Global.seed=1234 \\\n               -o Global.local_batch_size=${local_batch_size} \\\n               -o Global.micro_batch_size=${micro_batch_size} \\\n               -o Engine.max_steps=${max_iter} \\\n               -o Engine.eval_freq=100000 \\\n               -o Engine.mix_precision.enable=${use_pure_fp16} \\\n               -o Engine.save_load.save_steps=100000 \\\n               -o Model.hidden_size=1024 \\\n               -o Model.num_layers=${num_layers} \\\n               -o Model.num_attention_heads=${num_attention_heads} \\\n               -o Model.type_vocab_size=1 \\\n               -o Model.use_recompute=${use_recompute} \\\n               -o Distributed.dp_degree=${dp_degree} \\\n               -o Distributed.mp_degree=${mp_degree} \\\n               -o Distributed.pp_degree=${pp_degree} \\\n               -o Distributed.sharding.sharding_degree=${sharding_degree} \\\n               -o Distributed.sharding.sharding_stage=${sharding_stage} \\\n               -o Optimizer.lr.max_lr=1e-4 \\\n               -o Optimizer.lr.min_lr=1e-5  \\\n               -o Engine.verbose=${verbose} \\\n               -o Engine.logging_freq=${logging_freq} \"\n\n    if [ ${PADDLE_TRAINER_ID} ]\n    then\n        PADDLE_RANK_OPTION=\" --rank ${PADDLE_TRAINER_ID}\"\n    else\n        PADDLE_RANK_OPTION=\"\"\n    fi\n    # 以下为通用执行命令，无特殊可不用修改\n    case ${run_mode} in\n    DP1-MP1-PP1) echo \"run run_mode: DP1-MP1-PP1\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION}\\\n            tools/auto.py -c ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml \\\n            ${train_cmd}\"\n        
workerlog_id=0\n        ;;\n    DP2-MP2-PP2) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\\\n            tools/auto.py -c ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml \\\n            ${train_cmd}\"\n        workerlog_id_1=4\n        workerlog_id_2=6\n        ;;\n    *) echo \"choose run_mode \"; exit 1;\n    esac\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n    if [[ ${model_item} =~ \"CE\" ]];then # CE精度-不限制执行时间\n        ${train_cmd} > ${log_file} 2>&1\n    else\n        timeout 20m ${train_cmd} > ${log_file} 2>&1\n    fi\n    if [ $? -ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id_1} ${log_file}\n        cp mylog/workerlog.${workerlog_id_2} ${log_file}_2\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "benchmarks/test_tipc/imagen/dygraph/N1C1/imagen_397M_text2im_64_bs1_fp32_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=imagen_397M_text2im_64\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=1\nfp_item=fp32\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nyaml_path=ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml\n\nmodel=imagen\nmicro_bs=1\n\ncd ./benchmarks\nbash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${yaml_path} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/imagen/dygraph/N1C1/imagen_SR256_bs1_fp32_DP1-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=imagen_SR256\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=1\nfp_item=fp32\nrun_mode=DP1-MP1-PP1\ndevice_num=N1C1\nyaml_path=ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml\n\nmodel=imagen\nmicro_bs=1\n\ncd ./benchmarks\nbash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${yaml_path} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_2B_text2im_64_bs8_fp32_DP1-Sharding8.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=imagen_2B_text2im_64\ndp_degree=1\nmp_degree=1\npp_degree=1\nbs_item=8\nfp_item=fp32\nrun_mode=DP1-Sharding8\ndevice_num=N1C8\nyaml_path=ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml\nmax_iter=1000\nsharding_degree=8\nsharding_stage=2\n\nmodel=imagen\nmicro_bs=1\n\ncd ./benchmarks\nbash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${yaml_path} ${max_iter} ${sharding_degree} ${sharding_stage} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_397M_text2im_64_bs8_fp32_DP8-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=imagen_397M_text2im_64\ndp_degree=8\nmp_degree=1\npp_degree=1\nbs_item=8\nfp_item=fp32\nrun_mode=DP8-MP1-PP1\ndevice_num=N1C8\nyaml_path=ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml\n\nmodel=imagen\nmicro_bs=1\n\ncd ./benchmarks\nbash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${yaml_path} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_SR256_bs8_fp32_DP8-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=imagen_SR256\ndp_degree=8\nmp_degree=1\npp_degree=1\nbs_item=8\nfp_item=fp32\nrun_mode=DP8-MP1-PP1\ndevice_num=N1C8\nyaml_path=ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml\n\nmodel=imagen\nmicro_bs=1\n\ncd ./benchmarks\nbash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${yaml_path} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_text2im_64_debertav2_bs8_fp32_DP8-MP1-PP1.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nmodel_item=imagen_text2im_64_debertav2\ndp_degree=8\nmp_degree=1\npp_degree=1\nbs_item=8\nfp_item=fp32\nrun_mode=DP8-MP1-PP1\ndevice_num=N1C8\nyaml_path=ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_DebertaV2.yaml\n\nmodel=imagen\nmicro_bs=1\n\ncd ./benchmarks\nbash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \\\n${yaml_path} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/imagen/dygraph/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get data\ncd ../\nwget -O projects/imagen/part-00079 https://paddlefleetx.bj.bcebos.com/data/laion400m/part-00079\n# T5-11B\nmkdir -p projects/imagen/t5/t5-11b/ && cd projects/imagen/t5/t5-11b/\nwget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/config.json\nwget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/spiece.model\nwget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/tokenizer.json\nwget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.0\nwget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.1\nwget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.2\nwget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.3\nwget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.4\ncat t5.pd.tar.gz.* |tar -xf -\ncd -\n# DeBERTa V2 1.5B\nmkdir -p projects/imagen/cache/deberta-v-xxlarge && cd projects/imagen/cache/deberta-v-xxlarge\nwget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/config.json\nwget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/spm.model\nwget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/tokenizer_config.json\nwget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.0\nwget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.1\ncat debertav2.pd.tar.gz.* | tar -xf -\ncd -\n"
  },
  {
    "path": "benchmarks/test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    dp_degree=${3:-\"1\"}             # (必选) dp数据并行度\n    mp_degree=${4:-\"1\"}             # (必选) mp模型并行度\n    pp_degree=${5:-\"1\"}             # (必选) pp流水线并行度\n    micro_batch_size=${6:-\"2\"}      # (必选) micro_batch_size\n    global_batch_size=${7:-\"16\"}    # （必选）global_batch_size\n    run_mode=${8:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${9:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    yaml_path=${10:-\"ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml\"}\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"step/s\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"speed:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=\"loss:\"        # (可选)解析日志，筛选出收敛数据所在行的关键字 
如：convergence_key=\"loss:\"\n    max_iter=${11:-1000}                      # （可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    num_workers=0                  # (可选)\n    base_batch_size=$global_batch_size\n    sharding_degree=${12:-\"1\"}      # (可选)\n    sharding_stage=${13:-\"1\"}       # (可选)sharding case\n    sharding_offload=${14:-\"False\"} # (可选)\n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`\n    train_cmd=\"-o Engine.max_steps=${max_iter} \\\n               -o Global.local_batch_size=${local_batch_size} \\\n               
-o Global.micro_batch_size=${micro_batch_size} \\\n               -o Distributed.dp_degree=${dp_degree} \\\n               -o Distributed.mp_degree=${mp_degree} \\\n               -o Distributed.pp_degree=${pp_degree} \\\n               -o Distributed.sharding.sharding_degree=${sharding_degree} \\\n               -o Distributed.sharding.sharding_stage=${sharding_stage} \\\n               -o Distributed.sharding.sharding_offload=${sharding_offload} \\\n               \"\n    if [ ${PADDLE_TRAINER_ID} ]\n    then\n        PADDLE_RANK_OPTION=\" --rank ${PADDLE_TRAINER_ID}\"\n    else\n        PADDLE_RANK_OPTION=\"\"\n    fi\n\n    # 以下为通用执行命令，无特殊可不用修改\n    case ${run_mode} in\n    DP1-MP1-PP1) echo \"run run_mode: DP1-MP1-PP1\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0 \\\n            ${PADDLE_RANK_OPTION} tools/train.py -c ${yaml_path} \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    DP8-MP1-PP1|DP1-Sharding8) echo \"run run_mode: ${run_mode}\"\n        train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 \\\n            ${PADDLE_RANK_OPTION} tools/train.py -c ${yaml_path} \\\n            ${train_cmd}\"\n        workerlog_id=0\n        ;;\n    *) echo \"choose run_mode \"; exit 1;\n    esac\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n    if [[ ${model_item} =~ \"CE\" ]];then # CE精度-不限制执行时间\n        ${train_cmd} > ${log_file} 2>&1\n    else\n        timeout 30m ${train_cmd} > ${log_file} 2>&1\n    fi\n    if [ $? 
-ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id} ${log_file}\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "benchmarks/test_tipc/vit/dygraph/finetune/N1C8/ViT_large_patch16_384_ft_fused_False_bs512_fp16_DP.sh",
    "content": "model_item=ViT_large_patch16_384_ft_fused_False\nfp_item=fp16\nbs_item=512\nrun_mode=DP\ndevice_num=N1C8\nuse_fused_attn=False\nmax_iter=1\n\n\ncd ./benchmarks\nbash ./test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/vit/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \\\n${use_fused_attn} ${max_iter} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/vit/dygraph/finetune/N1C8/ViT_large_patch16_384_ft_fused_True_bs512_fp16_DP.sh",
    "content": "model_item=ViT_large_patch16_384_ft_fused_True\nfp_item=fp16\nbs_item=512\nrun_mode=DP\ndevice_num=N1C8\nuse_fused_attn=True\nmax_iter=1\n\n\ncd ./benchmarks\nbash ./test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/vit/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \\\n${use_fused_attn} ${max_iter} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get data\ncd ../\nmkdir dataset && cd dataset\ncp -r ${BENCHMARK_ROOT}/models_data_cfs/Paddle_distributed/ILSVRC2012.tgz ./\ntar -zxf ILSVRC2012.tgz\ncd -\n\n# pretrained\nmkdir -p pretrained/vit/\nwget -O ./pretrained/vit/imagenet21k-ViT-L_16.pdparams \\\nhttps://paddle-wheel.bj.bcebos.com/benchmark/imagenet21k-ViT-L_16.pdparams\n"
  },
  {
    "path": "benchmarks/test_tipc/vit/dygraph/finetune/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    global_batch_size=${3:-\"128\"}    # （必选）global_batch_size\n    run_mode=${4:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${5:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"images/sec\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"ips:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=\"loss:\"        # (可选)解析日志，筛选出收敛数据所在行的关键字 如：convergence_key=\"loss:\"\n    use_fused_attn=${6:-\"False\"}\n    max_iter=${7:-1}                      # （可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    num_workers=0                  # (可选)\n    base_batch_size=$global_batch_size\n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    
speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    train_cmd=\"-o Engine.num_train_epochs=${max_iter} \\\n               -o Model.model.use_fused_attn=${use_fused_attn} \\\n               \"\n    if [ ${PADDLE_TRAINER_ID} ]\n    then\n        PADDLE_RANK_OPTION=\" --rank ${PADDLE_TRAINER_ID}\"\n    else\n        PADDLE_RANK_OPTION=\"\"\n    fi\n    # 以下为通用执行命令，无特殊可不用修改\n    train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION} \\\n        tools/train.py -c ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml \\\n        ${train_cmd}\"\n    workerlog_id=0\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n    if [[ ${model_item} =~ \"CE\" ]];then # CE精度-不限制执行时间\n        ${train_cmd} > ${log_file} 2>&1\n    else\n        timeout 15m ${train_cmd} > ${log_file} 2>&1\n    fi\n    if [ $? 
-ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id} ${log_file}\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "benchmarks/test_tipc/vit/dygraph/pretrained/N2C16/ViT_large_patch16_224_pt_fused_False_bs128_fp16_DP.sh",
    "content": "model_item=ViT_large_patch16_224_pt_fused_False\nfp_item=fp16\nbs_item=128\nrun_mode=DP\ndevice_num=N2C16\nuse_fused_attn=False\nmax_iter=1\n\n\ncd ./benchmarks\nbash ./test_tipc/vit/dygraph/pretrained/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/vit/dygraph/pretrained/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \\\n${use_fused_attn} ${max_iter} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/vit/dygraph/pretrained/N2C16/ViT_large_patch16_224_pt_fused_True_bs128_fp16_DP.sh",
    "content": "model_item=ViT_large_patch16_224_pt_fused_True\nfp_item=fp16\nbs_item=128\nrun_mode=DP\ndevice_num=N2C16\nuse_fused_attn=True\nmax_iter=1\n\n\ncd ./benchmarks\nbash ./test_tipc/vit/dygraph/pretrained/benchmark_common/prepare.sh\n# run\nbash ./test_tipc/vit/dygraph/pretrained/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \\\n${use_fused_attn} ${max_iter} 2>&1;\n"
  },
  {
    "path": "benchmarks/test_tipc/vit/dygraph/pretrained/benchmark_common/prepare.sh",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m pip install -r ../requirements.txt\n# get data\ncd ../\nmkdir dataset && cd dataset\ncp -r ${BENCHMARK_ROOT}/models_data_cfs/Paddle_distributed/ILSVRC2012.tgz ./\ntar -zxf ILSVRC2012.tgz\ncd -\n"
  },
  {
    "path": "benchmarks/test_tipc/vit/dygraph/pretrained/benchmark_common/run_benchmark.sh",
    "content": "#!/usr/bin/env bash\n# Test training benchmark for a model.\n# Usage：bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}\nfunction _set_params(){\n    model_item=${1:-\"model_item\"}   # (必选) 模型 item\n    fp_item=${2:-\"fp32\"}            # (必选) fp32|fp16\n    global_batch_size=${3:-\"128\"}    # （必选）global_batch_size\n    run_mode=${4:-\"DP\"}             # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1\n    device_num=${5:-\"N1C1\"}         # (必选) 使用的卡数量，N1C1|N1C8|N4C32 （4机32卡）\n    yaml_path=${6:-\"./task/classification/vit/configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml\"}\n    profiling=${PROFILING:-\"false\"}      # (必选) Profiling  开关，默认关闭，通过全局变量传递\n    model_repo=\"PaddleFleetX\"          # (必选) 模型套件的名字\n    speed_unit=\"images/sec\"         # (必选)速度指标单位\n    skip_steps=0                  # (必选)解析日志，跳过模型前几个性能不稳定的step\n    keyword=\"ips:\"                 # (必选)解析日志，筛选出性能数据所在行的关键字\n    convergence_key=\"loss:\"        # (可选)解析日志，筛选出收敛数据所在行的关键字 如：convergence_key=\"loss:\"\n    use_fused_attn=${7:-\"False\"}\n    max_iter=${8:-1}                      # （可选）需保证模型执行时间在5分钟内，需要修改代码提前中断的直接提PR 合入套件；或使用max_epoch参数\n    num_workers=0                  # (可选)\n    base_batch_size=$global_batch_size\n    pretrained_model=${9:-\"null\"}\n    # 以下为通用执行命令，无特殊可不用修改\n    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (必填) 且格式不要改动,与竞品名称对齐\n    device=${CUDA_VISIBLE_DEVICES//,/ }\n    arr=(${device})\n    num_gpu_devices=${#arr[*]}\n    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # （必填） TRAIN_LOG_DIR  benchmark框架设置该参数为全局变量\n    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # （必填） PROFILING_LOG_DIR benchmark框架设置该参数为全局变量\n    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}\n    #\n    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log\n    
profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling\n    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed\n\n    OUTPUT_PATH=${run_log_path}/output\n}\n\nfunction _train(){\n    batch_size=${local_batch_size}  # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs\n\n    if [ -d $OUTPUT_PATH ]; then\n        rm -rf $OUTPUT_PATH\n    fi\n    mkdir $OUTPUT_PATH\n\n    echo \"current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}\"\n\n    if [ ${profiling} = \"true\" ];then\n        add_options=\"--profiler_options=\\\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\\\"\"\n        log_file=${profiling_log_file}\n    else\n        add_options=\"\"\n        log_file=${train_log_file}\n    fi\n\n    train_cmd=\"-o Engine.num_train_epochs=${max_iter} \\\n               -o Data.Train.sampler.batch_size=${global_batch_size} \\\n               -o Model.model.name=ViT_large_patch16_224 \\\n               -o Model.model.use_fused_attn=${use_fused_attn}\n               \"\n    if [ ${PADDLE_TRAINER_ID} ]\n    then\n        PADDLE_RANK_OPTION=\" --rank ${PADDLE_TRAINER_ID}\"\n    else\n        PADDLE_RANK_OPTION=\"\"\n    fi\n    # 以下为通用执行命令，无特殊可不用修改\n    train_cmd=\"python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION} \\\n        tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml \\\n        ${train_cmd}\"\n    workerlog_id=0\n    cd ../\n    echo \"train_cmd: ${train_cmd}  log_file: ${log_file}\"\n    if [[ ${model_item} =~ \"CE\" ]];then # CE精度-不限制执行时间\n        ${train_cmd} > ${log_file} 2>&1\n    else\n        timeout 15m ${train_cmd} > ${log_file} 2>&1\n    fi\n    if [ $? 
-ne 0 ];then\n        echo -e \"${model_name}, FAIL\"\n    else\n        echo -e \"${model_name}, SUCCESS\"\n    fi\n    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`\n    if [ ${device_num} != \"N1C1\" -a -d mylog ]; then\n        rm ${log_file}\n        cp mylog/workerlog.${workerlog_id} ${log_file}\n    fi\n}\n\nexport PYTHONPATH=$(dirname \"$PWD\"):$PYTHONPATH\n\nsource ${BENCHMARK_ROOT}/scripts/run_model.sh   # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开\n_set_params $@\n#_train       # 如果只产出训练log,不解析,可取消注释\n_run     # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开\n"
  },
  {
    "path": "codestyle/.gitignore",
    "content": "*.pyc\n"
  },
  {
    "path": "codestyle/clang_format.hook",
    "content": "#!/bin/bash\nset -e\n\nreadonly VERSION=\"13.0.0\"\n\nversion=$(clang-format -version)\n\nif ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then\n    echo \"clang-format installation by pip need python version great equal 3.6, \n          please change the default python to higher version.\"\n    exit 1\nfi\n\nif ! [[ $version == *\"$VERSION\"* ]]; then\n    # low version of pip may not have the source of clang-format whl\n    pip install --upgrade pip \n    pip install clang-format==13.0.0\nfi\n\nclang-format $@\n"
  },
  {
    "path": "codestyle/copyright.hook",
    "content": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\n\nimport argparse\nimport io\nimport re\nimport sys\nimport os\nimport datetime\n\nCOPYRIGHT = '''Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n    http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.'''\n\ndef _generate_copyright(comment_mark):\n    copyright=COPYRIGHT.split(os.linesep)\n    header = copyright[0].rstrip()\n\n    p = re.search('(\\d{4})', header).group(0)\n    now = datetime.datetime.now()\n\n    header = header.replace(p,str(now.year))\n\n    ans=[comment_mark + \" \" + header + os.linesep]\n    for idx, line in enumerate(copyright[1:]):\n        ans.append(comment_mark + \" \" + line.rstrip() + os.linesep)\n\n    return ans\n\ndef _get_comment_mark(path):\n    lang_type=re.compile(r\"\\.(py|sh)$\")\n    if 
lang_type.search(path) is not None:\n        return \"#\"\n\n    lang_type=re.compile(r\"\\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$\")\n    if lang_type.search(path) is not None:\n        return \"//\"\n\n    return None\n\n\nRE_ENCODE = re.compile(r\"^[ \\t\\v]*#.*?coding[:=]\", re.IGNORECASE)\nRE_COPYRIGHT = re.compile(r\".*Copyright \\(c\\) \\d{4}\", re.IGNORECASE)\nRE_SHEBANG = re.compile(r\"^[ \\t\\v]*#[ \\t]?\\!\")\n\ndef _check_copyright(path):\n    head=[]\n    try:\n        with open(path) as f:\n            head = [next(f) for x in range(4)]\n    except StopIteration:\n        pass\n\n    for idx, line in enumerate(head):\n        if RE_COPYRIGHT.search(line) is not None:\n            return True\n\n    return False\n\ndef generate_copyright(path, comment_mark):\n    original_contents = io.open(path, encoding=\"utf-8\").readlines()\n    head = original_contents[0:4]\n\n    insert_line_no=0\n    for i, line in enumerate(head):\n        if RE_ENCODE.search(line) or RE_SHEBANG.search(line):\n            insert_line_no=i+1\n\n    copyright = _generate_copyright(comment_mark)\n    if insert_line_no == 0:\n        new_contents = copyright\n        if len(original_contents) > 0 and len(original_contents[0].strip()) != 0:\n            new_contents.append(os.linesep)\n        new_contents.extend(original_contents)\n    else:\n        new_contents=original_contents[0:insert_line_no]\n        new_contents.append(os.linesep)\n        new_contents.extend(copyright)\n        if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0:\n            new_contents.append(os.linesep)\n        new_contents.extend(original_contents[insert_line_no:])\n    new_contents=\"\".join(new_contents)\n\n    with io.open(path, 'w') as output_file:\n        output_file.write(new_contents)\n\n\n\ndef main(argv=None):\n    parser = argparse.ArgumentParser(\n        description='Checker for copyright declaration.')\n    parser.add_argument('filenames', 
nargs='*', help='Filenames to check')\n    args = parser.parse_args(argv)\n\n    retv = 0\n    for path in args.filenames:\n        comment_mark = _get_comment_mark(path)\n        if comment_mark is None:\n            print(\"warning:Unsupported file\", path, file=sys.stderr)\n            continue\n\n        if _check_copyright(path):\n            continue\n\n        generate_copyright(path, comment_mark)\n\n\nif __name__ == '__main__':\n    exit(main())\n"
  },
  {
    "path": "codestyle/cpplint_pre_commit.hook",
    "content": "#!/bin/bash\n\nTOTAL_ERRORS=0\n\nreadonly VERSION=\"1.6.0\"\n\nversion=$(cpplint --version)\n\nif [[ ! $TRAVIS_BRANCH ]]; then\n  # install cpplint on local machine.\n  if ! [[ $version == *\"$VERSION\"* ]]; then\n    pip install cpplint==1.6.0\n  fi\n  # diff files on local machine. \n  files=$(git diff --cached --name-status | awk '$1 != \"D\" {print $2}')\nelse\n  # diff files between PR and latest commit on Travis CI. \n  branch_ref=$(git rev-parse \"$TRAVIS_BRANCH\")\n  head_ref=$(git rev-parse HEAD)\n  files=$(git diff --name-status $branch_ref $head_ref | awk '$1 != \"D\" {print $2}')\nfi\n# The trick to remove deleted files: https://stackoverflow.com/a/2413151\nfor file in $files; do\n    if [[ $file =~ ^(patches/.*) ]]; then\n        continue;\n    else\n        cpplint --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens $file;\n        TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);\n    fi\ndone\n\nexit $TOTAL_ERRORS\n"
  },
  {
    "path": "codestyle/docstring_checker.py",
    "content": "#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"DocstringChecker is used to check python doc string's style.\"\"\"\n\nimport astroid\n\nfrom pylint.checkers import BaseChecker, utils\nfrom pylint.interfaces import IAstroidChecker\n\nfrom collections import defaultdict\nimport re\n\n\ndef register(linter):\n    \"\"\"Register checkers.\"\"\"\n    linter.register_checker(DocstringChecker(linter))\n\n\nclass Docstring(object):\n    \"\"\"Docstring class holds the parsed doc string elements.\n    \"\"\"\n\n    def __init__(self):\n        self.d = defaultdict(list)  #name->[]\n        self.clear()\n\n    def clear(self):\n        self.d['Args'] = []\n        self.d['Examples'] = []\n        self.d['Returns'] = []\n        self.d['Raises'] = []\n        self.args = {}  #arg_name->arg_type\n\n    def get_level(self, string, indent='    '):\n        level = 0\n        unit_size = len(indent)\n        while string[:unit_size] == indent:\n            string = string[unit_size:]\n            level += 1\n\n        return level\n\n    def parse(self, doc):\n        \"\"\"parse gets sections from doc\n        Such as Args, Returns, Raises, Examples s\n        Args:\n            doc (string): is the astroid node doc string.\n        Returns:\n            True if doc is parsed successfully.\n        \"\"\"\n        self.clear()\n\n        lines = doc.splitlines()\n        state = 
(\"others\", -1)\n        for l in lines:\n            c = l.strip()\n            if len(c) <= 0:\n                continue\n\n            level = self.get_level(l)\n            if c.startswith(\"Args:\"):\n                state = (\"Args\", level)\n            elif c.startswith(\"Returns:\"):\n                state = (\"Returns\", level)\n            elif c.startswith(\"Raises:\"):\n                state = (\"Raises\", level)\n            elif c.startswith(\"Examples:\"):\n                state = (\"Examples\", level)\n            else:\n                if level > state[1]:\n                    self.d[state[0]].append(c)\n                    continue\n\n                state = (\"others\", -1)\n                self.d[state[0]].append(c)\n\n        self._arg_with_type()\n        return True\n\n    def get_returns(self):\n        return self.d['Returns']\n\n    def get_raises(self):\n        return self.d['Raises']\n\n    def get_examples(self):\n        return self.d['Examples']\n\n    def _arg_with_type(self):\n\n        for t in self.d['Args']:\n            m = re.search(r'([A-Za-z0-9_-]+)\\s{0,4}(\\(.+\\))\\s{0,4}:', t)\n            if m:\n                self.args[m.group(1)] = m.group(2)\n\n        return self.args\n\n\nclass DocstringChecker(BaseChecker):\n    \"\"\"DosstringChecker is pylint checker to\n    check docstring style.\n    \"\"\"\n    __implements__ = (IAstroidChecker, )\n\n    POSITIONAL_MESSAGE_ID = 'str-used-on-positional-format-argument'\n    KEYWORD_MESSAGE_ID = 'str-used-on-keyword-format-argument'\n\n    name = 'doc-string-checker'\n    symbol = \"doc-string\"\n    priority = -1\n    msgs = {\n        'W9001': ('One line doc string on > 1 lines', symbol + \"-one-line\",\n                  'Used when a short doc string is on multiple lines'),\n        'W9002':\n        ('Doc string does not end with \".\" period', symbol + \"-end-with\",\n         'Used when a doc string does not end with a period'),\n        'W9003':\n        ('All args 
with their types must be mentioned in doc string %s',\n         symbol + \"-with-all-args\",\n         'Used when not all arguments are in the doc string '),\n        'W9005': ('Missing docstring or docstring is too short',\n                  symbol + \"-missing\", 'Add docstring longer >=10'),\n        'W9006': ('Docstring indent error, use 4 space for indent',\n                  symbol + \"-indent-error\", 'Use 4 space for indent'),\n        'W9007': ('You should add `Returns` in comments',\n                  symbol + \"-with-returns\",\n                  'There should be a `Returns` section in comments'),\n        'W9008': ('You should add `Raises` section in comments',\n                  symbol + \"-with-raises\",\n                  'There should be a `Raises` section in comments'),\n    }\n    options = ()\n\n    def visit_functiondef(self, node):\n        \"\"\"visit_functiondef checks Function node docstring style.\n        Args:\n            node (astroid.node): The visiting node.\n        Returns:\n            True if successful other wise False.\n        \"\"\"\n\n        self.check_doc_string(node)\n\n        if node.tolineno - node.fromlineno <= 10:\n            return True\n\n        if not node.doc:\n            return True\n\n        doc = Docstring()\n        doc.parse(node.doc)\n\n        self.all_args_in_doc(node, doc)\n        self.with_returns(node, doc)\n        self.with_raises(node, doc)\n\n    def visit_module(self, node):\n        self.check_doc_string(node)\n\n    def visit_classdef(self, node):\n        self.check_doc_string(node)\n\n    def check_doc_string(self, node):\n        self.missing_doc_string(node)\n        self.one_line(node)\n        self.has_period(node)\n        self.indent_style(node)\n\n    def missing_doc_string(self, node):\n        if node.name.startswith(\"__\") or node.name.startswith(\"_\"):\n            return True\n        if node.tolineno - node.fromlineno <= 10:\n            return True\n\n        if node.doc is 
None or len(node.doc) < 10:\n            self.add_message('W9005', node=node, line=node.fromlineno)\n        return False\n\n    # FIXME(gongwb): give the docstring line-no\n    def indent_style(self, node, indent=4):\n        \"\"\"indent_style checks docstring's indent style\n        Args:\n            node (astroid.node): The visiting node.\n            indent (int): The default indent of style\n        Returns:\n            True if successful other wise False.\n        \"\"\"\n        if node.doc is None:\n            return True\n\n        doc = node.doc\n        lines = doc.splitlines()\n        line_num = 0\n\n        for l in lines:\n            if line_num == 0:\n                continue\n            cur_indent = len(l) - len(l.lstrip())\n            if cur_indent % indent != 0:\n                self.add_message('W9006', node=node, line=node.fromlineno)\n                return False\n            line_num += 1\n\n        return True\n\n    def one_line(self, node):\n        \"\"\"one_line checks if docstring (len < 40) is on one line.\n        Args:\n            node (astroid.node): The node visiting.\n        Returns:\n            True if successful otherwise False.\n        \"\"\"\n\n        doc = node.doc\n        if doc is None:\n            return True\n\n        if len(doc) > 40:\n            return True\n        elif sum(doc.find(nl) for nl in ('\\n', '\\r', '\\n\\r')) == -3:\n            return True\n        else:\n            self.add_message('W9001', node=node, line=node.fromlineno)\n            return False\n\n        return True\n\n    def has_period(self, node):\n        \"\"\"has_period checks if one line doc end-with '.' 
.\n        Args:\n            node (astroid.node): the node is visiting.\n        Returns:\n            True if successful otherwise False.\n        \"\"\"\n        if node.doc is None:\n            return True\n\n        if len(node.doc.splitlines()) > 1:\n            return True\n\n        if not node.doc.strip().endswith('.'):\n            self.add_message('W9002', node=node, line=node.fromlineno)\n            return False\n\n        return True\n\n    def with_raises(self, node, doc):\n        \"\"\"with_raises checks if one line doc end-with '.' .\n        Args:\n            node (astroid.node): the node is visiting.\n            doc (Docstring): Docstring object.\n        Returns:\n            True if successful otherwise False.\n        \"\"\"\n\n        find = False\n        for t in node.body:\n            if not isinstance(t, astroid.Raise):\n                continue\n\n            find = True\n            break\n\n        if not find:\n            return True\n\n        if len(doc.get_raises()) == 0:\n            self.add_message('W9008', node=node, line=node.fromlineno)\n            return False\n\n        return True\n\n    def with_returns(self, node, doc):\n        \"\"\"with_returns checks if docstring comments what are returned .\n        Args:\n            node (astroid.node): the node is visiting.\n            doc (Docstring): Docstring object.\n        Returns:\n            True if successful otherwise False.\n        \"\"\"\n\n        if node.name.startswith(\"__\") or node.name.startswith(\"_\"):\n            return True\n        find = False\n        for t in node.body:\n            if not isinstance(t, astroid.Return):\n                continue\n\n            find = True\n            break\n\n        if not find:\n            return True\n\n        if len(doc.get_returns()) == 0:\n            self.add_message('W9007', node=node, line=node.fromlineno)\n            return False\n\n        return True\n\n    def all_args_in_doc(self, node, 
doc):\n        \"\"\"all_args_in_doc checks if arguments are mentioned in doc\n        Args:\n            node (astroid.node): the node is visiting.\n            doc (Docstring): Docstring object\n        Returns:\n            True if successful otherwise False.\n        \"\"\"\n        if node.name.startswith(\"__\") or node.name.startswith(\"_\"):\n            return True\n        args = []\n        for arg in node.args.get_children():\n            if (not isinstance(arg, astroid.AssignName)) \\\n                or arg.name == \"self\":\n                continue\n            args.append(arg.name)\n\n        if len(args) <= 0:\n            return True\n\n        parsed_args = doc.args\n        args_not_documented = set(args) - set(parsed_args)\n        if len(args) > 0 and len(parsed_args) <= 0:\n            self.add_message(\n                'W9003',\n                node=node,\n                line=node.fromlineno,\n                args=list(args_not_documented))\n            return False\n\n        for t in args:\n            if t not in parsed_args:\n                self.add_message(\n                    'W9003', node=node, line=node.fromlineno, args=[t, ])\n                return False\n\n        return True\n"
  },
  {
    "path": "codestyle/pylint_pre_commit.hook",
    "content": "#!/bin/bash\n\nTOTAL_ERRORS=0\n\n\nDIR=\"$( cd \"$( dirname \"${BASH_SOURCE[0]}\" )\" && pwd )\"\nexport PYTHONPATH=$DIR:$PYTHONPATH\n\nreadonly VERSION=\"2.12.0\"\nversion=$(pylint --version | grep 'pylint')\n\nif ! [[ $version == *\"$VERSION\"* ]]; then\n    pip install pylint==2.12.0\nfi\n\n# The trick to remove deleted files: https://stackoverflow.com/a/2413151\nfor file in $(git diff --name-status | awk '$1 != \"D\" {print $2}'); do\n    pylint --disable=all --load-plugins=docstring_checker \\\n    --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;\n    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);\ndone\n\nexit $TOTAL_ERRORS\n#For now, just warning:\n#exit 0\nFooter\n\n"
  },
  {
    "path": "codestyle/test_docstring_checker.py",
    "content": "#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport docstring_checker\nimport pylint.testutils\nimport astroid\nimport pytest\nimport sys\n\n\nclass TestDocstring(pylint.testutils.CheckerTestCase):\n    CHECKER_CLASS = docstring_checker.DocstringChecker\n\n    def test_one_line(self):\n        func_node = astroid.extract_node('''\n        def test(): \n            \"\"\"get \n            news.\n            \"\"\"\n            if True:\n                return 5\n            return 5\n        ''')\n\n        self.checker.visit_functiondef(func_node)\n        got = self.linter.release_messages()\n        assert len(got) == 1\n        assert 'W9001' == got[0][0]\n\n    def test_one_line_1(self):\n        func_node = astroid.extract_node('''\n        def test(): \n            \"\"\"get news\"\"\"\n            if True:\n                return 5\n            return 5\n        ''')\n\n        self.checker.visit_functiondef(func_node)\n        got = self.linter.release_messages()\n        assert len(got) == 1\n        assert 'W9002' == got[0][0]\n\n    def test_args(self):\n        func_node = astroid.extract_node('''\n        def test(scale, mean): \n            \"\"\"get news.\n            Args:\n                scale (int): scale is the number.\n            \"\"\"\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            
mean=scale\n            mean=scale\n            mean=scale\n        ''')\n\n        self.checker.visit_functiondef(func_node)\n        got = self.linter.release_messages()\n        assert len(got) == 1\n        assert 'W9003' == got[0][0]\n\n    def test_missing(self):\n        func_node = astroid.extract_node('''\n        def test(): \n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n        ''')\n\n        self.checker.visit_functiondef(func_node)\n        got = self.linter.release_messages()\n        assert len(got) == 1\n        assert 'W9005' == got[0][0]\n\n    def test_indent(self):\n        func_node = astroid.extract_node('''\n        def test(): \n            \"\"\" get get get get get get get get\n              get get get get get get get get.\n            \"\"\"\n            pass \n        ''')\n\n        self.checker.visit_functiondef(func_node)\n        got = self.linter.release_messages()\n        assert len(got) == 1\n        assert 'W9006' == got[0][0]\n\n    def test_with_resturns(self):\n        func_node = astroid.extract_node('''\n        def test(): \n            \"\"\"get news.\n            Args:\n                scale (int): scale is the number.\n            \"\"\"\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            return mean\n        ''')\n\n        self.checker.visit_functiondef(func_node)\n        got = self.linter.release_messages()\n        assert len(got) == 1\n        assert 'W9007' == got[0][0]\n\n    def test_with_raises(self):\n        func_node = astroid.extract_node('''\n        def test(): \n    
        \"\"\"get news.\n            Args:\n                scale (int): scale is the number.\n            \"\"\"\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            mean=scale\n            raise ValueError('A very specific bad thing happened.')\n        ''')\n\n        self.checker.visit_functiondef(func_node)\n        got = self.linter.release_messages()\n        assert len(got) == 1\n        assert 'W9008' == got[0][0]\n\n    def test_no_message(self):\n        p = '''\ndef fc(input,\n       size,\n       num_flatten_dims=1,\n       param_attr=None,\n       bias_attr=None,\n       act=None,\n       name=None):\n    \"\"\"\n    **Fully Connected Layer**\n    The fully connected layer can take multiple tensors as its inputs. It\n    creates a variable called weights for each input tensor, which represents\n    a fully connected weight matrix from each input unit to each output unit.\n    The fully connected layer multiplies each input tensor with its coresponding\n    weight to produce an output Tensor. If multiple input tensors are given,\n    the results of multiple multiplications will be sumed up. If bias_attr is\n    not None, a bias variable will be created and added to the output. Finally,\n    if activation is not None, it will be applied to the output as well.\n    This process can be formulated as follows:\n    Args:\n        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of\n            the input tensor(s) is at least 2.\n        size(int): The number of output units in this layer.\n        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than\n            two dimensions. If this happens, the multidimensional tensor will first be flattened\n            into a 2-dimensional matrix. 
The parameter `num_flatten_dims` determines how the input\n            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)\n            dimensions will be flatten to form the first dimension of the final matrix (height of\n            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to\n            form the second dimension of the final matrix (width of the matrix). For example, suppose\n            `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.\n            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].\n        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable\n            parameters/weights of this layer.\n        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias\n            of this layer. If it is set to None, no bias will be added to the output units.\n        act (str, default None): Activation to be applied to the output of this layer.\n        name (str, default None): The name of this layer.\n    Returns:\n        A tensor variable storing the transformation result.\n    Raises:\n        ValueError: If rank of the input tensor is less than 2.\n    Examples:\n        .. code-block:: python\n            data = fluid.layers.data(name=\"data\", shape=[32, 32], dtype=\"float32\")\n            fc = fluid.layers.fc(input=data, size=1000, act=\"tanh\")\n    \"\"\"\n    raise ValueError('A very specific bad thing happened.')\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    size = 1\n    return size\n    '''\n\n        func_node = astroid.extract_node(p)\n        self.checker.visit_functiondef(func_node)\n        got = self.linter.release_messages()\n        assert len(got) == 0\n"
  },
  {
    "path": "docs/cluster_deployment.md",
    "content": "\n## 集群部署\n\n本文档介绍在集群上使用分布式进行大模型训练的方法，包括在 Kubernetes 上使用 PaddlePaddle 分布式和在云上使用的方法。\n\n### 1. Kubernetes部署\n\n在 Kubernetes 上部署分布式任务需要安装 [paddle-operator](https://github.com/PaddleFlow/paddle-operator) 。\n\npaddle-operator 通过添加自定义资源类型 (paddlejob) 以及部署 controller 和一系列 Kubernetes 原生组件的方式实现简单定义即可运行 PaddlePaddle 任务的需求。\n\n目前支持运行 ParameterServer (PS) 和 Collective 两种分布式任务，当然也支持运行单节点任务。\n\n**paddle-operator 安装**\n\n安装 paddle-operator 需要有已经安装的 Kubernetes (v1.16+) 集群和 [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (v1.16+) 工具。\n\n本节所需配置文件和示例可以在 [这里](https://github.com/PaddleFlow/paddle-operator/tree/main/deploy) 找到，\n可以通过 *git clone* 或者复制文件内容保存。\n\n```yaml\ndeploy\n|-- examples\n|   |-- resnet.yaml\n|   |-- wide_and_deep.yaml\n|   |-- wide_and_deep_podip.yaml\n|   |-- wide_and_deep_service.yaml\n|   `-- wide_and_deep_volcano.yaml\n|-- v1\n|   |-- crd.yaml\n|   `-- operator.yaml\n```\n\n执行以下命令，\n\n```shell\nkubectl create -f https://raw.githubusercontent.com/PaddleFlow/paddle-operator/dev/deploy/v1/crd.yaml\n```\n\n或者\n\n```shell\nkubectl create -f deploy/v1/crd.yaml\n```\n\n通过以下命令查看是否成功，\n\n```shell\nkubectl get crd\nNAME                                    CREATED AT\npaddlejobs.batch.paddlepaddle.org       2021-02-08T07:43:24Z\n```\n\n执行以下部署命令，\n\n```shell\nkubectl create -f https://raw.githubusercontent.com/PaddleFlow/paddle-operator/dev/deploy/v1/operator.yaml\n```\n\n或者\n\n```shell\nkubectl create -f deploy/v1/operator.yaml\n```\n\n通过以下命令查看部署结果和运行状态，\n\n```shell\nkubectl -n paddle-system get pods\nNAME                                         READY   STATUS    RESTARTS   AGE\npaddle-controller-manager-698dd7b855-n65jr   1/1     Running   0          1m\n```\n\n通过查看 controller 日志以确保运行正常，\n\n```shell\nkubectl -n paddle-system logs paddle-controller-manager-698dd7b855-n65jr\n```\n\n提交 demo 任务查看效果，\n\n```shell\nkubectl -n paddle-system create -f deploy/examples/wide_and_deep.yaml\n```\n\n查看 paddlejob 任务状态, pdj 为 paddlejob 
的缩写，\n\n```shell\nkubectl -n paddle-system get pdj\nNAME                     STATUS      MODE   AGE\nwide-ande-deep-service   Completed   PS     4m4s\n```\n\n以上信息可以看出：训练任务已经正确完成，该任务为 ps 模式。\n可通过 cleanPodPolicy 配置任务完成/失败后的 pod 删除策略，详见任务配置。\n\n训练期间可以通过如下命令查看 pod 状态，\n\n```shell\nkubectl -n paddle-system get pods\n```\n\n**paddlejob 任务提交**\n\n本resnet示例为 Collective 模式，使用 GPU 进行训练，只需要配置 worker，worker 配置中需要声明使用的 GPU 信息。\n\n准备配置文件，\n\n```yaml\napiVersion: batch.paddlepaddle.org/v1\nkind: PaddleJob\nmetadata:\n  name: resnet\nspec:\n  cleanPodPolicy: Never\n  worker:\n    replicas: 2\n    template:\n      spec:\n        containers:\n          - name: paddle\n            image: registry.baidubce.com/paddle-operator/demo-resnet:v1\n            command:\n            - python\n            args:\n            - \"-m\"\n            - \"paddle.distributed.launch\"\n            - \"train_fleet.py\"\n            volumeMounts:\n            - mountPath: /dev/shm\n              name: dshm\n            resources:\n              limits:\n                nvidia.com/gpu: 1\n        volumes:\n        - name: dshm\n          emptyDir:\n            medium: Memory\n```\n\n注意：\n\n* 这里需要添加 shared memory 挂载以防止缓存出错。\n* 本示例采用内置 flower 数据集，程序启动后会进行下载，根据网络环境可能等待较长时间。\n\n提交任务: 使用 kubectl 提交 yaml 配置文件以创建任务，\n\n```shell\nkubectl -n paddle-system create -f resnet.yaml\n```\n\n**卸载**\n\n通过以下命令卸载部署的组件，\n\n```shell\nkubectl delete -f deploy/v1/crd.yaml -f deploy/v1/operator.yaml\n```\n\n*注意：重新安装时，建议先卸载再安装*\n\n### 2. 
公有云和私有云部署\n\n在公有云上运行 PaddlePaddle 分布式建议通过选购容器引擎服务的方式，各大云厂商都推出了基于标准 Kubernetes 的云产品，然后根据上节中的教程安装使用即可。\n\n| 云厂商 | 容器引擎 | 链接                                           |\n| --- | ---- | -------------------------------------------- |\n| 百度云 | CCE  | https://cloud.baidu.com/product/cce.html     |\n| 阿里云 | ACK  | https://help.aliyun.com/product/85222.html   |\n| 华为云 | CCE  | https://www.huaweicloud.com/product/cce.html |\n\n更为方便的是使用百度提供的全功能AI开发平台 [BML](https://cloud.baidu.com/product/bml) 来使用，详细的使用方式请参考 [BML文档](https://ai.baidu.com/ai-doc/BML/pkhxhgo5v)。\n\n"
  },
  {
    "path": "docs/compression.md",
    "content": "# 模型压缩\n\n------------------------------------------------------------------------------------------\n\n## **简介**\n\nPaddleFleetX 集成了 PaddleSlim 中的常见的压缩方法：量化训练（Qutization Aware Training，QAT）、结构化稀疏（Structured Pruning，SP）和知识蒸馏（Knowledge Distillation，KD）。本文会介绍如何在 PaddleFleetX 中使用这些功能，来压缩并且导出压缩后的模型。\n\n## **特性**\n\n- <a href=https://github.com/PaddlePaddle/PaddleSlim/tree/release/2.4/demo/dygraph/quant>量化训练</a>：通过将全连接层的矩阵乘计算由 Float 浮点型优化为 INT8 整型来优化推理性能；\n- <a href=https://github.com/PaddlePaddle/PaddleSlim/tree/release/2.4/demo/dygraph/pruning>结构化稀疏</a>：通过剪裁全连接层权重的通道数目来优化推理性能；\n- <a href=#知识蒸馏>知识蒸馏</a>：通过使用高精度的大模型（教师模型）来蒸馏低精度的小模型（学生模型）来提升小模型精度\n\n\n\n## **配置文档**\n\n模型压缩开关通过 Compress 字段控制，预训练的模型参数路径由 pretrained 指定。接下来就是量化训练、结构化稀疏和知识蒸馏各自的技术参数。\n\n```yaml\nCompress:\n  pretrained:         // 预训练模型参数的保存路径\n  \n  Quantization:       // 量化训练参数\n    \n  Prune:              // 结构化稀疏参数\n  \n  Distillation:       // 知识蒸馏参数\n```\n\n**注意**： 我们正在开发上述三种压缩方法的联合使用，请先单独使用上述各个方法。\n\n### **量化训练参数**\n\n```yaml\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    weight_preprocess_type: None\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n```\n\n其中参数说明：\n\n| **参数名**                   | **参数释义**                              |\n|-----------------------------|-----------------------------------------|\n| pretrained                  | 预训练模型的加载目录，若设置该参数，将在量化之前加载预训练模型；若需要加载量化后参数，将此参数设置为None，直接设置Engine.save_load.ckpt_dir即可       |\n| enable                      | 是否开启量化训练                           |\n| weight_quantize_type        | weight量化方法, 默认为`channel_wise_abs_max`, 此外还支持`abs_max` |\n| activation_quantize_type    | activation量化方法, 默认为`moving_average_abs_max`               |\n| weight_preprocess_type      | 
weight预处理方法，默认为None，代表不进行预处理；当需要使用`PACT`方法时设置为`PACT` |\n| activation_preprocess_type  | activation预处理方法，默认为None，代表不进行预处理                   |\n| weight_bits                 | weight量化比特数, 默认为 8                                        |\n| activation_bits             | activation量化比特数, 默认为 8                                    |\n| quantizable_layer_type      | 需要量化的算子类型                                                |\n| onnx_format                 | 是否使用新量化格式，默认为False                                     |\n\n更详细的量化训练参数介绍可参考[PaddleSlim动态图量化训练接口介绍](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/api_cn/dygraph/quanter/qat.rst)。\n\n### **结构化稀疏参数**\n\n```yaml\nCompress:\n  pretrained:\n  Prune:\n    enable: True\n    criterion: l1_norm\n    ratio: 0.125\n```\n\n其中参数说明：\n\n| **参数名**                   | **参数释义**                              |\n|-----------------------------|-----------------------------------------|\n| pretrained                  | 预训练模型的加载目录       |\n| enable                      | 是否开启结构化稀疏训练                           |\n| criterion    | 权重的重要性指标，目前支持l1_norm 和 l2_norm|\n| ratio      | 权重稀疏的比例。例如，0.125的意思是12.5%的权重会被稀疏掉 |\n"
  },
  {
    "path": "docs/deployment_faq.md",
    "content": "## 环境验证和常见问题\n\n本文为环境问题排查指引，包括环境正确性验证的方法和常见的一些问题解决方法。\n\n### 1. 单机环境验证\n\n以下验证不区分本机环境和 Docker 环境。\n\n**GPU验证**\n\n当使用 GPU 时，使用 `nvidia-smi` 命令查看环境中 GPU 状态，预期输出如下\n\n```shell\nThu Jul 21 19:32:03 2022       \n+-----------------------------------------------------------------------------+\n| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |\n|-------------------------------+----------------------+----------------------+\n| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n|                               |                      |               MIG M. |\n|===============================+======================+======================|\n|   0  Tesla V100-SXM2...  On   | 00000000:3F:00.0 Off |                    0 |\n| N/A   33C    P0    40W / 300W |      0MiB / 32510MiB |      0%      Default |\n|                               |                      |                  N/A |\n+-------------------------------+----------------------+----------------------+\n|   1  Tesla V100-SXM2...  On   | 00000000:40:00.0 Off |                    0 |\n| N/A   34C    P0    41W / 300W |      0MiB / 32510MiB |      0%      Default |\n|                               |                      |                  N/A |\n+-------------------------------+----------------------+----------------------+\n|   2  Tesla V100-SXM2...  On   | 00000000:41:00.0 Off |                    0 |\n| N/A   35C    P0    41W / 300W |      0MiB / 32510MiB |      0%      Default |\n|                               |                      |                  N/A |\n+-------------------------------+----------------------+----------------------+\n|   3  Tesla V100-SXM2...  
On   | 00000000:42:00.0 Off |                    0 |\n| N/A   38C    P0    42W / 300W |      0MiB / 32510MiB |      0%      Default |\n|                               |                      |                  N/A |\n+-------------------------------+----------------------+----------------------+\n|   4  Tesla V100-SXM2...  On   | 00000000:62:00.0 Off |                    0 |\n| N/A   34C    P0    39W / 300W |      0MiB / 32510MiB |      0%      Default |\n|                               |                      |                  N/A |\n+-------------------------------+----------------------+----------------------+\n|   5  Tesla V100-SXM2...  On   | 00000000:63:00.0 Off |                    0 |\n| N/A   36C    P0    40W / 300W |      0MiB / 32510MiB |      0%      Default |\n|                               |                      |                  N/A |\n+-------------------------------+----------------------+----------------------+\n|   6  Tesla V100-SXM2...  On   | 00000000:64:00.0 Off |                    0 |\n| N/A   37C    P0    41W / 300W |      0MiB / 32510MiB |      0%      Default |\n|                               |                      |                  N/A |\n+-------------------------------+----------------------+----------------------+\n|   7  Tesla V100-SXM2...  
On   | 00000000:65:00.0 Off |                    0 |\n| N/A   36C    P0    39W / 300W |      0MiB / 32510MiB |      0%      Default |\n|                               |                      |                  N/A |\n+-------------------------------+----------------------+----------------------+\n\n+-----------------------------------------------------------------------------+\n| Processes:                                                                  |\n|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n|        ID   ID                                                   Usage      |\n|=============================================================================|\n|  No running processes found                                                 |\n+-----------------------------------------------------------------------------+\n```\n\n结果中可以看出\n\n* CUDA Version栏显示的是当前环境中的CUDA版本号，此处为11.2。开始使用飞桨前，请先保证此处CUDA Version显示正常。如果CUDA Version栏不显示版本号，则需要添加CUDA相关库的路径到环境变量`LD_LIBRARY_PATH`中，例如执行命令添加 `export LD_LIBRARY_PATH=/usr/lib64/:/usr/local/lib/:/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}` 。具体请参考[文档](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html)。\n* Memory-Usage 列显示的是当前的显存占用值，此处为0MiB，表示当前设备的显存未被占用；GPU-Util 列显示的是当前的GPU利用率，此处为0%，表示当前设备空闲，可以使用。开始使用飞桨前，请保证当前设备显存充足，且利用率处于空闲状态。\n* 最后的 Processes 信息表示正在使用设备的进程，Docker 内可能存在不准确的情况，不影响使用。\n\n**PaddlePaddle 安装验证**\n\n首先运行如下命令确保 PaddlePaddle 正确安装\n\n```shell\npython -c \"import paddle; paddle.utils.run_check()\"\n```\n\n预期会有如下输出\n\n```shell\nRunning verify PaddlePaddle program ... 
\nW0720 09:29:22.035640 12791 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2\nW0720 09:29:22.040702 12791 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1.\nPaddlePaddle works well on 1 GPU.\nW0720 09:29:36.763486 12791 fuse_all_reduce_op_pass.cc:79] Find all_reduce operators: 2. To make the speed faster, some all_reduce ops are fused during training, after fusion, the number of all_reduce ops is 2.\nPaddlePaddle works well on 8 GPUs.\nPaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now.\n```\n\n表示 PaddlePaddle 已经正确安装。\n\n如果出现以下错误信息请确保 CUDA 安装正确且已根据 CUDA 安装路径正确配置的 LD_LIBRARY_PATH。\n例如执行命令添加 `export LD_LIBRARY_PATH=/usr/lib64/:/usr/local/lib/:/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}` 。\n具体请参考[文档](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html)。\n\n```\nYou are using GPU version Paddle, but your CUDA device is not set properly.\n```\n\n### 2. 
分布式环境验证\n\n如果单机运行正常，但多机分布式运行异常请先根据 [网络问题排查](#31-网络问题排查) 部分排查网络问题再进行以下排查。\n\n请先确保**各个机器**的 PaddlePaddle 环境已经正确安装，然后在等待验证的其中一个节点上运行如下命令\n\n```shell\npython -m paddle.distributed.launch run_check\n```\n\n> 默认验证 2 机分布式环境，如果需要验证更多机器（例如4个）环境下飞桨分布式是否运行正常，请添加节点数参数 --nnodes，具体命令如下：\n> \n> `python -m paddle.distributed.launch --nnodes=4 run_check`\n\n预期输出如下\n\n```shell\nLAUNCH INFO 2022-07-20 09:38:33,349 PaddlePaddle Distributed Check begin...\nLAUNCH INFO 2022-07-20 09:38:33,358 -----------  Configuration  ----------------------\nLAUNCH INFO 2022-07-20 09:38:33,358 devices: None\nLAUNCH INFO 2022-07-20 09:38:33,358 elastic_level: -1\nLAUNCH INFO 2022-07-20 09:38:33,358 elastic_timeout: 30\nLAUNCH INFO 2022-07-20 09:38:33,358 gloo_port: 6767\nLAUNCH INFO 2022-07-20 09:38:33,358 host: None\nLAUNCH INFO 2022-07-20 09:38:33,358 job_id: default\nLAUNCH INFO 2022-07-20 09:38:33,358 legacy: False\nLAUNCH INFO 2022-07-20 09:38:33,358 log_dir: log\nLAUNCH INFO 2022-07-20 09:38:33,358 log_level: ERROR\nLAUNCH INFO 2022-07-20 09:38:33,358 master: None\nLAUNCH INFO 2022-07-20 09:38:33,358 max_restart: 3\nLAUNCH INFO 2022-07-20 09:38:33,358 nnodes: 2\nLAUNCH INFO 2022-07-20 09:38:33,358 nproc_per_node: None\nLAUNCH INFO 2022-07-20 09:38:33,358 rank: -1\nLAUNCH INFO 2022-07-20 09:38:33,358 run_mode: collective\nLAUNCH INFO 2022-07-20 09:38:33,359 server_num: None\nLAUNCH INFO 2022-07-20 09:38:33,359 servers: \nLAUNCH INFO 2022-07-20 09:38:33,359 trainer_num: None\nLAUNCH INFO 2022-07-20 09:38:33,359 trainers: \nLAUNCH INFO 2022-07-20 09:38:33,359 training_script: /usr/local/lib/python3.7/dist-packages/paddle/distributed/launch/plugins/test.py\nLAUNCH INFO 2022-07-20 09:38:33,359 training_script_args: []\nLAUNCH INFO 2022-07-20 09:38:33,359 with_gloo: 1\nLAUNCH INFO 2022-07-20 09:38:33,359 --------------------------------------------------\nLAUNCH INFO 2022-07-20 09:38:33,360 Job: default, mode collective, replicas 2[2:2], elastic False\nLAUNCH INFO 2022-07-20 09:38:33,367 Waiting peer 
start...\nCopy the following command to other nodes to run.\n--------------------------------------------------------------------------------\npython -m paddle.distributed.launch --master 10.10.1.1:49178 run_check\n--------------------------------------------------------------------------------\n```\n\n> 如果当前安装的 PaddlePaddle 中未包含该工具，请根据上节提示安装 develop 版本进行测试。\n\n根据提示，复制最后的命令（复制机器上个命令的执行结果，以下命令为示例），在其他节点上粘贴执行\n\n```shell\npython -m paddle.distributed.launch --master 10.10.1.1:49178 run_check\n```\n\n执行后，如果配置正常则每个节点都会有后续输出\n\n```shell\nLAUNCH INFO 2022-07-20 09:46:41,571 Run Pod: xqqbsr, replicas 2, status ready\nLAUNCH INFO 2022-07-20 09:46:41,601 Watching Pod: xqqbsr, replicas 2, status running\nPrepare distributed training with 2 nodes 2 cards\nI0720 09:46:43.583846 13375 tcp_utils.cc:181] The server starts to listen on IP_ANY:14863\nI0720 09:46:43.584153 13375 tcp_utils.cc:130] Successfully connected to 10.10.10.1:14863\nW0720 09:46:47.089151 13375 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2\nW0720 09:46:47.098454 13375 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1.\n2022-07-20 09:46:51,333-INFO: [topology.py:187:__init__] HybridParallelInfo: rank_id: 0, mp_degree: 1, sharding_degree: 1, pp_degree: 1, dp_degree: 4, mp_group: [0],  sharding_group: [0], pp_group: [0], dp_group: [0, 1, 2, 3], check/clip group: [0]\nDistributed training start...\n[Epoch 0, batch 0] loss: 5.10316, acc1: 0.03125, acc5: 0.06250\nDistributed training completed\nI0720 09:46:54.828758 13432 tcp_store.cc:257] receive shutdown event and so quit from MasterDaemon run loop\nLAUNCH INFO 2022-07-20 09:46:56,617 Pod completed\nLAUNCH INFO 2022-07-20 09:46:57,085 Exit code 0\n```\n\n则表示分布式环境配置正常，多机分布式训练可以成功运行。\n\n> 如果其他节点执行命令后各个节点没有后续输出或输出不符合预期请参考 [FAQ](#3-faq) 部分解决。\n\n**实际分布式训练任务验证**\n\n在启动分布式任务前需要确保各个节点上安装好 PaddlePaddle 环境，同步好数据和代码。\n\n例如准备好训练代码 `train.py`，同步至每个训练节点的工作目录。\n\n```python\nimport numpy as 
np\nimport paddle\nfrom paddle.distributed import fleet\nfrom paddle.vision.models import ResNet\nfrom paddle.vision.models.resnet import BottleneckBlock\nfrom paddle.io import Dataset, BatchSampler, DataLoader\n\nbase_lr = 0.1\nmomentum_rate = 0.9\nl2_decay = 1e-4\n\nepoch = 10\nbatch_num = 3\nbatch_size = 32\nclass_dim = 102\n\nclass RandomDataset(Dataset):\n    def __init__(self, num_samples):\n        self.num_samples = num_samples\n\n    def __getitem__(self, idx):\n        image = np.random.random([3, 224, 224]).astype('float32')\n        label = np.random.randint(0, class_dim - 1, (1, )).astype('int64')\n        return image, label\n\n    def __len__(self):\n        return self.num_samples\n\ndef optimizer_setting(parameter_list=None):\n    optimizer = paddle.optimizer.Momentum(\n        learning_rate=base_lr,\n        momentum=momentum_rate,\n        weight_decay=paddle.regularizer.L2Decay(l2_decay),\n        parameters=parameter_list)\n    return optimizer\n\n\ndef train_resnet():\n    fleet.init(is_collective=True)\n\n    resnet = ResNet(BottleneckBlock, 18, num_classes=class_dim)\n    optimizer = optimizer_setting(parameter_list=resnet.parameters())\n    optimizer = fleet.distributed_optimizer(optimizer)\n    resnet = fleet.distributed_model(resnet)\n\n    dataset = RandomDataset(batch_num * batch_size)\n    train_loader = DataLoader(dataset,\n                    batch_size=batch_size,\n                    shuffle=True,\n                    drop_last=True,\n                    num_workers=2)\n\n    for eop in range(epoch):\n        resnet.train()\n\n        for batch_id, data in enumerate(train_loader()):\n            img, label = data\n            label.stop_gradient = True\n\n            out = resnet(img)\n            loss = paddle.nn.functional.cross_entropy(input=out, label=label)\n            avg_loss = paddle.mean(x=loss)\n            acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)\n            acc_top5 = 
paddle.metric.accuracy(input=out, label=label, k=5)\n\n            avg_loss.backward()\n            optimizer.step()\n            resnet.clear_gradients()\n\n            print(\"[Epoch %d, batch %d] loss: %.5f, acc1: %.5f, acc5: %.5f\" % (eop, batch_id, avg_loss, acc_top1, acc_top5))\n\nif __name__ == '__main__':\n    train_resnet()\n```\n\n启动分布式训练的命令如下，\n这个命令需要在每个参与训练的节点上执行（每个节点上的 `--master`都设置为同一个），如节点较多可以考虑使用 `ssh` 脚本或 `mpirun` 进行跨节点命令分发。\n\n```python\npython -m paddle.distributed.launch --master=10.10.1.1:49178 --nnodes=2 train.py\n```\n\n这里用到了分布式启动最重要的两个参数\n\n- `--nnodes` 为分布式任务的节点个数（一般为参与任务的机器数量），默认为 1 即启动单机任务，也可使用环境变量 PADDLE_NNODES 设置。\n\n- `--master` 为分布式信息同步的主节点地址，ip:port 格式，可以由第一个启动的节点自动打印或者直接由用户设置为参与任务的任意节点 ip 和任意可用端口，也可使用环境变量 PADDLE_MASTER 设置。\n\n> master 支持使用 etcd 服务，当使用 etcd 服务时，需要同时指定任务 id 以避免任务间冲突。具体地，可以通过 --job_id 参数或者设置环境变量 PADDLE_JOB_ID 指定任务id。\n\n\n启动后，将看到如下日志，首先是配置部分\n\n```shell\nLAUNCH INFO 2022-07-20 12:10:15,863 -----------  Configuration  ----------------------\nLAUNCH INFO 2022-07-20 12:10:15,863 devices: None\nLAUNCH INFO 2022-07-20 12:10:15,863 elastic_level: -1\nLAUNCH INFO 2022-07-20 12:10:15,863 elastic_timeout: 30\nLAUNCH INFO 2022-07-20 12:10:15,863 gloo_port: 6767\nLAUNCH INFO 2022-07-20 12:10:15,863 host: None\nLAUNCH INFO 2022-07-20 12:10:15,863 job_id: default\nLAUNCH INFO 2022-07-20 12:10:15,863 legacy: False\nLAUNCH INFO 2022-07-20 12:10:15,863 log_dir: log\nLAUNCH INFO 2022-07-20 12:10:15,863 log_level: INFO\nLAUNCH INFO 2022-07-20 12:10:15,863 master: 127.0.0.1:8890\nLAUNCH INFO 2022-07-20 12:10:15,863 max_restart: 3\nLAUNCH INFO 2022-07-20 12:10:15,863 nnodes: 2\nLAUNCH INFO 2022-07-20 12:10:15,863 nproc_per_node: None\nLAUNCH INFO 2022-07-20 12:10:15,863 rank: -1\nLAUNCH INFO 2022-07-20 12:10:15,863 run_mode: collective\nLAUNCH INFO 2022-07-20 12:10:15,863 server_num: None\nLAUNCH INFO 2022-07-20 12:10:15,863 servers: \nLAUNCH INFO 2022-07-20 12:10:15,863 trainer_num: None\nLAUNCH INFO 2022-07-20 12:10:15,863 trainers: 
\nLAUNCH INFO 2022-07-20 12:10:15,863 training_script: train.py\nLAUNCH INFO 2022-07-20 12:10:15,863 training_script_args: []\nLAUNCH INFO 2022-07-20 12:10:15,864 with_gloo: 1\nLAUNCH INFO 2022-07-20 12:10:15,864 --------------------------------------------------\n```\n\n这里打印分布式启动时的配置信息， 更多 launch 启动参数和用法请参考 [API 文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/distributed/launch_cn.html) 或通过以下命令获得。\n\n```shell\npython -m paddle.distributed.launch --help\n```\n\n然后打印的是任务启动相关的信息：\n\n```shell\nLAUNCH INFO 2022-07-20 12:10:15,864 Job: default, mode collective, replicas 2[2:2], elastic False\nLAUNCH INFO 2022-07-20 12:10:15,870 Waiting peer start...\nLAUNCH INFO 2022-07-20 12:10:25,860 Run Pod: bpdjev, replicas 2, status ready\nLAUNCH INFO 2022-07-20 12:10:25,883 Watching Pod: bpdjev, replicas 2, status running\n```\n\n其中，每行对应的具体含义解释如下：\n\n* 因为未设置 job_id，使用默认名称 default，启动的是 collective 模式，总共 2 个节点的分布式任务，不支持弹性（即节点数不可变）。\n* 节点短暂处于等待其他节点启动的状态，如果其他节点已启动但日志长期处于等待状态，请根据 [FAQ](#31-网络问题排查) 进行排查。\n* 任务准备启动，当前节点名为 bpdjev（该名称为随机生成）处于 ready 状态，当前节点包含 2 个进程（1 个进程对应 1 个 GPU）。\n* 节点已启动，正在监控进程健康状态。\n\n至此分布式启动成功，接下来打印业务日志（即用户代码相关输出日志）\n\n```shell\nI0720 12:10:27.763713 14071 tcp_utils.cc:181] The server starts to listen on IP_ANY:11061\nI0720 12:10:27.763914 14071 tcp_utils.cc:130] Successfully connected to 10.10.10.1:11061\nW0720 12:10:30.666985 14071 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2\nW0720 12:10:30.675815 14071 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1.\n2022-07-20 12:10:36,377-INFO: [topology.py:187:**init**] HybridParallelInfo: rank_id: 0, mp_degree: 1, sharding_degree: 1, pp_degree: 1, dp_degree: 4, mp_group: [0], sharding_group: [0], pp_group: [0], dp_group: [0, 1, 2, 3], check/clip group: [0]\n/usr/local/lib/python3.7/dist-packages/paddle/nn/layer/norm.py:668: UserWarning: When training, we now always track global mean and variance.\n \"When training, 
we now always track global mean and variance.\")\n[Epoch 0, batch 0] loss: 5.42939, acc1: 0.00000, acc5: 0.00000\n[Epoch 0, batch 1] loss: 6.13338, acc1: 0.00000, acc5: 0.03125\n[Epoch 0, batch 2] loss: 7.25566, acc1: 0.03125, acc5: 0.06250\n// 此处省略多行类似日志\n[Epoch 9, batch 0] loss: 7.23511, acc1: 0.00000, acc5: 0.00000\n[Epoch 9, batch 1] loss: 4.69053, acc1: 0.03125, acc5: 0.06250\n[Epoch 9, batch 2] loss: 5.08652, acc1: 0.00000, acc5: 0.03125\nI0720 12:10:53.647085 14112 tcp_store.cc:257] receive shutdown event and so quit from MasterDaemon run loop\n```\n\n至此，训练结束，业务代码结束，最后打印退出日志\n\n```shell\nLAUNCH INFO 2022-07-20 12:10:56,915 Pod completed\nLAUNCH INFO 2022-07-20 12:10:57,388 Exit code 0\n```\n\n更多日志请在 log 目录下查看，日志文件命名为` {job_id}.{节点名}.{卡号}.log` , 例如如下两个文件为本例子中 2 张卡分别对应的日志。\n\n```shell\n-rw-r--r--  1 root   root 2.9K Jul 20 12:10 default.bpdjev.0.log\n-rw-r--r--  1 root   root 2.7K Jul 20 12:10 default.bpdjev.1.log\n```\n\n当有错误发生时，比如 GPU 卡被占用发生冲突时，会有如下输出\n\n```shell\nLAUNCH INFO 2022-07-21 11:58:59,451 Pod failed\nLAUNCH ERROR 2022-07-21 11:58:59,452 Container failed !!!\nContainer rank 6 status failed cmd ['/usr/bin/python', '-u', 'train.py'] code 1 log log/default.fxemxd.6.log \nenv {'GREP_COLOR': '1;31', 'CUDNN_VERSION': '8.1.1.33', 'LC_ALL': 'en_US.UTF-8', 'LD_LIBRARY_PATH': '/usr/local/lib/python3.7/dist-packages/cv2/../../lib64:/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64', 'LANG': 'en_US.UTF-8', 'HOSTNAME': 'xxxxx', 'OLDPWD': '/home/userhome', 'WITH_GPU': 'ON', 'NVIDIA_VISIBLE_DEVICES': 'all', 'NCCL_VERSION': '2.8.4', 'GOPATH': '/root/gopath', 'PWD': '/home/userhome/workspace/Paddle', 'HOME': '/home/userhome', 'GOROOT': '/usr/local/go', 'CLICOLOR': '1', 'DEBIAN_FRONTEND': 'noninteractive', 'LIBRARY_PATH': '/usr/local/cuda/lib64/stubs', 'TERM': 'xterm', 'WITH_AVX': 'ON', 'CUDA_VERSION': '11.2.1', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'SHLVL': '1', 'LANGUAGE': 'en_US.UTF-8', 'NVIDIA_REQUIRE_CUDA': 
'cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450,driver<451', 'PATH': '/home/cmake-3.16.0-Linux-x86_64/bin:/usr/local/gcc-8.2/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin:/root/gopath/bin:/home/userhome/.fzf/bin', 'PS1': '\\\\[\\\\033[1;33m\\\\]kui \\\\[\\\\033[1;37m\\\\]\\\\h \\\\[\\\\033[1;32m\\\\]\\\\w\\\\[\\\\033[1;33m\\\\]$(__git_ps1 \" \\\\[\\\\033[35m\\\\]{\\\\[\\\\033[36m\\\\]%s\\\\[\\\\033[35m\\\\]}\")\\\\[\\\\033[0m\\\\] ', '_': '/usr/bin/python', 'CUSTOM_DEVICE_ROOT': '', 'OMP_NUM_THREADS': '1', 'QT_QPA_PLATFORM_PLUGIN_PATH': '/usr/local/lib/python3.7/dist-packages/cv2/qt/plugins', 'QT_QPA_FONTDIR': '/usr/local/lib/python3.7/dist-packages/cv2/qt/fonts', 'runtime_include_dir': '/usr/local/lib/python3.7/dist-packages/paddle/libs', 'POD_NAME': 'fxemxd', 'PADDLE_MASTER': '10.10.10.1:60216', 'PADDLE_GLOBAL_SIZE': '10', 'PADDLE_LOCAL_SIZE': '8', 'PADDLE_GLOBAL_RANK': '8', 'PADDLE_LOCAL_RANK': '6', 'PADDLE_NNODES': '2', 'PADDLE_TRAINER_ENDPOINTS': '10.10.10.1:49825,10.10.10.1:18781,10.10.10.1:53546,10.10.10.1:30837,10.10.10.1:11249,10.10.10.1:13092,10.10.10.1:11398,10.10.10.1:21309,10.10.10.1:47065,10.10.10.1:14834', 'PADDLE_CURRENT_ENDPOINT': '10.10.10.1:47065', 'PADDLE_TRAINER_ID': '8', 'PADDLE_TRAINERS_NUM': '10', 'PADDLE_RANK_IN_NODE': '6', 'FLAGS_selected_gpus': '6'}\nI0721 11:58:51.079766 29676 tcp_utils.cc:130] Successfully connected to 10.10.10.1:60216\nW0721 11:58:54.582710 29676 gpu_resources.cc:61] Please NOTE: device: 6, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2\nW0721 11:58:54.590724 29676 gpu_resources.cc:91] device: 6, cuDNN Version: 8.1.\nTraceback (most recent call last):\n  File \"train.py\", line 75, in <module>\n    train_resnet()\n  File \"train.py\", line 39, in train_resnet\n    fleet.init(is_collective=True)\n  File 
\"/usr/local/lib/python3.7/dist-packages/paddle/distributed/fleet/base/fleet_base.py\", line 319, in init\n    paddle.distributed.init_parallel_env()\n  File \"/usr/local/lib/python3.7/dist-packages/paddle/distributed/parallel.py\", line 264, in init_parallel_env\n    paddle.distributed.barrier(group=group)\n  File \"/usr/local/lib/python3.7/dist-packages/paddle/distributed/collective.py\", line 334, in barrier\n    task = group.process_group.barrier()\nOSError: (External) NCCL error(5), invalid usage. \n  [Hint: 'ncclInvalidUsage'. The call to NCCL is incorrect. This is usually reflecting a programming error.] (at /paddle/Paddle/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc:214)\n\nLAUNCH INFO 2022-07-21 11:59:00,655 Exit code -15\n```\n\n这当中主要包含以下信息：\n\n* 发生错误的提示 Pod failed 和 Container failed !!!.\n* 错误的卡号（Container rank 6），错误命令和错误环境的环境变量。\n* 具体的错误信息 trace，该部分取决于业务代码错误内容。\n* 最后打印错误退出码 Exit code -15.\n\n请根据报错信息进行排查，部分错误请参考 [FAQ](#3-faq)。\n\n### 3. FAQ\n\n#### 3.1 网络问题排查\n\n请按照以下步骤排查网络问题\n\n**获取节点IP**\n\n使用命令 `hostname -i` 查看机器 ip，多网卡环境使用 `ifconfig` 命令查看(见上节)，获得 IP。\n\n```shell\n$ hostname -i\n10.10.10.1\n```\n\n如果这里得到的IP非预期使用的IP或者和日志中打印的IP不相符时，请根据后面小节排查是否是多网卡环境导致使用的网卡不一致。\n\n\n**确认节点间是否能通过ping连接**\n\n这里举例获得 ip 为 10.10.10.1，在其他节点上使用 `ping 10.10.10.1` 测试机器间是否能连接，有如下输出即为连接成功\n\n```shell\n$ ping 10.10.10.1 \nPING 10.10.10.1 (10.10.10.1) 56(84) bytes of data.\n64 bytes from 10.10.10.1: icmp_seq=1 ttl=61 time=0.089 ms\n64 bytes from 10.10.10.1: icmp_seq=2 ttl=61 time=0.057 ms\n64 bytes from 10.10.10.1: icmp_seq=3 ttl=61 time=0.059 ms\n64 bytes from 10.10.10.1: icmp_seq=4 ttl=61 time=0.078 ms\n64 bytes from 10.10.10.1: icmp_seq=5 ttl=61 time=0.055 ms\n^C\n--- 10.10.10.1 ping statistics ---\n5 packets transmitted, 5 received, 0% packet loss, time 4053ms\nrtt min/avg/max/mdev = 0.055/0.067/0.089/0.016 ms\n```\n\n长时间无输出或其他输出即无法连接，请联系机器网络管理员处理。\n\n**确认节点间是否能通过HTTP/TCP连接**\n\n在机器 `10.10.10.1`上运行命令 `python -m http.server 8090` 启动 http 服务，\n\n```shell\n$ python -m 
http.server 8090\nServing HTTP on 0.0.0.0 port 8090 (http://0.0.0.0:8090/) ...\n```\n\n如果提示端口被占用请使用其他可用端口启动服务，然后在其他的机器上运行命令 \n`curl 10.10.10.1:8090`\n\n```shell\n$ curl 10.10.10.1:8090\n<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n<title>Directory listing for /</title>\n</head>\n<body>\n<h1>Directory listing for /</h1>\n<hr>\n<li><a href=\"train.py\">train.py</a></li>\n</ul>\n<hr>\n</body>\n</html>\n```\n\n有类似以上输出则说明连接成功，否则两台机器间网络可能存在问题，尝试其他端口仍有问题需要联系网络管理员处理。\n\n**确认NCCL是否运行正常**\n\n首先，设置环境变量NCCL_DEBUG，查看NCCL版本和当前使用的IP\n\n```shell\nexport NCCL_DEBUG=INFO\n\npython -m paddle.distributed.launch train.py\n```\n\n在输出日志中找到 NCCL 版本信息\n\n```shell\nNCCL version 2.8.4+cuda11.2\n```\n\n确认各个节点的 NCCL 版本相同且高于 2.8。\n\n以及在输出的信息中查找如下信息\n\n```shell\n[0] NCCL INFO NET/Socket : Using [0]eth0:10.10.10.1<0> [1]\n```\n\n表示 nccl 使用了名为 `eth0` ip 为 10.10.10.1 的网卡，如果需要使用其他网卡，需要在运行命令前添加环境变量\n\n```shell\nexport NCCL_SOCKET_IFNAME=eth1\n```\n\n注意这里添加的时网卡名不是 ip，对应关系参照 `ifconfig` 的输出。\n\n上述测试均正常但是无法跑通分布式环境测试时\n请使用 [nccl-test](https://github.com/NVIDIA/nccl-tests)  测试 GPU 通信是否正常。\n\n#### 3.2 多Python环境问题\n\n当工作环境中存在多个版本的 python 时可能存在不一致导致问题。\n\n检查 python 版本\n\n```shell\n$ python --version\nPython 3.7.12\n```\n\n检查 python 安装目录\n\n```shell\n$ which python\n/usr/bin/python\n```\n\n直接调用绝对路径验证版本\n\n```shell\n$ /usr/bin/python --version\nPython 3.7.12\n```\n\n如果两次打印的版本不匹配，可以通过使用绝对路径的方式解决。\n获取绝对路径需要知道需要安装目录，默认环境中可以通过以下命令查看安装的版本。\n\n```shell\n$ ls /usr/bin/python*\n/usr/bin/python   /usr/bin/python2.7  /usr/bin/python3.6   /usr/bin/python3.7\n```\n\n即当使用 python 时，使用绝对路径 `/usr/bin/python3.7` 替换。\n\n#### 3.3 自动获取 IP 错误（多网卡环境问题）\n\n使用 paddle.distributed.launch 会自动识别使用的 IP，在多网卡配置的环境中自动识别的网卡可能不是预期使用的网卡。\n\n首先可以通过 `ifconfig` 命令查看机器的网卡配置情况，例如\n\n```shell\ndocker0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500\n        inet 10.0.3.1  netmask 255.255.255.0  broadcast 
0.0.0.0\n        inet6 fe80::7050:1cff:fea2:14f3  prefixlen 64  scopeid 0x20<link>\n        ether 1e:a6:0d:0d:3b:1e  txqueuelen 1000  (Ethernet)\n        RX packets 27201548  bytes 12176726229 (11.3 GiB)\n        RX errors 0  dropped 0  overruns 0  frame 0\n        TX packets 26762571  bytes 48666409371 (45.3 GiB)\n        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0\n\nlo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536\n        inet 127.0.0.1  netmask 255.0.0.0\n        inet6 ::1  prefixlen 128  scopeid 0x10<host>\n        loop  txqueuelen 1000  (Local Loopback)\n        RX packets 1321339447  bytes 1047567817083 (975.6 GiB)\n        RX errors 0  dropped 0  overruns 0  frame 0\n        TX packets 1321339447  bytes 1047567817083 (975.6 GiB)\n        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0\n\neth0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500\n        inet 10.10.10.1  netmask 255.255.255.192  broadcast 10.127.4.191\n        inet6 f080::5200:4bff:f030:2090  prefixlen 64  scopeid 0x20<link>\n        ether 50:6b:4b:31:2a:90  txqueuelen 1000  (Ethernet)\n        RX packets 32040749852  bytes 43394575453133 (39.4 TiB)\n        RX errors 0  dropped 391107  overruns 0  frame 0\n        TX packets 24330967394  bytes 30441950099144 (27.6 TiB)\n        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0\n```\n\n结果中虽然有3项甚至更多但这里只有一张 ip 为 `10.10.10.1` 网卡（inet值），docker0 为 Docker 虚拟网卡， lo 为本地回路，都不需要关注。\n\n当启动分布式训练命令时，如果飞桨自动识别出的网卡IP不正确时，可以使用--host参数手动配置IP，如\n\n```python\npython -m paddle.distributed.launch --master=10.10.10.1:49178 --nnodes=2 --host=10.10.10.1 train.py\n```\n\n> 当 --master 地址识别错误时，也需要手动替换。\n\n#### 3.4 机器端口有限制，需要使用固定端口\n\n当集群环境限制通信网卡时需要手动配置所有 ip 和 port 以启动分布式，以机器 `10.10.10.1` 和机器 `10.10.10.2` 必须使用端口 8000-8999 的情况为例，\n假设每台机器有两个卡，使用如下脚本设置每个卡对应进程的环境变量，依次启动进程。\n\n```shell\n# 所有卡 ip port 列表， ip1:port1,ip2:port2\nexport PADDLE_TRAINER_ENDPOINTS=10.10.10.1:8000,10.10.10.1:8001,10.10.10.2:8000,10.10.10.2:8001\n# 所有卡数\nexport 
PADDLE_TRAINERS_NUM=4       \n# 当前卡 ip:port\nexport PADDLE_CURRENT_ENDPOINT=10.10.10.1:8000\n# 当前卡序号\nexport PADDLE_TRAINER_ID=0  \n# 当前卡在节点内序号\nexport PADDLE_RANK_IN_NODE=0     \n# 当前卡使用的 GPU 卡号\nexport FLAGS_selected_gpus=0\n\n# 注意，这里不再使用 launch 启动，但本脚本需要运行多次\npython train.py\n```\n\n注意在执行时，需要依次替换后面4个环境变量为对应值启动。\n\n#### 3.5 常用的通信问题排查\n\nGPU/NCCL 问题请先核对**版本是否匹配**，通过 `nvidia-smi` 查看是否有进程正在占用，仍有问题需要通过 [nccl-test](https://github.com/NVIDIA/nccl-tests)  测试。常见运行时错误和解决方法如下，\n\n**NCCL error(5)**\n\n```shell\nOSError: (External) NCCL error(5), invalid usage. \n  [Hint: 'ncclInvalidUsage'. The call to NCCL is incorrect. This is usually reflecting a programming error.]\n```\n\n原因和解决方法：该错误多为同一张 GPU 卡被多个进程同时使用导致冲突，请检查正在使用 GPU 的进程。如果需要在同一台机器上启动多个逻辑节点，可以使用 `CUDA_VISIBLE_DEVICES` 环境变量控制设备可见性。\n\n**NCCL error(2)**\n\n```shell\nExternalError: Nccl error(2), unhandled system error\n```\n\n原因和解决方法：该错误一般为 shm 设置太小，如果使用 Docker 环境需要在启动 Docker 时做映射和设置如 `--shm-size 32G`.\n\n"
  },
  {
    "path": "docs/docker_install.md",
    "content": "\n## Docker 环境安装\n\n使用 Docker 首先需要安装 Docker  环境，安装的完整流程请参考[文档](https://docs.docker.com/engine/install/)，基础安装流程如下所述。\n另外在 Docker 中使用 GPU 还需要安装 [nvidia-container-runtime](https://github.com/NVIDIA/nvidia-container-runtime)。\n\n**Ubuntu**\n\n添加 apt 源。\n```\nsudo curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add -\nsudo add-apt-repository \"deb [arch=amd64] https://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable\"\n```\n\n软件源升级， 安装docker\n\n```\nsudo apt-get update\n\nsudo apt-get install docker-ce docker-ce-cli containerd.io \n```\n\n使用 `docker version` 查看 docker 版本信息无错误信息即说明安装运行正常。\n\n安装 nvidia-container-runtime\n\n```\nsudo apt-get install nvidia-container-runtime\n```\n\n**CentOS**\n\n添加yum源。\n\n```\nsudo wget -O /etc/yum.repos.d/docker-ce.repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo\n```\n\n安装组件。\n```\nsudo yum install docker-ce docker-ce-cli containerd.io\n```\n\n启动Docker。\n```\nsudo systemctl start docker\n```\n\n查看Docker状态。\n```\nsudo systemctl status docker\n```\n\n如日志状态为 active (running) 则表示docker启动正常。\n```\n● docker.service - LSB: start and stop docker\n   Loaded: loaded (/etc/rc.d/init.d/docker; bad; vendor preset: disabled)\n   Active: active (running) since Thu 2022-08-11 20:11:19 CST; 3 days ago\n     Docs: man:systemd-sysv-generator(8)\n  Process: 29766 ExecStop=/etc/rc.d/init.d/docker stop (code=exited, status=0/SUCCESS)\n  Process: 33215 ExecStart=/etc/rc.d/init.d/docker start (code=exited, status=0/SUCCESS)\n```\n\n安装 nvidia-container-runtime。\n\n```\nsudo yum install nvidia-container-runtime\n```\n\n"
  },
  {
    "path": "docs/quick_start.md",
    "content": "\n# 快速开始\n\n## 1. 环境准备\n\n这里介绍使用裸机或者 Docker 环境使用 PaddleFleetX 的方法，用户根据具体情况选择一种安装部署方式即可。\n使用多机训练时，需要在每台机器上都部署相应的环境。\n\n### 1.1 Docker 环境部署\n\n推荐使用 Docker 安装部署 PaddleFleetX 进行大模型训练，Docker 环境的安装可以参考[文档](docker_install.md)。\n\n请根据本地 CUDA 版本（使用 `nvidia-smi`命令查看）使用以下命令拉取对应或兼容的镜像，\n\n```\ndocker pull registry.baidubce.com/ppfleetx/fleetx-cuda11.2-cudnn8:dev\n```\n\n如本地环境cuda版本较低可以参考 Dockerfile 根据需要定制镜像。\n\n大模型训练需要使用GPU，如已安装 nvida-container-runtime 可以使用以下命令运行镜像，\n\n```\ndocker run -it --name=paddle --net=host -v /dev/shm:/dev/shm --shm-size=32G -v $PWD:/paddle --runtime=nvidia registry.baidubce.com/ppfleetx/ppfleetx-cuda11.2-cudnn8:v0.1.0 bash\n```\n\n未安装 nvida-container-runtime 或启动后无法执行 `nvidia-smi` 查看GPU信息时可以尝试通过如下脚本启动运行，\n\n```shell\nexport CUDA_SO=\"$(\\ls /usr/lib64/libcuda* | grep -v : | xargs -I{} echo '-v {}:{}') $(\\ls /usr/lib64/libnvidia* | grep -v : | xargs -I{} echo '-v {}:{}')\"\nexport DEVICES=$(find /dev/nvidia* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}')\n\nnvsmi=`which nvidia-smi`\n\ndocker run \\\n${CUDA_SO} ${DEVICES} \\\n-v /dev/shm:/dev/shm \\\n-v $PWD:/paddle \\\n--name paddle \\\n--net=host \\\n--shm-size=32G \\\n-v $nvsmi:$nvsmi \\\n-it \\\nregistry.baidubce.com/ppfleetx/ppfleetx-cuda11.2-cudnn8:v0.1.0 \\\nbash\n```\n\n以上命令 `-v $PWD:/paddle` 将当前目录映射到 /paddle 目录，在 docker 环境内部对该目录的更改将会持久化。\n\n> 为保证通信效率和通信正常，添加参数 --net=host 使用主机网络，更多 docker run 参数说明请参考 [docker 文档](https://docs.docker.com/engine/reference/commandline/run/)。\n\n### 1.2 裸机部署\n\n**安装 PaddlePaddle**\n\n首先根据环境在\n[安装文档](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html) 选择对应的版本使用 pip install 执行对应命令安装 PaddlePaddle.\n**请务必按照文档安装 GPU 版本且验证安装成功**。\n\n例如使用如下命令将会安装基于 CUDA 11.2 最新版本的 PaddlePaddle. 
\n\n```shell\npython -m pip install paddlepaddle-gpu==0.0.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html\n```\n\n安装遇到问题以及环境验证的方法也可以参考[文档](deployment_faq.md#1-单机环境验证)。\n\n**安装依赖**\n\n使用以下命令安装 PaddleFleetX 运行所需依赖。\n\n```shell\npython -m pip install -r https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetX/develop/requirements.txt -i https://mirror.baidu.com/pypi/simple\n```\n\n## 2. 模型训练\n\n进入环境后首先使用以下命令拉取最新代码\n\n```shell\ngit clone https://github.com/PaddlePaddle/PaddleFleetX.git\n```\n\n然后根据需求选择对应的训练方式。\n\n### 2.1. 单卡训练\n\n切换工作目录并下载demo数据，\n```\nmkdir data\nwget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n```\n\n然后使用以下命令运行程序，\n\n```shell\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml\n```\n\n若要在显存容量更小的16G V100环境下进行GPT模型单卡训练，可将对应yaml文件中的Model-hidden size值改为原来的1/2即可。\n\n**运行日志**\n\n```\n[2022-09-21 05:42:26,980] [    INFO] - [train] epoch: 0, batch: 0, loss: 10.999595642, avg_batch_cost: 2.73014 sec, speed: 0.37 step/s, ips_total: 3001 tokens/s, ips: 3001 tokens/s, learning rate: 2.77778e-08\n[2022-09-21 05:42:27,492] [    INFO] - [train] epoch: 0, batch: 1, loss: 10.997043610, avg_batch_cost: 0.51164 sec, speed: 1.95 step/s, ips_total: 16011 tokens/s, ips: 16011 tokens/s, learning rate: 4.16667e-08\n[2022-09-21 05:42:27,997] [    INFO] - [train] epoch: 0, batch: 2, loss: 10.994422913, avg_batch_cost: 0.50457 sec, speed: 1.98 step/s, ips_total: 16236 tokens/s, ips: 16236 tokens/s, learning rate: 5.55556e-08\n[2022-09-21 05:42:28,503] [    INFO] - [train] epoch: 0, batch: 3, loss: 11.005314827, avg_batch_cost: 0.50497 sec, speed: 1.98 step/s, ips_total: 16223 tokens/s, ips: 16223 tokens/s, learning rate: 6.94444e-08\n[2022-09-21 05:42:29,009] [    INFO] - [train] epoch: 0, batch: 4, 
loss: 10.988020897, avg_batch_cost: 0.50480 sec, speed: 1.98 step/s, ips_total: 16228 tokens/s, ips: 16228 tokens/s, learning rate: 8.33333e-08\n[2022-09-21 05:42:29,513] [    INFO] - [train] epoch: 0, batch: 5, loss: 10.983006477, avg_batch_cost: 0.50393 sec, speed: 1.98 step/s, ips_total: 16256 tokens/s, ips: 16256 tokens/s, learning rate: 9.72222e-08\n[2022-09-21 05:42:30,018] [    INFO] - [train] epoch: 0, batch: 6, loss: 10.988539696, avg_batch_cost: 0.50427 sec, speed: 1.98 step/s, ips_total: 16245 tokens/s, ips: 16245 tokens/s, learning rate: 1.11111e-07\n```\n\n\n\n### 2.2. 单机多卡训练\n\n切换工作目录并下载demo数据，\n\n```shell\nmkdir data\nwget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n```\n\n然后使用以下命令运行单机多卡程序，\n\n```\npython -m paddle.distributed.launch \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml\n```\n\n若要在显存容量更小的环境例如 16G 显存下进行GPT模型单机训练，可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练，命令如下：\n\n```\npython -m paddle.distributed.launch \\\n    ./tools/train.py -c \\\n    ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml -o Model.hidden_size=1024\n```\n\n> 更多 launch 启动参数和用法请参考 [API 文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/distributed/launch_cn.html)。\n\n成功则开始训练过程，\n```\nLAUNCH INFO 2022-08-15 07:37:38,946 -----------  Configuration  ----------------------\nLAUNCH INFO 2022-08-15 07:37:38,946 devices: None\nLAUNCH INFO 2022-08-15 07:37:38,947 elastic_level: -1\nLAUNCH INFO 2022-08-15 07:37:38,947 elastic_timeout: 30\nLAUNCH INFO 2022-08-15 07:37:38,947 gloo_port: 6767\nLAUNCH INFO 2022-08-15 07:37:38,947 host: None\nLAUNCH INFO 2022-08-15 07:37:38,947 ips: None\nLAUNCH INFO 2022-08-15 07:37:38,947 job_id: default\nLAUNCH INFO 2022-08-15 07:37:38,947 legacy: False\nLAUNCH INFO 2022-08-15 
07:37:38,947 log_dir: log\nLAUNCH INFO 2022-08-15 07:37:38,947 log_level: INFO\nLAUNCH INFO 2022-08-15 07:37:38,947 master: None\nLAUNCH INFO 2022-08-15 07:37:38,947 max_restart: 3\nLAUNCH INFO 2022-08-15 07:37:38,947 nnodes: 1\nLAUNCH INFO 2022-08-15 07:37:38,947 nproc_per_node: None\nLAUNCH INFO 2022-08-15 07:37:38,947 rank: -1\nLAUNCH INFO 2022-08-15 07:37:38,947 run_mode: collective\nLAUNCH INFO 2022-08-15 07:37:38,947 server_num: None\nLAUNCH INFO 2022-08-15 07:37:38,947 servers:\nLAUNCH INFO 2022-08-15 07:37:38,947 start_port: 6070\nLAUNCH INFO 2022-08-15 07:37:38,947 trainer_num: None\nLAUNCH INFO 2022-08-15 07:37:38,947 trainers:\nLAUNCH INFO 2022-08-15 07:37:38,947 training_script: run_pretrain.py\nLAUNCH INFO 2022-08-15 07:37:38,947 training_script_args: ['-c', './configs_1.3B_dp8.yaml']\nLAUNCH INFO 2022-08-15 07:37:38,947 with_gloo: 1\nLAUNCH INFO 2022-08-15 07:37:38,947 --------------------------------------------------\nLAUNCH INFO 2022-08-15 07:37:38,948 Job: default, mode collective, replicas 1[1:1], elastic False\nLAUNCH INFO 2022-08-15 07:37:38,949 Run Pod: vqhbut, replicas 8, status ready\nLAUNCH INFO 2022-08-15 07:37:39,063 Watching Pod: vqhbut, replicas 8, status running\n## 启动配置\n[2022-08-15 07:41:23,063] [    INFO] - [train] epoch: 0, batch: 0, loss: 11.255846024, avg_batch_cost: 7.06713 sec, speed: 0.14 step/s, ips_total: 9273 tokens/s, ips: 1159 tokens/s, learning rate: 2.77778e-08\n## 更多训练日志\n```\n\n如有启动异常请根据[文档](deployment_faq.md#1-单机环境验证)进行工作环境验证，其他问题可参考[FAQ](deployment_faq.md#3-faq)解决。\n\n## 2.3. 
多机多卡训练\n\n使用以下命令进行多机分布式训练，其中 --nnodes 参数为分布式训练机器数量，--master 为训练机器中其中一台机器的IP，运行时需要将命令中示例IP替换为真实的机器IP和任意可用端口，然后在**每个节点**上都运行以下命令，\n如果不知道机器IP可以不设置--master参数先在一台机器上启动，然后根据提示复制命令在其他机器上启动即可。\n\n```\npython -m paddle.distributed.launch --master=10.10.10.1:8099 --nnodes=2 \\\n    ./tools/train.py -c \\\n    ./ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml\n```\n\n> 该示例为16卡任务，需要满足总卡数为16的要求。\n\n> 注意这里需要使用单机多卡训练部分的代码和数据。\n\n\n成功则开始多机训练过程，日志和单机多卡类似，日志异常时请按照[文档](deployment_faq.md#2-分布式环境验证)进行环境验证和问题排查。\n\n若要在显存容量更小的环境例如 16G 显存下进行GPT模型单机训练，可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练，命令如下：\n\n```\npython -m paddle.distributed.launch --master=10.10.10.1:8099 --nnodes=2 \\\n    ./tools/train.py -c \\\n    ./ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml -o Model.hidden_size=2048\n```\n\n更多大模型多机训练内容可见[文档](../projects/gpt/docs/README.md)。\n"
  },
  {
    "path": "docs/standard.md",
    "content": "## 模型接入规范\n\n本文讲述在PaddleFleetX repo接入一个新模型，该如何添加和修改文件，以及相应的规范化流程。\n\n### 1.PaddleFleetX 介绍\nPaddleFleetX是飞桨大模型训练推理一站式工具组件。与Paddle.distributed、Paddle.fleet API的关系如下：\n\n\n<div align=\"center\">\n<img src=\"./images/fleetx_arc.png\"  alt=\"drawing\" width=\"500\">\n\n<em> PaddleFleetX与Paddle的关系 </em>\n</div>\n\n\n目前支持的模型列表如下：\n- GPT\n\n\n### 2.目录结构\n\n整体的PaddleFleetX的目录结构如下：\n\n```text\n.\n├── benchmarks                  # benchmark评估结果和示例代码\n│   └── README.md\n├── Dockerfile\n├── docs                        # 文档\n│   ├── cluster_deployment.md\n│   ├── deployment_faq.md\n│   ├── docker_install.md\n│   ├── images\n│   ├── quick_start.md\n│   └── standard.md\n├── ppfleetx\n│   ├── configs\n│   ├── core                    # 管理模型的组网规范，执行规范\n│   ├── data                    # 数据集下载、预处理脚本\n│   ├── models                  # 模型组网\n│   ├── optims                  # 优化器类定义\n│   └── utils\n├── projects                    # 模型脚本，包含GPT模型\n│   ├── ernie\n│   ├── gpt\n│   ├── imagen\n│   └── vit\n├── README.md\n├── requirements.txt\n├── tasks\n│   └── gpt\n└── tools\n    ├── auto.py\n    ├── eval.py\n    ├── export_model.py\n    ├── inference.py\n    └── train.py\n```\n\n### 3.模型接入方法\n\n根据模型训练的阶段不同，整体分为两个阶段：组网阶段和执行阶段。\n#### 3.1 组网阶段\n需要不同的分布式策略，它们会调用github/PaddlePaddle/Paddle核心框架里面的分布式高层API（FleetAPI），参考：\n需要的并行方式。\n- [数据并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/data_parallel/index_cn.html)\n- [张量模型并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/model_parallel_cn.html\n)\n- [流水线并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/pipeline_parallel_cn.html)\n- [分组切片并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/group_sharded_parallel_cn.html)\n\n\n#### 3.2 执行阶段\n##### 
BasicModule\n执行阶段采用Engine模块分装，为了能够保证Engine的模块化调用，需要将组网为``BasicModule``的子类，保证其规范化输出。其中``BasicModule``提供了多个统一的函数方法：\n\n| **函数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| init | 接受用户的组网参数，实现Module初始化 |\n| pretreating_batch | 预处理batch数据 |\n| train_step    | 一次完整的训练                  |\n| train_step_end  |   一次完整的训练后的操作                |\n| training_epoch_end  | 一次完整的epoch训练后的操作                  |\n| validation_step    | 一次完整的验证                  |\n| validation_step_end  | 一次完整的验证后的操作                  |\n| validation_epoch_end  | 一次完整的epoch验证后的操作                  |\n| test_step    | 一次完整的测试                  |\n| test_step_end  | 一次完整的测试后的操作                  |\n| configure_optimizers  | 配置这次训练的优化器                  |\n\n##### EagerEngine\n``EagerEngine``将上述函数串联起来，实现底层的执行逻辑对上层的屏蔽，减少冗余代码。\n初始化需要传入对应的config配置，其层级配置如下：\n\n```yaml\nEngine:\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps: 1\n  logging_freq: 1\n  eval_freq: 500\n  eval_iters: 10\n  test_iters:\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"O2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 1000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n```\n\n其中参数对应的释义如下：\n\n| **参数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| max_steps         | 最大训练步数                               |\n| num_train_epochs  | 训练的epoch数量                           |\n| accumulate_steps  | 梯度累加次数                           |\n| logging_freq      | 训练日志打印的频率                            |\n| eval_freq         | 模型评估间隔                               |\n| eval_iters        | 模型评估时训练评估测试集的轮数                      |\n| enable            | 是否使用混合精度策略进行训练                     |\n| dtype             | 
混合精度训练数据类型使用float16还是bfloat16，默认为float16类型 |\n| level             | 混合精度训练模式，默认``O2``模式                 |\n| scale_loss        | 使用fp16混合精度策略下，loss的放缩比例                  |\n| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的，它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 |\n| custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的，并且对性能至关重要。如果设置了白名单，该名单中的算子会使用float16/bfloat16计算 |\n| save_steps        | 保存模型间隔                               |\n| save_epoch        | 保存模型epoch间隔                               |\n| output_dir        | 指定输出文件                               |\n| ckpt_dir          | checkpoint的加载目录                      |\n\n``EagerEngine``中重载了多个常用函数，整体的说明如下：\n\n\n| **函数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| fit | 模型训练 |\n| evaluate | 模型评估 |\n| predict    | 模型预测                 |\n| save  |   模型参数保存                |\n| load    | 模型参数加载                  |\n\n其中module和engine函数方法的映射关系如下：\n\n- fit\n  \n``fit``实现模型的训练，EagerEngine的内部调用伪代码如下：\n\n```python\nmodule.model.train()\nfor batch in train_dataloader:\n    module.training_step()\n    module.training_step_end()\n\n    module.optimizer.step()\n    module.lr_scheduler.step()\n\n    module.optimizer.clear_grad()\n```\n\n- evaluate\n  \n``evaluate``实现模型的评估，``EagerEngine``的内部调用伪代码如下：\n\n```python\nwith paddle.no_grad():\n    module.model.eval()\n    for batch in validation_dataloader:\n        module.validation_step()\n        module.validation_step_end()\n```\n\n- predict\n  \n``predict``实现模型的预测，``EagerEngine``的内部调用伪代码如下：\n\n```python\nwith paddle.no_grad():\n    module.model.eval()\n    for batch in test_dataloader:\n        module.predict_step()\n        module.predict_step_end()\n```\n\n\n### 4.模型接入示例\n\n\n1、构建组网文件，放置在`ppfleetx/models`目录下。\n\n```python\nclass SimpleNet(nn.Layer):\n    def __init__(self):\n        super(SimpleNet, self).__init__()\n        self.fc1 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE)\n        self.fc2 = 
nn.Linear(IMAGE_SIZE, IMAGE_SIZE)\n        self.fc3 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE)\n        self.fc4 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE)\n        self.fc5 = nn.Linear(IMAGE_SIZE, CLASS_NUM)\n\n    def forward(self, image, label=None):\n        output = self.fc1(image)\n        output = self.fc2(output)\n        output = self.fc3(output)\n        output = self.fc4(output)\n        return self.fc5(output)\n\nclass LossLayer(nn.Layer):\n    def __init__(self):\n        super(LossLayer, self).__init__()\n\n    def forward(self, image, label=None):\n        return F.cross_entropy(image, label)\n```\n\n2、构建BasicModule，设置符合要求的组网形式，放置在`ppfleetx/models`目录下；并引入`ppfleetx/models/__init__.py`\n\n```python\nclass TestModule(BasicModule):\n    def __init__(self):\n        super().__init__()\n        self.loss_fn = LossLayer()\n\n    def get_model(self):\n        model = SimpleNet()\n        return model\n\n    def forward(self, x):\n        return self.model(x)\n\n    def training_step(self, batch):\n        x, y = batch\n        loss = self.loss_fn(self(x), y)\n        return loss\n\n    def training_step_end(self, log_dict):\n        logger.info(\n            \"[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost']))\n\n    def validation_step(self, batch):\n        x, y = batch\n        loss = self.loss_fn(self(x), y)\n        return loss\n\n    def validation_step_end(self, log_dict):\n        logger.info(\n            \"[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['eval_cost']))\n\n    def test_step(self, batch):\n        x, y = batch\n        loss = self.loss_fn(self(x), y)\n        return loss\n\n    def test_step_end(self, log_dict):\n        logger.info(\n            \"[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec\"\n            % 
(log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['test_cost']))\n\n```\n3、通过config配置Dataset\n\nDataset可以通过config文件进行配置。新增Dataset类型放置在 `ppfleetx/data/dataset`,同时其构造参数于其对应的Dataset字段一致。比如：\n\n```python\nclass GPTDataset(paddle.io.Dataset):\n    def __init__(self,\n                 input_dir,\n                 split,\n                 max_seq_len,\n                 num_samples,\n                 mode,\n                 seed=1234):\n```\n对应config中的yaml字段：\n\n```yaml\nData:\n  Train:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n    sampler:\n      name: DistributedBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n```\n\n4、通过config配置Optimizer和LR\n\n\n```yaml\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 360000\n    warmup_rate: 0.01\n    max_lr: 5.0e-5\n    min_lr: 1.0e-5\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n  tensor_fusion: False\n```\n\n5、运行模型相关的配置文件以及相应的运行脚本，放置在[projects](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/projects)目录。\n\n\n### 5.模型推理示例\n\n模型训练完成后，可使用飞桨高性能推理引擎Paddle Inference通过如下方式进行推理部署。\n总共分为两个步骤：模型导出和推理部署。可以参考[GPT的模型推理](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/docs/inference.md)。\n"
  },
  {
    "path": "examples/transformer/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "examples/transformer/models/GPT/docs/README.md",
    "content": "# GPT\n\n## 模型介绍\nGPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)/[3](https://arxiv.org/pdf/2005.14165.pdf) 是以[Transformer](https://arxiv.org/abs/1706.03762) 解码器为网络基本组件，使用自回归的方式在大规模无标注文本语料上进行预训练得到的语言生成模型。\n\n本项目是语言模型 GPT 的 PaddlePaddle 大模型实现。目前，PaddleFleetX 提供了 [GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型文件；分别基于 [LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl) 和 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) 数据集，采用 ACC(accuracy) 和 PPL(perplexity) 指标后的评估结果如下：\n\n| **模型文件** | **ACC** | **PPL** |\n|---------|-----------|---------------|\n| GPT-345M | 44.17% |  18.01  |\n\n下面是本例的简要目录结构及说明：\n\n```text\n.\n├── docs              # 一站式文档入口\n├── finetune          # GLUE 下游任务微调入口\n├── generation        # 文本生成体验入口\n├── offline-eval      # 模型精度离线评估入口\n├── pretrain          # 预训练入口\n\n```\n\n## 快速开始\n\n### 环境依赖\n\n请确保已根据根目录 requirements.txt 安装所需依赖，或者通过以下命令快速安装\n\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\npython -m pip install -r https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetX/develop/requirements.txt -i https://mirror.baidu.com/pypi/simple\n```\n\n### 数据准备\n\n数据获取和制作详见[GPT 模型预训练数据准备流程](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/gpt)\n\n为了方便用户运行测试本模型，此处提供处理好的300M的训练样本，在单卡训练或混合并行训练前都需要通过以下命令获取数据。\n\n**数据下载命令**\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\n# 下载样例数据\nmkdir data && cd data\nwget -O gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget -O gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n\ncd .. 
# 回到 GPT 目录下\n```\n\n### 模型训练\n\n除了单卡训练，飞桨还支持数据并行、混合并行、自动并行、重计算等多种分布式策略，减少显存占用、加速训练，达到大模型可训练且训得快的效果。在模型训练前，需要根据模型规模选择合适的并行策略。下面分别从单卡训练和混合并行训练两个方面来介绍GPT模型训练的配置文件和启动方式。\n\n\n- [单卡训练](./single_card.md)\n\n- [混合并行训练](./hybrid_parallel.md)\n\n\n### 文本生成体验\n\n- [单卡预训练模型文本生成](./single_card.md#GPT-Zero-shot-文本生成)\n\n- [混合并行预训练模型文本生成](./hybrid_parallel.md#GPT-Zero-shot-文本生成)\n\n\n### 模型压缩\n\n- [量化训练](./quantization_aware_training.md)\n\n### 推理部署\n\n- [推理部署](inference.md)\n### GLUE 下游任务微调\n\n- [单卡微调](./single_finetune.md)\n\n\n## 参数释义\n\n\n### 全局信息\n全局参数指定训练的batch size，以及设备、随机种子等信息；除此之外，模型训练/验证/推理等过程中的必要参数设置也在这里完成。\n```yaml\n  Global:\n    device: gpu\n    seed: 1024\n\n    global_batch_size: \n    local_batch_size: 1\n    micro_batch_size: 1\n\n    max_steps: 500000\n    num_train_epochs: 1\n    accumulate_steps: \n    logging_freq: 1\n    eval_freq: 500\n    eval_iters: 10\n    test_iters:\n    mix_precision:\n      enable: True\n      dtype: \"float16\"\n      level: \"O2\"\n      scale_loss: 32768.0\n      custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n      custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    save_load:\n      save_steps: 1000\n      save_epoch: 1\n      output_dir: ./output\n      ckpt_dir:\n```\n其中参数对应的释义如下：\n\n| **参数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| device | 设备信息 |\n| seed | 随机数种子 |\n| global_batch_size | 全局的batch size大小，即一次参数更新等效的batch size |\n| local_batch_size  | 每个进程训练的batch size大小                  |\n| micro_batch_size  | 每次前向计算的batch size大小                  |\n| max_steps         | 最大训练步数                               |\n| num_train_epochs  | 训练的epoch数量                           |\n| accumulate_steps  | 梯度累加次数                           |\n| logging_freq      | 训练日志打印的频率                            |\n| eval_freq         | 模型评估间隔                               |\n| eval_iters        | 模型评估时训练评估测试集的轮数               
       |\n| test_iters        | 模型测试或推理时的轮数                      |\n| enable            | 是否使用混合精度策略进行训练                     |\n| dtype             | 混合精度训练数据类型使用float16还是bfloat16，默认为float16类型 |\n| level             | 混合精度训练模式，默认``O2``模式                 |\n| scale_loss        | 使用fp16混合精度策略下，loss的放缩比例                  |\n| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的，它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 |\n| custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的，并且对性能至关重要。如果设置了白名单，该名单中的算子会使用float16/bfloat16计算 |\n| save_steps        | 保存模型间隔step数                         |\n| save_epoch        | 保存模型间隔epoch数                        |\n| output_dir        | 指定输出文件                              |\n| ckpt_dir          | checkpoint的加载目录                      |\n\n### 模型网络\n\n网络部分完成了网络的组网操作，GPT在[single_model.py](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/ppfleetx/models/language_model/gpt/dygraph/single_model.py)下。 \n可以使用配置文件配置模型的规模，如：\n\n```yaml\n  Model:\n    name: \"GPT\"\n    vocab_size: 50304\n    hidden_size: 1024\n    num_layers: 24\n    num_attention_heads: 16\n    ffn_hidden_size:\n    hidden_dropout_prob: 0.1\n    attention_probs_dropout_prob: 0.1\n    max_position_embeddings: 1024\n    type_vocab_size: 16\n    initializer_range: 0.02\n    use_recompute: True\n    recompute_granularity:\n    no_recompute_layers:\n    fused_linear: True\n    fuse_attn_qkv: True\n    sequence_parallel: False\n```\n\n其中参数对应的释义如下：\n| **参数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| vocab_size                   | 训练词表大小                 |\n| hidden_size                  | 隐藏层大小                  |\n| num_layers                   | transformer层数          |\n| num_attention_heads          | attention head的数量      |\n| max_seq_len                  | 输入文本序列的长度              |\n| ffn_hidden_size              | ffn层大小，一般为隐藏层的四倍       |\n| attention_probs_dropout_prob | 
attention中的dropout的失活率 |\n| max_position_embeddings      | position embedding的长度  |\n| type_vocab_size              | 词表类型                   |\n| initializer_range            | 参数初始化的范围               |\n| use_recompute     | 是否使用recompute训练                      |\n| recompute_granularity | recompute训练的粒度，可选 `full` `full_attn` `core_attn`，full即recompute全部transformer，full_attn表明只recompute所有self attention部分，core_attn表明只recompute `softmax(qkT)v` 部分。注：显存占用方面，`core_attn` > `full_attn` > `full`，若所选策略产生OOM错误，可以适当更改recompute_granularity |\n|no_recompute_layers| list of integer，标识哪些层的transformer不需要进行recompute。所有在该list中的值应该 >= 0 同时应该 < num_layers。向该参数中增加不进行recompute 的层数可以提升模型训练的整体吞吐，但是会适当的增加显存。若训练中发现有显存富裕，可以适当增加不进行recompute的层数。如果使用该参数后出现OOM错误，可以适当减小不进行recompute的层数。 ｜\n| fused_linear      | 是否使用fused_linear代替传统Linear加速训练。注：该功能需要cuda 11.6及以上编译的paddle支持。       |\n| fuse_attn_qkv     | 是否对attention层中的qkv计算使用fuse策略以加速训练 |\n| sequence_parallel | 是否使用序列并行策略以加速训练。注：只有混合并行的GPT才支持该功能，它与张量模型并行共用通信组，当mp_degree=1时，序列并行策略会被强制关闭。 |\n| virtual_pp_degree | 虚拟流水线并行维度，该参数会减小流水线bubble的占比以提升流水线的吞吐。但是该参数会增加流水线间的通讯，所以该参数的推荐值为2。并且，只有 num_layers可以被 pp_degree * virtual_pp_degree 整除时，才可以使用虚拟流水线并行。 |\n\n### 数据集\n\n数据集参数分为“Train”、“Eval”和“Test”三部分，分别对应模型预训练、离线评估、推理等三个模块。\n\n每个模型的配置参数都包含以下内容：\n\n```yaml\n  Data:\n    Train:\n      dataset:\n        name: GPTDataset\n        input_dir: ./data/\n        split: [949, 50, 1]\n        max_seq_len: 1024\n      sampler:\n        name: DistributedBatchSampler\n        shuffle: False\n        drop_last: True\n      loader:\n        num_workers: 1\n        return_list: False\n        collate_fn: gpt_collate_fn\n```\n\n其中参数对应的释义如下：\n| **参数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| dataset.name         | 指定自定义数据集的名称  |\n| input_dir         | 指定输入文件，可以使用目录，指定目录时将包括目录中的所有文件       |\n| split             | 训练集，验证集和测试集的切分比例                     |\n| max_seq_len       | 输入文本序列的长度                           
 |\n| sampler.name         | 指定自定义采样器的名称  |\n| shuffle         | 是否需要在生成样本下标时打乱顺序     |\n| drop_last             | 是否需要丢弃最后无法凑整一个mini-batch的样本        |\n| num_workers        | 用于加载数据的子进程个数  |\n| return_list         | 每个设备上的数据是否以list形式返回    |\n| collate_fn             | 通过此参数指定如果将样本列表组合为mini-batch数据；支持自定义     |\n\n\n### 优化器\n\n\nGPT训练默认使用AdamW优化器以及cosine学习率衰减，这里通过配置文件配置优化器的参数，如：\n\n```yaml\n  Optimizer:\n    name: AdamW\n    weight_decay: 0.01\n    beta1: 0.9\n    beta2: 0.999\n    epsilon: 1.0e-8\n    lr:\n      name: CosineAnnealingWithWarmupDecay\n      decay_steps: 360000\n      warmup_rate: 0.01\n      max_lr: 5.0e-5\n      min_lr: 1.0e-5\n    grad_clip:\n      name: \"ClipGradByGlobalNorm\"\n      clip_norm: 1.0\n    tensor_fusion: False\n```\n\n其中参数说明：\n\n| **参数名**      | **参数释义**                  |\n|--------------|---------------------------|\n| name | 指定自定义优化器的名称               |\n| weight_decay | weight的衰减率                |\n| beta1   | 一阶矩估计的指数衰减率               |\n| beta2   | 二阶矩估计的指数衰减率               |\n| epsilon | 指定优化器需要优化的参数              |\n| lr.name | 指定自定义学习率策略的名称               |\n| decay_steps  | 衰减的步长                     |\n| warmup_rate  | warmup 率                  |\n| max_lr       | Adam 的初始最大学习率             |\n| min_lr       | Adam 的初始最小学习率             |\n| grad_clip.name    | 指定自定义梯度裁剪策略的名称 |\n| clip_norm    | 所允许的范数最大值 |\n| tensor_fusion    | 是否使用tensor_fustion功能加速训练 |\n\n另外，[Profiler](./hybrid_profiler.md)中还介绍了在 GPT 中开启 Profiler 并分析调试分析结果的方法及相关的参数解释。\n\n### 模型压缩\nPaddleFleetX 集成了 PaddleSlim 中的常见的压缩方法：量化训练（Qutization Aware Training，QAT）、结构化稀疏（Structured Pruning，SP）和知识蒸馏（Knowledge Distillation，KD）。详细参数介绍见[模型压缩介绍](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/docs/compression.md)。\n\n\n## 参考文献\n- [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)\n- [Language Models are Few-Shot 
Learners](https://arxiv.org/pdf/2005.14165.pdf)\n- [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413)\n"
  },
  {
    "path": "examples/transformer/models/GPT/docs/hybrid_parallel.md",
    "content": "# GPT 混合并行模型训练\n\n当训练超大模型时，就必须借助混合并行策略，混合并行策略分别指数据并行、张量模型并行、流水线并行和分组切片并行。其中数据并行保存完整的模型参数并独立处理一份子数据集，以加速模型训练过程；张量模型并行将网络中的张量（Tensor）切分到不同的设备，从而降低单个设备的显存消耗；流水线并行将模型的不同层放置到不同的计算设备，降低单个计算设备的显存消耗；分组切片并行将参数和模型状态划分到不同卡上，每个GPU只保存部分副本，以减少显存占用。联合四种训练方式，可以实现更大模型、更快训练的效果。具体策略以及相关FleetAPI介绍可以参考以下教程：\n\n- [数据并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/data_parallel/index_cn.html)\n\n- [张量模型并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/model_parallel_cn.html\n)\n- [流水线并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/pipeline_parallel_cn.html)\n\n- [分组切片并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/group_sharded_parallel_cn.html)\n\n\n## 参数释义\n\n### 并行维度\n\n当前GPT模型已适配3D混合并行，并能够在训练超大模型，用户可以通过配置文件选择并行的维度。\n\n```yaml\n  Distributed:\n    dp_degree: 2\n    mp_degree: 2\n    pp_degree: 2\n    sharding:\n      sharding_degree: 1\n      sharding_stage: 1\n      sharding_offload: False\n      reduce_overlap: False\n      broadcast_overlap: False\n```\n\n其中参数说明：\n\n| **参数名**          | **参数释义**                             |\n|------------------|--------------------------------------|\n| dp_degree        | 数据并行维度                               |\n| mp_degree        | 张量模型并行维度                             |\n| pp_degree        | 流水线并行维度                              |\n| sharding_degree  | 分组切分并行维度                             |\n| sharding_stage   | 切分策略；1表示仅切分优化器状态，2表示再切分梯度，3表示再切分前向参数 |\n| sharding_offload | CPU offload策略                        |\n|reduce_overlap| 是否在sharding stage 2的模式下进行reduce通讯与反向计算的overlap，该策略暂时不支持sharding_offload|\n|broadcast_overlap| 是否在sharding stage 2的模式下进行broadcast通讯与下一个batch的 前向计算的overlap，该策略暂时不支持sharding_offload。若使用该模型，在evaluation与save之前，必须调用 `paddle.device.cuda.synchronize()` 方法|\n\n## 运行方式\n本目录中按照345M、1.3B、6.7B和175B规模大小，给出32G 
V100环境下GPT模型混合并行训练的策略配置如下：\n\n| 模型规模 | 训练策略                 | yaml文件                   |\n|----------|---------------------------|------------------------------|\n| 345M     | fp16+mp8+qat              | qat_gpt_345M_mp8.yaml    |\n| 1.3B     | fp16+dp8+recompute        | pretrain_gpt_1.3B_dp8.yaml   |\n| 6.7B     | fp16+sharding16+recompute | pretrain_gpt_6.7B_sharding16.yaml  |\n| 175B     | fp16+mp8+pp16+recompute   | pretrain_gpt_175B_mp8_pp16.yaml   |\n\n若要在显存容量更小的16G V100环境下进行GPT大模型训练，可将对应yaml文件中的`Model`-`hidden size`值改为原来的1/2即可。\n\n### 策略支持\n\n飞桨的混合并行技术包括4个维度：数据并行、张量模型并行、流水线并行和分组切片并行，此外还支持重计算、offload、混合精度、序列并行等策略，来减少显存占用、加速训练。\n\n目前，GPT模型训练已支持前3个维度的任意策略组合，但分组切片并行stage2/3仅支持与数据并行策略组合使用；详见下表。\n\n|                 | data parallel | tensor parallel | pipeline parallel | pure fp16 | recompute |\n|-----------------|---------------|-----------------|-------------------|-----------|-----------|\n| sharding stage1 | ✓             | ✓               | ✓                 | ✓         | ✓         |\n| sharding stage2 | ✓             | ㄨ               | ㄨ                 | ✓         | ✓         |\n| sharding stage3 | ✓             | ㄨ               | ㄨ                 | ✓         | ✓         |\n\n### 单机训练\n\n以单机1.3B模型数据并行训练为例，通过``paddle.distributed.launch``启动多进程训练，该gpt程序需要8卡32G V100以运行。\n\n**启动命令**\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\nlog_dir=log_dp8\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    pretrain/run.py \\\n    -c pretrain/configs/pretrain_gpt_1.3B_dp8.yaml\n```\n\n若要在显存容量更小的16G V100环境下进行GPT模型单机训练，可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练，命令如下：\n\n**启动命令**\n```shell\nlog_dir=log_dp8\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    pretrain/run.py \\\n    -c pretrain/configs/pretrain_gpt_1.3B_dp8.yaml \\\n    -o 
Model.hidden_size=1024\n```\n\n每张GPU的运行日志`workerlog.x`可在launch命令中指定的`log_dir`路径下找到；若未指定，日志路径为`log/workerlog.x`。运行日志具体内容如下：\n\n**运行日志**\n\n```\n[2022-09-21 05:43:58,797] [    INFO] - [train] epoch: 0, batch: 0, loss: 10.992407799, avg_batch_cost: 5.51734 sec, speed: 0.18 step/s, ips_total: 11878 tokens/s, ips: 1485 tokens/s, learning rate: 2.77778e-08\n[2022-09-21 05:43:59,508] [    INFO] - [train] epoch: 0, batch: 1, loss: 11.000075340, avg_batch_cost: 0.71029 sec, speed: 1.41 step/s, ips_total: 92267 tokens/s, ips: 11533 tokens/s, learning rate: 4.16667e-08\n[2022-09-21 05:44:00,242] [    INFO] - [train] epoch: 0, batch: 2, loss: 11.017463684, avg_batch_cost: 0.73301 sec, speed: 1.36 step/s, ips_total: 89406 tokens/s, ips: 11176 tokens/s, learning rate: 5.55556e-08\n[2022-09-21 05:44:00,965] [    INFO] - [train] epoch: 0, batch: 3, loss: 10.983654976, avg_batch_cost: 0.72319 sec, speed: 1.38 step/s, ips_total: 90620 tokens/s, ips: 11328 tokens/s, learning rate: 6.94444e-08\n[2022-09-21 05:44:01,678] [    INFO] - [train] epoch: 0, batch: 4, loss: 11.014451981, avg_batch_cost: 0.71223 sec, speed: 1.40 step/s, ips_total: 92016 tokens/s, ips: 11502 tokens/s, learning rate: 8.33333e-08\n[2022-09-21 05:44:02,385] [    INFO] - [train] epoch: 0, batch: 5, loss: 11.005180359, avg_batch_cost: 0.70707 sec, speed: 1.41 step/s, ips_total: 92687 tokens/s, ips: 11586 tokens/s, learning rate: 9.72222e-08\n[2022-09-21 05:44:03,100] [    INFO] - [train] epoch: 0, batch: 6, loss: 10.989698410, avg_batch_cost: 0.71402 sec, speed: 1.40 step/s, ips_total: 91785 tokens/s, ips: 11473 tokens/s, learning rate: 1.11111e-07\n[2022-09-21 05:44:03,806] [    INFO] - [train] epoch: 0, batch: 7, loss: 10.992337227, avg_batch_cost: 0.70554 sec, speed: 1.42 step/s, ips_total: 92888 tokens/s, ips: 11611 tokens/s, learning rate: 1.25000e-07\n[2022-09-21 05:44:04,516] [    INFO] - [train] epoch: 0, batch: 8, loss: 10.972790718, avg_batch_cost: 0.71011 sec, speed: 1.41 step/s, ips_total: 92290 
tokens/s, ips: 11536 tokens/s, learning rate: 1.38889e-07\n[2022-09-21 05:44:05,228] [    INFO] - [train] epoch: 0, batch: 9, loss: 10.983499527, avg_batch_cost: 0.71128 sec, speed: 1.41 step/s, ips_total: 92138 tokens/s, ips: 11517 tokens/s, learning rate: 1.52778e-07\n```\n\n### 多机训练\n\n若需要在更多机器上进行大模型训练，则需要在每个参与训练的节点上设置master节点ip/port信息后执行启动命令（master节点ip为训练所用某一台机器的ip即可）。\n\n以2机16卡32G V100上的6.7B模型分组切分并行训练为例，启动命令为：\n\n```shell\nmaster_ip=master节点ip\nmaster_port=可用的空闲端口号\n\nlog_dir=log_sharding16\npython -m paddle.distributed.launch --log_dir $log_dir \\\n    --master=$master_ip:$master_port --nnodes=2 --devices \"0,1,2,3,4,5,6,7\" \\\n    pretrain/run.py -c pretrain/configs/pretrain_gpt_6.7B_sharding16.yaml\n```\n\n若要在显存容量更小的16G V100环境下进行GPT模型两机训练，也可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练，命令如下：\n\n```shell\nmaster_ip=master节点ip\nmaster_port=可用的空闲端口号\n\nlog_dir=log_sharding16\npython -m paddle.distributed.launch --log_dir $log_dir \\\n    --master=$master_ip:$master_port --nnodes=2 --devices \"0,1,2,3,4,5,6,7\" pretrain/run.py \\\n    -c pretrain/configs/pretrain_gpt_6.7B_sharding16.yaml \\\n    -o Model.hidden_size=2048\n```\n\n若要执行16机175B大模型混合并行训练，运行启动命令为：\n\n```shell\nmaster_ip=master节点ip\nmaster_port=可用的空闲端口号\n\nlog_dir=log_mp8_pp16\npython -m paddle.distributed.launch --log_dir $log_dir \\\n    --master=$master_ip:$master_port --nnodes=16 --devices \"0,1,2,3,4,5,6,7\" pretrain/run.py \\\n    -c pretrain/configs/pretrain_gpt_175B_mp8_pp16.yaml\n```\n\n当节点较多时，可以考虑使用 `ssh` 脚本或 `mpirun` 进行跨节点命令分发。\n\n### 量化训练\n\n\n若需要对模型进行量化训练，按照以上在配置文件中添加量化参数，可参考`qat_gpt_345M_mp8.yaml`，量化训练时可以适当减少训练轮数和学习率。以单机345M模型模型并行训练为例，通过``paddle.distributed.launch``启动多进程训练，该gpt程序需要8卡32G V100以运行，命令如下：\n\n```shell\nlog_dir=log_mp8\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" pretrain/run.py \\\n    -c pretrain/configs/qat_gpt_345M_mp8.yaml \\\n    -o Global.max_steps=100000 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o 
Optimizer.lr.max_lr=5.0e-6 \\\n    -o Optimizer.lr.min_lr=1.0e-6 \n```\n\n\n# GPT Zero-shot 文本生成\n\n## 参数释义\n\n```yaml\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n```\n\n其中参数说明：\n\n| **参数名**      | **参数释义**                  |\n|--------------|---------------------------|\n| top_k | 每次为采样挑选保留分数最高的 k 个 token        |\n| top_p   | 如果设置小于 1.0 的小数，则保留加起来为 top_p 或更高的最可能的概率的 token。默认值为 1.0        |\n| temperature   |  调节下一个 token 的概率温度，logits = logits / temperature，默认值为 1.0           |\n| min_dec_len | 最小生成 token 长度              |\n| max_dec_len  | 最大生成 token 长度                     |\n| num_return_sequences  | 每个输入生成的序列个数，默认值为 1                  |\n| decode_strategy       | 解码策略，默认值为 \"sampling\"，目前只支持 \"sampling\"，未来会支持 \"greedy_search\"，\"beam_search\" |\n\n## 文本生成\n\n下载预训练好的模型，快速体验文本生成\n\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n\n# --devices 根据并行策略设置设备\n\npython -m paddle.distributed.launch --devices \"0\" generation/run.py \\\n    -c generation/configs/generation_gpt_345M_dp8.yaml \\\n    -o Global.save_load.ckpt_dir=./ckpt/PaddleFleetX_GPT_345M_220826/\n\n# 生成的文本，由于 checkpoint 不同，超参不同，随机数不同，您执行可能会生成不一样的内容\n\nPrompt: Hi, GPT2. Tell me who Jack Ma is.\nGeneration: Hi, GPT2. Tell me who Jack Ma is. I don’t want to hear that.”\n\nFor now, the only question the crowd is asking is whether or not Jack Ma will step down from the board of directors of Alibaba.\n\nJack Ma on why he never wanted to run for President in 2016:\n\nThere were two reasons. One is that I wanted to spend more time with my family. I thought it was better to spend more time with my family and spend more time with my children. So it was a very personal reason. 
But the second reason was that I thought it would be difficult to get elected, because there are a lot of political interests in this country. So I thought it was better to spend more time with my family.\n\nOn how Alibaba will evolve into a new player in China’s transportation and logistics sector:\n\nI think that we are going to become a very important player in the logistics industry. So our strategy is to make it easy for people to travel.\n```\n\n### 剖析体验文本生成\n\n#### GPT 文本生成模块初始化\n\n```python\n    module = build_module(cfg)\n    module.model.eval()\n```\n\n#### 预训练模型加载\n\n```python\n    # 获取到预训练 checkpoint 的根目录\n    ckpt_dir = cfg.Global.save_load.ckpt_dir\n\n    # 构造出具体路径\n    model_path = os.path.join(ckpt_dir, \"model.pdparams\")\n\n    # 加载模型参数\n    model_dict = paddle.load(model_path)\n\n    # FP16 模型参数转成 FP32 模型参数\n    for key, value in model_dict.items():\n        model_dict[key] = model_dict[key].astype(paddle.float32)\n\n    # 设置模型参数为预训练参数\n    module.model.set_state_dict(model_dict)\n```\n\n#### 文本生成与结果展示\n\n```python\n    input_text = \"Historical Records: Tell us about the history of the Great Wall.\"\n    result = module.generate(input_text)\n\n    print(f'Prompt: {input_text}')\n    print(f'Generation: {result[0]}')\n```\n"
  },
  {
    "path": "examples/transformer/models/GPT/docs/hybrid_profiler.md",
    "content": "# Profiler\n\n本文档主要包括在 GPT 中开启 Profiler 并分析调试分析结果的方法，在模型开发中使用 Profiler 分析工具的方法请参考[教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)和[API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/profiler/Profiler_cn.html)。\n\n## 参数配置\n\n使用 Profiler 功能需要在任务配置文件中添加 Profiler 配置信息并确保字段为 `enable: True` 以开启分析器。\n\n完整的可配置参数如下所示，可以根据使用场景调整配置。\n\n```\nProfiler:\n  enable: True\n  scheduler: [1, 5]\n  profiler_log: log_path\n  detailed: True\n  record_shapes: True\n  profile_memory: True\n  summary:\n    overview: True\n    device: True\n    model: True\n    dist: True\n    kernel: True\n    op: True\n    mem: True\n    memcpy: True\n```\n\n其中参数说明：\n\n| **参数名**                      | **参数释义**               |  **默认值** |\n|------------------------------|------------------------|------------------------|\n|  enable |   是否开启 Profiler | False |\n|  scheduler  | 定义分析区间，如 [1, 5] 记录 step 1 到 step 4 的分析数据 | None |\n|  profiler_log  | 日志文件目录 |   profiler_log |\n|  detailed  | 是否显示详细信息 |   False |\n|  record_shapes  |   是否记录 tensor shape 相关信息 | True |\n|  profile_memory |   是否统计 memory 相关信息 | True |\n\n其中，当 detailed=True 时会打印所有 summary 表格数据，当 detailed=False 时用户可以根据以下说明定制需要展示的表格信息。\n\n| **参数名**                      | **参数释义**               |  **默认值** |\n|------------------------------|------------------------|------------------------|\n|  summary.overview | 显示每种类型的 Event 时间消耗 |  True |\n|  summary.device | 显示 CPU 和 GPU 的平均利用率信息 |  False |\n|  summary.model  | 显示模型 dataloader、forward、backward、optimization 时间消耗 |  True |\n|  summary.dist  | 显示计算、通信以及重叠时间 |  False |\n|  summary.kernel  | 显示 GPU 执行的 kernel 信息 |  True |\n|  summary.op  | 显示框架中算子 (op) 的执行信息 |  True |\n|  summary.mem  | 显示内存/显存占用统计信息 |  False |\n|  summary.memcpy  | 显示框架中调用内存操作所花费的时间 | False |\n\n## 运行分析\n\n本节以 gpt混合并行 为例，首先进入目录，\n\n```\ncd PaddleFleetX/examples/transformer/models/GPT # 
如果已在此目录下，则忽略\n```\n\n\n修改`pretrain/configs/pretrain_gpt_base.yaml` 中 Profiler.enable 为 True, 同时可以根据上节说明调整相关配置，或者使用命令行参数覆盖，例如可以使用以下命令运行程序，\n```\npython -m paddle.distributed.launch \\\n    ./pretrain/run.py -c \\\n    ./pretrain/configs/pretrain_gpt_1.3B_dp8.yaml -o Profiler.enable=True\n\n```\n\n> 在使用 Profiler 工具进行性能分析时，建议减少 train 的步数，获得分析数据即可停止训练。\n\n## 结果分析\n\n在训练结束后会有以下数据：\n\n* 根据配置信息在控制台打印 summary 表格\n* 在配置的 `profiler_log` 目录保存 profiler json 文件\n\n这里保存的 json 文件可以通过如下两种方式查看：\n\n* 在 chrome 浏览器中打开 chrome://tracing/，然后打开 json 文件查看\n* 根据控制台信息安装并启动 `visualdl --logdir log_path` 然后根据提示在浏览器中**性能分析**模块查看\n\n具体的信息含义解释以及分析方法请参考[文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)。\n\n> 在使用 visualdl 时，如果 log 文件数据较大，启动会比较耗时，请耐心等待。\n\n## 附录\n\n控制台打印的 summary 信息示例如下所示。\n\n**Overview Summary**\n```\n---------------------------------------------Overview Summary---------------------------------------------\nTime unit: ms\n-------------------------  -------------------------  -------------------------  -------------------------\nEvent Type                 Calls                      CPU Time                   Ratio (%)\n-------------------------  -------------------------  -------------------------  -------------------------\nProfileStep                4                          18591.04                   100.00\n  CudaRuntime              87527                      8555.11                    46.02\n  Operator                 21912                      1883.11                    10.13\n  UserDefined              13116                      1841.33                    9.90\n  OperatorInner            33668                      1018.39                    5.48\n  Forward                  8                          731.46                     3.93\n  Backward                 4                          671.82                     3.61\n  Optimization             4                          315.91                     1.70\n  
Dataloader               4                          1.37                       0.01\n-------------------------  -------------------------  -------------------------  -------------------------\n                           Calls                      GPU Time                   Ratio (%)\n-------------------------  -------------------------  -------------------------  -------------------------\n  Kernel                   16092                      4924.90                    26.49\n  Memcpy                   4278                       3617.26                    19.46\n  Memset                   780                        2.31                       0.01\n  Communication            192                        2363.13                    12.71\n-------------------------  -------------------------  -------------------------  -------------------------\n```\n\n**Model Summary**\n\n```\n-----------------------------------------------------Model Summary-----------------------------------------------------\nTime unit: ms\n---------------  ------  -----------------------------------------------  ---------------------------------------------  \nName             Calls   CPU Total / Avg / Max / Min / Ratio(%)           GPU Total / Avg / Max / Min / Ratio(%)         \n---------------  ------  -----------------------------------------------  ---------------------------------------------  \nProfileStep      4       18591.04 / 4647.76 / 14114.47 / 757.27 / 100.00  4924.90 / 1231.22 / 2853.61 / 682.04 / 100.00  \n  Dataloader     4       1.37 / 0.34 / 0.85 / 0.16 / 0.01                 0.00 / 0.00 / 0.00 / 0.00 / 0.00               \n  Forward        8       731.46 / 91.43 / 133.28 / 49.03 / 3.93           714.83 / 89.35 / 174.91 / 4.72 / 14.51         \n  Backward       4       671.82 / 167.96 / 168.29 / 167.52 / 3.61         1701.53 / 425.38 / 426.97 / 424.10 / 34.55     \n  Optimization   4       315.91 / 78.98 / 89.07 / 73.78 / 1.70            108.27 / 27.07 / 27.09 / 27.06 / 2.20       
   \n  Others         -       16870.48 / - / - / - / 90.75                     2400.27 / - / - / - / 48.74                    \n---------------  ------  -----------------------------------------------  ---------------------------------------------  \n```\n\n**Operator Summary**\n\n```\n----------------------------------------------------------------Operator Summary-----------------------------------------------------------------\nTime unit: ms\n----------------------------------------------------  ------  -----------------------------------------  ----------------------------------------\nName                                                  Calls   CPU Total / Avg / Max / Min / Ratio(%)     GPU Total / Avg / Max / Min / Ratio(%)\n----------------------------------------------------  ------  -----------------------------------------  ----------------------------------------\n-----------------------------------------------------------Thread: All threads merged------------------------------------------------------------\nGradNodePyLayer_RecomputeFunction_backward            96      663.37 / 6.91 / 17.17 / 4.01 / 18.56       1629.87 / 16.98 / 17.41 / 16.69 / 26.98\n  TransformerDecoderLayer                             96      262.68 / 2.74 / 5.91 / 1.90 / 39.60        661.18 / 6.89 / 7.11 / 6.73 / 40.57\n  backward                                            96      318.62 / 3.32 / 10.57 / 1.31 / 48.03       968.69 / 10.09 / 10.31 / 9.91 / 59.43\nmatmul dygraph                                        2312    200.13 / 0.09 / 1.61 / 0.04 / 5.60         1487.76 / 0.64 / 9.81 / 0.22 / 24.63\n  matmul infer_meta                                   964     1.42 / 0.00 / 0.01 / 0.00 / 0.71           0.00 / 0.00 / 0.00 / 0.00 / 0.00\n  matmul compute                                      964     71.38 / 0.07 / 1.59 / 0.03 / 35.67         644.02 / 0.67 / 9.81 / 0.22 / 43.29\n    MEMSET                                            192     - / - / - / - / -                          
0.42 / 0.00 / 0.00 / 0.00 / 0.07\n    volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn      384     - / - / - / - / -                          199.35 / 0.52 / 0.83 / 0.22 / 30.95\n    volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_nn      384     - / - / - / - / -                          263.96 / 0.69 / 0.79 / 0.59 / 40.99\n    volta_h884gemm_64x128_ldg8_nn                     192     - / - / - / - / -                          141.13 / 0.74 / 0.92 / 0.61 / 21.91\n    void cutlass::Kernel<cutlass_70_tensorop_f16_...  4       - / - / - / - / -                          39.15 / 9.79 / 9.81 / 9.78 / 6.08\n  matmul node_creation                                676     2.05 / 0.00 / 0.03 / 0.00 / 1.02           0.00 / 0.00 / 0.00 / 0.00 / 0.00\n...\n```\n\n**Kernel Summary**\n```\n---------------------------------------------------------------Kernel Summary---------------------------------------------------------------\nTime unit: ms\n------------------------------------------------------------------------------------------  ------  ----------------------------------------\nName                                                                                        Calls   GPU Total / Avg / Max / Min / Ratio(%)\n------------------------------------------------------------------------------------------  ------  ----------------------------------------\nncclKernel_AllReduce_RING_LL_Sum_half(ncclWorkElem)                                         96      2360.57 / 24.59 / 2202.54 / 0.46 / 47.93\nvolta_fp16_s884gemm_fp16_256x128_ldg8_f2f_nn                                                384     263.96 / 0.69 / 0.79 / 0.59 / 5.36\nvolta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn                                    384     241.74 / 0.63 / 0.84 / 0.22 / 4.91\nvoid paddle::operators::VectorizedRandomGenerator<phi::dtype::float16, unsigned char>       580     209.08 / 0.36 / 0.97 / 0.06 / 4.25\nvolta_h884gemm_64x128_ldg8_nn                                                               
288     203.89 / 0.71 / 0.92 / 0.57 / 4.14\nvolta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn                                                384     199.35 / 0.52 / 0.83 / 0.22 / 4.05\nvolta_h884gemm_256x64_ldg8_tn                                                               288     149.52 / 0.52 / 0.54 / 0.45 / 3.04\nvoid phi::funcs::VectorizedBroadcastKernel<phi::dtype::float16, phi::dtype::float16, ph...  1352    123.12 / 0.09 / 0.40 / 0.05 / 2.50\nvoid paddle::operators::SoftmaxMaskFuseUpperTriangleGPUKernel<phi::dtype::float16, 10>      192     122.37 / 0.64 / 0.66 / 0.60 / 2.48\nvoid cutlass::Kernel<cutlass_70_tensorop_f16_s884gemm_f16_256x128_nt_align8>                100     103.07 / 1.03 / 8.08 / 0.73 / 2.09\nvoid phi::funcs::VectorizedElementwiseKernel<phi::dtype::float16, paddle::operators::Cu...  292     90.80 / 0.31 / 0.83 / 0.06 / 1.84\nvolta_h884gemm_64x128_ldg8_nt                                                               192     79.76 / 0.42 / 0.43 / 0.40 / 1.62\nvoid Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eige...  576     75.36 / 0.13 / 0.20 / 0.07 / 1.53\n...\n```\n"
  },
  {
    "path": "examples/transformer/models/GPT/docs/inference.md",
    "content": "\n# 推理部署\n\n模型训练完成后，可使用飞桨高性能推理引擎Paddle Inference通过如下方式进行推理部署。\n\n## 1. 模型导出\n\n首先需要安装`ppfleetx-ops`\n\n```bash\ncd PaddleFleetX/ # 如果已在此目录下，则忽略\n\ncd ppfleetx/ops && python setup_cuda.py install && cd ../..\n```\n\n### 1.1 非量化模型导出\n\n以`GPT-3(345M)`模型为例，通过如下方式下载PaddleFleetX发布的训练好的权重。若你已下载或使用训练过程中的权重，可跳过此步。\n\n```bash\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n```\n\n通过如下方式进行推理模型导出\n导出单卡`GPT-3(345M)`模型：\n```bash\ncd PaddleFleetX/ # 如果已在此目录下，则忽略\n\nsh projects/gpt/auto_export_gpt_345M_single_card.sh\n```\n\n导出单卡`GPT-3(6.7B)`模型：\n```bash\ncd PaddleFleetX/ # 如果已在此目录下，则忽略\n\nsh projects/gpt/auto_export_gpt_6.7B_mp1.sh\n```\n\n导出8卡`GPT-3(175B)`模型：\n```bash\ncd PaddleFleetX/ # 如果已在此目录下，则忽略\n\nsh projects/gpt/auto_export_gpt_175B_mp8.sh\n```\n\n### 1.2 量化模型导出\n\n导出单卡`GPT-3(345M)`量化模型：\n\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\n# 为了方便快速体验，这里给出345M量化训练的模型，若已有量化模型，则无需下载\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_wo_analysis.tar\ntar xf GPT_345M_QAT_wo_analysis.tar\n\nexport CUDA_VISIBLE_DEVICES=0\npython generation/export.py \\\n    -c ./generation/configs/generation_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Global.save_load.ckpt_dir='./GPT_345M_QAT_wo_analysis/'\n```\n\n导出单卡`GPT-3(6.7B)`量化模型：\n\n```shell\nexport CUDA_VISIBLE_DEVICES=0\npython generation/export.py \\\n    -c ./generation/configs/generation_qat_gpt_6.7B_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0\n```\n\n## 2. 推理部署\n\n模型导出后，可通过`generation/inference.py`脚本进行推理部署。\n\n单卡`GPT-3(345M)`非量化模型推理\n```bash\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\npython generation/inference.py \\\n    -c generation/configs/inference_gpt_345M_single_card.yaml\n```\n\n\n## 3. 
Benchmark\n- 运行benchmark脚本\n```\ncd PaddleFleetX/ # 如果已在此目录下，则忽略\n\ncd ppfleetx/ops && python setup_cuda.py install && cd ../..\nbash projects/gpt/run_benchmark.sh\n```\n\n| 模型          | 输入长度 | 输出长度 | batch size | GPU卡数 | FP16推理时延 | INT8推理时延 |\n| :------------ | :------: | :------: | :--------: | :-----: | :----------: | :----------: |\n| GPT-3(345M)   |    128   |    8     |     1      |    1    |   18.91ms    |   18.30ms    |\n| GPT-3(345M)   |    128   |    8     |     2      |    1    |   20.01ms    |   18.88ms    |\n| GPT-3(345M)   |    128   |    8     |     4      |    1    |   20.83ms    |   20.77ms    |\n| GPT-3(345M)   |    128   |    8     |     8      |    1    |   24.06ms    |   23.90ms    |\n| GPT-3(345M)   |    128   |    8     |    16      |    1    |   29.32ms    |   27.95ms    |\n| GPT-3(6.7B)   |    128   |    8     |     1      |    1    |   84.93ms    |   63.96ms    |\n| GPT-3(6.7B)   |    128   |    8     |     2      |    1    |   91.93ms    |   67.25ms    |\n| GPT-3(6.7B)   |    128   |    8     |     4      |    1    |   105.50ms   |   78.98ms    |\n| GPT-3(6.7B)   |    128   |    8     |     8      |    1    |   138.56ms   |   99.54ms    |\n| GPT-3(6.7B)   |    128   |    8     |    16      |    1    |   204.33ms   |   140.97ms   |\n| GPT-3(175B)   |    128   |    8     |     1      |    8    |   327.26ms   |   230.11ms   |\n| GPT-3(175B)   |    128   |    8     |     2      |    8    |   358.61ms   |   244.23ms   |\n| GPT-3(175B)   |    128   |    8     |     4      |    8    |   428.93ms   |   278.63ms   |\n| GPT-3(175B)   |    128   |    8     |     8      |    8    |   554.28ms   |   344.00ms   |\n| GPT-3(175B)   |    128   |    8     |    16      |    8    |   785.92ms   |   475.19ms   |\n\n以上性能数据基于PaddlePaddle[每日版本](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-develop) ，依赖CUDA 11.6测试环境。\n"
  },
  {
    "path": "examples/transformer/models/GPT/docs/quantization_aware_training.md",
    "content": "\n# GPT模型量化训练\n\n本项目对语言模型 GPT 进行量化训练。目前，PaddleFleetX 提供了 [GPT-345M量化模型](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_w_analysis.tar) 的预训练模型文件；基于 [LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl)，采用 ACC(accuracy) 指标后的评估结果如下：\n\n| **模型文件** | **数据类型** | **ACC** |\n|---------|-----------|---------------|\n| GPT-345M | FP16 |  44.17%  |\n| GPT-345M | INT8 |  44.94%  |\n\n\n### 环境依赖和数据准备\n环境依赖和数据准备请参考[GPT文档](./README.md)。\n\n\n### 预训练模型准备\n量化训练需加载[GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型。\n\n**预训练模型下载命令**\n```shell\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar xf GPT_345M.tar.gz\n```\n\n### 量化训练\n\n- [345M模型单卡训练](../pretrain/configs/qat_gpt_345M_single_card.yaml)\n\n快速启动：\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\nexport CUDA_VISIBLE_DEVICES=0\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\npython pretrain/run.py \\\n    -c ./pretrain/configs/qat_gpt_345M_single_card.yaml \\\n    -o Global.max_steps=100000 \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o Optimizer.weight_decay=0.02 \\\n    -o Optimizer.lr.max_lr=5.0e-6 \\\n    -o Optimizer.lr.min_lr=1.0e-6 \\\n    -o Compress.pretrained='./PaddleFleetX_GPT_345M_220826'\n    \n```\n\n- [345M模型模型并行训练](../pretrain/configs/qat_gpt_345M_mp8.yaml)\n\n快速启动：\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    pretrain/run.py \\\n    -c ./pretrain/configs/qat_gpt_345M_mp8.yaml \\\n    -o Global.max_steps=100000 \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o 
Optimizer.weight_decay=0.02 \\\n    -o Optimizer.lr.max_lr=5.0e-6 \\\n    -o Optimizer.lr.min_lr=1.0e-6 \\\n    -o Compress.pretrained='./PaddleFleetX_GPT_345M_220826'\n```\n\nTips：尽管设置的最大训练轮数为100000轮，但根据实验经验，4000轮即可达到最优效果。\n\n\n### 量化训练精度调优\n针对生成式预训练语言模型的模型压缩一直是学界的难点，潜在的原因目前并不清楚。经我们研究分析发现，生成式预训练语言模型的Transformer层的权重分布差异较大，且由于生成式预训练语言模型的从左到右预测的性质，量化误差会逐步累积，精度损失较大。为了保证量化模型的精度，PaddleSlim提供量化训练敏感度分析工具，可以有效定位模型中量化损失较大的层，以规避一些敏感层并提高量化模型精度。\n\nPaddleSlim中的量化训练敏感度分析工具仅支持静态图模型，需要将量化模型导出为静态图模型。导出命令为：\n\n```shell\n# 下载未经过分析的量化模型\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_wo_analysis.tar\ntar xf GPT_345M_QAT_wo_analysis.tar\n\nexport CUDA_VISIBLE_DEVICES=0\n\npython pretrain/export.py \\\n    -c ./pretrain/configs/export_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Global.save_load.ckpt_dir='./GPT_345M_QAT_wo_analysis/'\n```\n\n具体步骤可参考\n[GPT量化训练敏感度分析示例](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/quantization_analysis/GPT/README.md)。\n\n\n\n### 模型验证\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\n# 下载验证数据\nwget https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl\n\n# 下载已经训练好的量化模型\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_w_analysis.tar\ntar xf GPT_345M_QAT_w_analysis.tar\n\nexport CUDA_VISIBLE_DEVICES=0\npython offline-eval/run.py \\\n    -c ./offline-eval/configs/eval_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Global.save_load.ckpt_dir='./GPT_345M_QAT_w_analysis' \\\n    -o Offline_Eval.eval_path=./lambada_test.jsonl \\\n    -o Offline_Eval.cloze_eval=True \n```\n\n### 模型导出\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\n# 下载已经训练好的量化模型，若已有量化模型，不需要下载\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_wo_analysis.tar\ntar xf 
GPT_345M_QAT_wo_analysis.tar\n\nexport CUDA_VISIBLE_DEVICES=0\npython generation/export.py \\\n    -c ./generation/configs/generation_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Global.save_load.ckpt_dir='./GPT_345M_QAT_wo_analysis/'\n```\n"
  },
  {
    "path": "examples/transformer/models/GPT/docs/single_card.md",
    "content": "# GPT 单卡模型训练\n\n## 运行方式\n\n本文档按照345M和1.3B规模大小，给出32G V100环境下GPT模型单卡训练的策略配置如下：\n\n| 模型规模 | 训练策略       | yaml文件                    | 显存占用 |\n|----------|----------------|-------------------------------|----------|\n| 345M     | fp16           | pretrain_gpt_345M_single_card.yaml | 30.9GB   |\n| 1.3B     | fp16+recompute | pretrain_gpt_1.3B_single_card.yaml | 26.0GB   |\n\n**启动命令**\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\n# 345M\npython pretrain/run.py -c pretrain/configs/pretrain_gpt_345M_single_card.yaml\n\n# 1.3B\npython pretrain/run.py -c pretrain/configs/pretrain_gpt_1.3B_single_card.yaml\n```\n\n若要在显存容量更小的16G V100环境下进行GPT模型单机训练，可通过减小`Model.hidden_size`调整模型规模至合适大小，或使用重计算等显存优化策略再启动训练，命令如下：\n\n```shell\n# 345M\npython pretrain/run.py \\\n    -c pretrain/configs/pretrain_gpt_345M_single_card.yaml \\\n    -o Model.use_recompute=True\n\n# 1.3B\npython pretrain/run.py \\\n    -c pretrain/configs/pretrain_gpt_1.3B_single_card.yaml \\\n    -o Model.hidden_size=1024\n```\n\n**运行日志**\n\n```\n[2022-09-21 05:45:27,009] [    INFO] - [train] epoch: 0, batch: 0, loss: 10.999595642, avg_batch_cost: 2.53083 sec, speed: 0.40 step/s, ips_total: 3237 tokens/s, ips: 3237 tokens/s, learning rate: 2.77778e-08\n[2022-09-21 05:45:27,518] [    INFO] - [train] epoch: 0, batch: 1, loss: 10.997043610, avg_batch_cost: 0.50907 sec, speed: 1.96 step/s, ips_total: 16092 tokens/s, ips: 16092 tokens/s, learning rate: 4.16667e-08\n[2022-09-21 05:45:28,021] [    INFO] - [train] epoch: 0, batch: 2, loss: 10.994422913, avg_batch_cost: 0.50265 sec, speed: 1.99 step/s, ips_total: 16298 tokens/s, ips: 16298 tokens/s, learning rate: 5.55556e-08\n[2022-09-21 05:45:28,526] [    INFO] - [train] epoch: 0, batch: 3, loss: 11.005314827, avg_batch_cost: 0.50378 sec, speed: 1.98 step/s, ips_total: 16261 tokens/s, ips: 16261 tokens/s, learning rate: 6.94444e-08\n[2022-09-21 05:45:29,029] [    INFO] - [train] epoch: 0, batch: 4, loss: 10.988020897, avg_batch_cost: 
0.50237 sec, speed: 1.99 step/s, ips_total: 16307 tokens/s, ips: 16307 tokens/s, learning rate: 8.33333e-08\n[2022-09-21 05:45:29,531] [    INFO] - [train] epoch: 0, batch: 5, loss: 10.983006477, avg_batch_cost: 0.50179 sec, speed: 1.99 step/s, ips_total: 16326 tokens/s, ips: 16326 tokens/s, learning rate: 9.72222e-08\n[2022-09-21 05:45:30,035] [    INFO] - [train] epoch: 0, batch: 6, loss: 10.988540649, avg_batch_cost: 0.50379 sec, speed: 1.98 step/s, ips_total: 16261 tokens/s, ips: 16261 tokens/s, learning rate: 1.11111e-07\n[2022-09-21 05:45:30,540] [    INFO] - [train] epoch: 0, batch: 7, loss: 10.966930389, avg_batch_cost: 0.50387 sec, speed: 1.98 step/s, ips_total: 16258 tokens/s, ips: 16258 tokens/s, learning rate: 1.25000e-07\n[2022-09-21 05:45:31,044] [    INFO] - [train] epoch: 0, batch: 8, loss: 10.980175018, avg_batch_cost: 0.50365 sec, speed: 1.99 step/s, ips_total: 16265 tokens/s, ips: 16265 tokens/s, learning rate: 1.38889e-07\n[2022-09-21 05:45:31,562] [    INFO] - [train] epoch: 0, batch: 9, loss: 10.966150284, avg_batch_cost: 0.51796 sec, speed: 1.93 step/s, ips_total: 15816 tokens/s, ips: 15816 tokens/s, learning rate: 1.52778e-07\n```\n\n\n# GPT 单卡模型评估\n\n我们提供了对[WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip)、[LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl)两种数据集的评估脚本，其中数据集WikiText采用的是PPL(perplexity)评估指标，LAMBADA采用的是ACC(accuracy)指标。\n\n## 参数释义\n\n请在模型评估前将前述数据集下载到FleetX根目录下(WikiText数据集需要解压缩)，然后可以使用配置文件配置评估相关的参数，包括：\n\n```yaml\n  Offline_Eval:\n    eval_path: ./wikitext-103/wiki.valid.tokens\n    cloze_eval: False\n    overlapping_eval: 32\n    batch_size: 8\n    max_seq_len: 1024\n    logging_freq: 10\n```\n\n其中参数对应的释义如下：\n\n| **参数名**                      | **参数释义**          |\n|------------------------------|------------------------|\n| eval_path         | 评估数据集地址                      |\n| cloze_eval  | lambada数据集参数                     |\n| overlapping_eval  | 
wikitext数据集参数              |\n| batch_size         | 模型评估时batch size             |\n| max_seq_len        | 模型评估时文本序列长度           |\n| logging_freq     | 评估日志的打印频率                |\n\n## 运行方式\n\n以单卡345M模型评估为例，可以使用如下命令启动评估：\n\n### WikiText数据集评估\n\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n\nwget -O wikitext-103-v1.zip https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip\nunzip -q wikitext-103-v1.zip\n\nckpt_dir=ckpt/PaddleFleetX_GPT_345M_220826/\neval_dir=./wikitext-103\n\npython offline-eval/run.py -c offline-eval/configs/eval_gpt_345M_single_card.yaml \\\n    -o Global.save_load.ckpt_dir=$ckpt_dir \\\n    -o Offline_Eval.eval_path=$eval_dir/wiki.valid.tokens \\\n    -o Offline_Eval.overlapping_eval=32 \\\n    -o Offline_Eval.batch_size=16\n```\n\n评估日志如下：\n```shell\n[2022-09-21 05:28:26,263] [    INFO] - [eval] epoch: 0, batch: 0, loss: 0.170368048, speed: 0.29 step/s\n[2022-09-21 05:28:39,642] [    INFO] - [eval] epoch: 0, batch: 10, loss: 0.231640193, speed: 0.75 step/s\n[2022-09-21 05:28:53,469] [    INFO] - [eval] epoch: 0, batch: 20, loss: 0.292417919, speed: 0.72 step/s\n[2022-09-21 05:29:07,012] [    INFO] - [eval] epoch: 0, batch: 30, loss: 0.351391476, speed: 0.74 step/s\n[2022-09-21 05:29:27,359] [    INFO] - [eval] epoch: 0, batch: 40, loss: 0.415404772, speed: 0.49 step/s\n```\n\n评估结果如下：\n\n```shell\n[2022-09-21 05:40:32,820] [    INFO] - validation results on ./wikitext-103/wiki.valid.tokens | avg loss: 2.9554E+00 | ppl: 1.9210E+01 | adjusted ppl: 2.4948E+01 | token ratio: 1.0884484081583892\n```\n\n### LAMBADA数据集评估\n\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n\nwget -O 
lambada_test.jsonl https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl\n\nckpt_dir=ckpt/PaddleFleetX_GPT_345M_220826/\n\npython offline-eval/run.py -c offline-eval/configs/eval_gpt_345M_single_card.yaml \\\n    -o Global.save_load.ckpt_dir=$ckpt_dir \\\n    -o Offline_Eval.eval_path=./lambada_test.jsonl \\\n    -o Offline_Eval.cloze_eval=True \\\n    -o Offline_Eval.batch_size=16\n\n```\n\n评估日志如下：\n```shell\n[2022-09-21 05:18:24,152] [    INFO] - [eval] epoch: 0, batch: 0, number correct: 50.000000000, speed: 0.29 step/s\n[2022-09-21 05:18:37,264] [    INFO] - [eval] epoch: 0, batch: 10, number correct: 130.000000000, speed: 0.76 step/s\n[2022-09-21 05:18:50,408] [    INFO] - [eval] epoch: 0, batch: 20, number correct: 209.000000000, speed: 0.76 step/s\n[2022-09-21 05:19:03,578] [    INFO] - [eval] epoch: 0, batch: 30, number correct: 279.000000000, speed: 0.76 step/s\n[2022-09-21 05:19:16,760] [    INFO] - [eval] epoch: 0, batch: 40, number correct: 343.000000000, speed: 0.76 step/s\n```\n\n评估结果如下：\n\n```shell\n[2022-09-21 05:25:28,662] [    INFO] - validation results on ./lambada_test.jsonl | number correct: 2.1240E+03 | total examples: 5.1530E+03 | avg accuracy: 4.1219E-01\n```\n\n# GPT Zero-shot 文本生成\n\n## 参数释义\n\n```yaml\n  Generation:\n    top_k: 50\n    top_p: 0.75\n    temperature: 1.0\n    min_dec_len: 1\n    max_dec_len: 200\n    num_return_sequences: 1\n    decode_strategy: \"sampling\"\n```\n\n其中参数说明：\n\n| **参数名**      | **参数释义**                  |\n|--------------|---------------------------|\n| top_k | 每次为采样挑选保留分数最高的 k 个 token        |\n| top_p   | 如果设置小于 1.0 的小数，则保留加起来为 top_p 或更高的最可能的概率的 token。默认值为 1.0        |\n| temperature   |  调节下一个 token 的概率温度，logits = logits / temperature，默认值为 1.0           |\n| min_dec_len | 最小生成 token 长度              |\n| max_dec_len  | 最大生成 token 长度                     |\n| num_return_sequences  | 每个输入生成的序列个数，默认值为 1                  |\n| decode_strategy       | 解码策略，默认值为 \"sampling\"，目前只支持 
\"sampling\"，未来会支持 \"greedy_search\"，\"beam_search\" |\n\n## 文本生成\n\n下载预训练好的模型，快速体验文本生成\n\n### 快速体验文本生成\n\n\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n\npython generation/run.py \\\n    -c generation/configs/generation_gpt_345M_single_card.yaml \\\n    -o Global.save_load.ckpt_dir=./ckpt/PaddleFleetX_GPT_345M_220826/\n\n# 生成的文本，由于 checkpoint 不同，超参不同，随机数不同，您执行可能会生成不一样的内容\n\nPrompt: Hi, GPT2. Tell me who Jack Ma is.\nGeneration: Hi, GPT2. Tell me who Jack Ma is. I don’t want to hear that.”\n\nFor now, the only question the crowd is asking is whether or not Jack Ma will step down from the board of directors of Alibaba.\n\nJack Ma on why he never wanted to run for President in 2016:\n\nThere were two reasons. One is that I wanted to spend more time with my family. I thought it was better to spend more time with my family and spend more time with my children. So it was a very personal reason. But the second reason was that I thought it would be difficult to get elected, because there are a lot of political interests in this country. So I thought it was better to spend more time with my family.\n\nOn how Alibaba will evolve into a new player in China’s transportation and logistics sector:\n\nI think that we are going to become a very important player in the logistics industry. 
So our strategy is to make it easy for people to travel.\n```\n\n### 剖析体验文本生成\n\n#### GPT 文本生成模块初始化\n\n```python\n    module = build_module(cfg)\n    module.model.eval()\n```\n\n#### 预训练模型加载\n\n```python\n    # 获取到预训练 checkpoint 的根目录\n    ckpt_dir = cfg.Global.save_load.ckpt_dir\n\n    # 构造出具体路径\n    model_path = os.path.join(ckpt_dir, \"model.pdparams\")\n\n    # 加载模型参数\n    model_dict = paddle.load(model_path)\n\n    # FP16 模型参数转成 FP32 模型参数\n    for key, value in model_dict.items():\n        model_dict[key] = model_dict[key].astype(paddle.float32)\n\n    # 设置模型参数为预训练参数\n    module.model.set_state_dict(model_dict)\n```\n\n#### 文本生成与结果展示\n\n```python\n    input_text = \"Historical Records: Tell us about the history of the Great Wall.\"\n    result = module.generate(input_text)\n\n    print(f'Prompt: {input_text}')\n    print(f'Generation: {result[0]}')\n```\n"
  },
  {
    "path": "examples/transformer/models/GPT/docs/single_finetune.md",
    "content": "# GPT2 微调\n\n本教程主要针对于 GLUE (General Language Understanding Evaluation) benchmark 中的数据集进行微调，涉及到分类和回归任务。\n\n## 下载 GPT345M 预训练模型\n```\n# 如果已经下载可以忽略\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n```\n\n## 快速体验运行\n\n```\n# cd PaddleFleetX/examples/transformer/models/GPT\n# bash finetune/run_task.sh taskname [split]\n\n# taskname 可选: CoLA, SST2, MRPC, QQP, STSB, MNLI, QNLI, RTE, WNLI\n# 例如 bash finetune/run_task.sh CoLA\n\n# 注：当数据集为 MNLI 时，验证集有两种，分别是 dev_matched 和 dev_mismatched，\n# 其他数据集，只有一种验证集，因此不用选择\n# 可以通过 bash finetune/run_task.sh MNLI dev_matched\n# 或者 bash finetune/run_task.sh MNLI dev_mismatched\n# 进行 finetune 训练\n\nbash finetune/run_task.sh SST2\n```\n\n## GLUE benchmark 数据集\n\nGLUE benchmark 包含 9 个数据集，分别是 **CoLA**、**SST-2**、**MRPC**、**QQP**、**STS-B**、**MNLI**、**QNLI**、**RTE**、**WNLI**，涉及到 **自然语言推断**，**文本蕴含**，**情感分析**，**语义相似** 等任务，整体可以归位 3 类，分别是单句任务：CoLA、SST-2；相似性：MRPC、QQP、STS-B；释义：MNLI、QNLI、RTE、WNLI。\n\n以下介绍载自 [huggingface](https://huggingface.co/datasets/glue/blob/main/glue.py).\n\n* CoLA: The Corpus of Linguistic Acceptability consists of English acceptability judgments drawn from books and journal articles on linguistic theory. Each example is a sequence of words annotated with whether it is a grammatical English sentence.\n* SST-2: The Stanford Sentiment Treebank consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence. 
We use the two-way (positive/negative) class split, and use only sentence-level labels.\n* MRPC: The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.\n* QQP: The Quora Question Pairs2 dataset is a collection of question pairs from the community question-answering website Quora. The task is to determine whether a pair of questions are semantically equivalent.\n* STS-B: The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of sentence pairs drawn from news headlines, video and image captions, and natural language inference data. Each pair is human-annotated with a similarity score from 1 to 5.\n* MNLI: The Multi-Genre Natural Language Inference Corpus is a crowdsourced collection of sentence pairs with textual entailment annotations. Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The premise sentences are gathered from ten different sources, including transcribed speech, fiction, and government reports. We use the standard test set, for which we obtained private labels from the authors, and evaluate on both the matched (in-domain) and mismatched (cross-domain) section. We also use and recommend the SNLI corpus as 550k examples of auxiliary training data.\n* QNLI: The Stanford Question Answering Dataset is a question-answering dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question (written by an annotator). 
We convert the task into sentence pair classification by forming a pair between each question and each sentence in the corresponding context, and filtering out pairs with low lexical overlap between the question and the context sentence. The task is to determine whether the context sentence contains the answer to the question. This modified version of the original task removes the requirement that the model select the exact answer, but also removes the simplifying assumptions that the answer is always present in the input and that lexical overlap is a reliable cue.\n* RTE: The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli et al., 2009).4 Examples are constructed based on news and Wikipedia text. We convert all datasets to a two-class split, where for three-class datasets we collapse neutral and contradiction into not entailment, for consistency.\n* WNLI: The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task in which a system must read a sentence with a pronoun and select the referent of that pronoun from a list of choices. The examples are manually constructed to foil simple statistical methods: Each one is contingent on contextual information provided by a single word or phrase in the sentence. To convert the problem into sentence pair classification, we construct sentence pairs by replacing the ambiguous pronoun with each possible referent. The task is to predict if the sentence with the pronoun substituted is entailed by the original sentence. We use a small evaluation set consisting of new examples derived from fiction books that was shared privately by the authors of the original corpus. While the included training set is balanced between two classes, the test set is imbalanced between them (65% not entailment). 
Also, due to a data quirk, the development set is adversarial: hypotheses are sometimes shared between training and development examples, so if a model memorizes the training examples, they will predict the wrong label on corresponding development set example. As with QNLI, each example is evaluated separately, so there is not a systematic correspondence between a model's score on this task and its score on the unconverted original task. We call converted dataset WNLI (Winograd NLI).\n\n\n## 微调相关类\n\n### `GPTForSequenceClassification`\n在 GPT 模型输出的 logits 基础上，增加一个分类层，并且用正态分布对新增的层参数进行初始化。\n\n```\nself.score = nn.Linear(self.gpt.hidden_size, num_classes, bias_attr=False)\n\nfrom paddle.nn.initializer import Normal\nnormal_ = Normal(std=self.gpt.initializer_range)\nnormal_(self.score.weight)\n```\n\n## 超参数\n微调训练也需要一套完整的超参数，但是微调涉及的核心超参数并不多。\n\n### Global\n\n| 参数字段 | 参数含义 |\n| ------ | --------|\n|run_mode| 运行的模式，需要设置为 epoch 方式|\n|num_train_epochs| 需要 finetune 的 epoch 数 |\n\n```\nGlobal:\n  run_mode: epoch\n  num_train_epochs: 3 # WNLI 和 MRPC 数据集比较小，因此 `num_train_epochs=5`。\n```\n\n### Model\n\n| 参数字段 | 参数含义 |\n| ------ | --------|\n|name | 需要设置为 \"GPT\" |\n|num_classes | finetune 时的类别数，根据语料库以及任务来设定 |\n|pretrained | 预训练的权重文件路径前缀，去掉 \".pdparams\" |\n|loss.train.name | finetune 时的训练损失函数类名 |\n|loss.eval.name | finetune 时的验证损失函数类名 |\n|metric.eval.name | finetune 时的验证评估函数类名 |\n\n微调时，不同任务对应的类别数 和 loss 函数以及评测指标不同，因此需要通过配置来改变设置。\n```\nModel:\n  name: \"GPT\"\n  num_classes: 2 # 1 or 2 or 3\n  pretrained: 'path/to/pretrained_model'\n  \n  loss:\n    train:\n      name: 'CrossEntropyLoss'\n    eval:\n      name: 'CrossEntropyLoss'\n  \n  metric:\n    eval:\n      name: 'Accuracy'\n```\n\n### Optimizer 和 LRScheduler\n\n| 参数字段 | 参数含义 |\n| ------ | --------|\n|name| 优化器类名 |\n|weight_decay| 权重衰减值 |\n|beta1| FusedAdamW 的 beta1 |\n|beta2| FusedAdamW 的 beta2 |\n|epsilon| FusedAdamW 的 epsilon |\n|multi_precision| 当使用 FP16 O2 级别时，是否开启参数使用多精度表示 |\n|tensor_fusion| 是否开启 tensor_fusion 
|\n|lr.name| 学习率调整策略类名 |\n|lr.warmup| 当参数时小数时，表示 warmup 步数占总步数的比例，如果是整数时，则表示 warmup 的步数 |\n|lr.learning_rate| 初始化学习率值 |\n\n注：这里的超参会跟随优化器类的不同而不同，可以自行查看优化器类和学习率调整策略类初始化函数需要设置的超参数设定。\n\n```\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.0\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1e-6\n  multi_precision: True\n  tensor_fusion: False\n  lr:\n    name: LinearDecayWithWarmup\n    warmup: 0.1\n    learning_rate: 2e-5\n```\n\n### Data\n\n| 参数字段 | 参数含义 |\n| ------ | --------|\n|Train.dataset| 描述 finetune 时的数据集 |\n|Train.sampler| 描述 dataloader 所需要的 batch sampler |\n|Train.loader| 描述 dataloader 所需要的相关信息，例如 num_workers 等 |\n\n注：数据集的设定会根据不同任务不同语料库不同而设定不同，例如 `split` 字段，不同数据集是有不同的设定，请参考所需要 finetune 的数据集初始化函数。\n\n```\nData:\n  Train:\n    dataset:\n      name: SST2\n      root: ./dataset/SST-2/\n      split: 'train'\n      max_length: 128\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32\n      shuffle: True\n      drop_last: True\n    loader:\n      num_workers: 4\n      return_list: False\n  \n  Eval:\n    dataset:\n      name: SST2\n      root: ./dataset/SST-2/\n      split: 'dev'\n      max_length: 128\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32\n      shuffle: False\n      drop_last: False\n    loader:\n      num_workers: 4\n      return_list: False\n```\n\n## 运行\n\nGLUE benchmark 上的语料库 finetune，大部分设置相同，可以同享一份，只有少量区别处需要改变，因此可以通过超参数的覆盖方式来设置。\n\n数据集加载时会自动判断是否已经缓存下载，如果未缓存下载会自行下载，请保证网络的畅通。当自动下载失败时，可以尝试多次以及检查是否有代理设置等。当下载失败时，也可以自己下载及解压到对应的目录中。\n\n以下是 GLUE benchmark 上的每个语料库的 finetune 单机单卡启动命令：\n\n### CoLA 数据集\n```\npython finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=CoLA \\\n  -o Data.Train.dataset.root=./dataset/cola_public/ \\\n  -o Data.Eval.dataset.name=CoLA \\\n  -o Data.Eval.dataset.root=./dataset/cola_public/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.metric.train.name=Mcc \\\n  -o Model.metric.eval.name=Mcc\n  -o Model.num_classes=2\n```\n\n### 
SST2 数据集\n```\npython finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=SST2 \\\n  -o Data.Train.dataset.root=./dataset/SST-2/ \\\n  -o Data.Eval.dataset.name=SST2 \\\n  -o Data.Eval.dataset.root=./dataset/SST-2/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2\n```\n\n### MRPC 数据集\n```\npython finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Global.num_train_epochs=5 \\\n  -o Data.Train.dataset.name=MRPC \\\n  -o Data.Train.dataset.root=./dataset/MRPC/ \\\n  -o Data.Eval.dataset.name=MRPC \\\n  -o Data.Eval.dataset.root=./dataset/MRPC/ \\\n  -o Data.Eval.dataset.split=test \\\n  -o Model.num_classes=2 \\\n  -o Model.metric.train.name=AccuracyAndF1 \\\n  -o Model.metric.eval.name=AccuracyAndF1\n```\n\n### QQP 数据集\n```\npython finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=QQP \\\n  -o Data.Train.dataset.root=./dataset/QQP/ \\\n  -o Data.Eval.dataset.name=QQP \\\n  -o Data.Eval.dataset.root=./dataset/QQP/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2 \\\n  -o Model.metric.train.name=AccuracyAndF1 \\\n  -o Model.metric.eval.name=AccuracyAndF1\n```\n\n### STSB 数据集\n```\npython finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=STSB \\\n  -o Data.Train.dataset.root=./dataset/STS-B/ \\\n  -o Data.Eval.dataset.name=STSB \\\n  -o Data.Eval.dataset.root=./dataset/STS-B/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=1 \\\n  -o Model.metric.train.name=PearsonAndSpearman \\\n  -o Model.metric.eval.name=PearsonAndSpearman \\\n  -o Model.loss.train.name=MSELoss \\\n  -o Model.loss.eval.name=MSELoss\n```\n\n### MNLI 数据集\n\n注：MNLI 数据集验证集分为 `dev_matched` 和 `dev_mismatched`，目前暂不支持两个集合同时评测，如果要评测两种验证集，有两种方法：\n\n* 分别 finetune 2次，Data.Eval.dataset.split 设置不同的验证集\n* 保存 finetune 后的 
checkpoint，在不同验证集上离线评测。\n\n\n```\npython finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=MNLI \\\n  -o Data.Train.dataset.root=./dataset/multinli_1.0 \\\n  -o Data.Eval.dataset.name=MNLI \\\n  -o Data.Eval.dataset.root=./dataset/multinli_1.0 \\\n  -o Data.Eval.dataset.split=dev_matched \\\n  -o Model.num_classes=3\n```\n\n### QNLI 数据集\n```\npython finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=QNLI \\\n  -o Data.Train.dataset.root=./dataset/QNLI/ \\\n  -o Data.Eval.dataset.name=QNLI \\\n  -o Data.Eval.dataset.root=./dataset/QNLI/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2\n```\n\n### RTE 数据集\n```\npython finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=RTE \\\n  -o Data.Train.dataset.root=./dataset/RTE/ \\\n  -o Data.Eval.dataset.name=RTE \\\n  -o Data.Eval.dataset.root=./dataset/RTE/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2\n```\n\n### WNLI 数据集\n```\npython finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Global.num_train_epochs=5 \\\n  -o Data.Train.dataset.name=WNLI \\\n  -o Data.Train.dataset.root=./dataset/WNLI/ \\\n  -o Data.Eval.dataset.name=WNLI \\\n  -o Data.Eval.dataset.root=./dataset/WNLI/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2\n```\n\n\n## 运行结果\n\n以下的指标是通过 [GPT_345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 预训练模型 finetune 得到的结果，仅作为参考。\n\n| Corpus | Task                | Domanin            | Metric                       | Result                       |\n| ------ | ------------------- | ------------------ | ---------------------------- | ---------------------------- |\n| CoLA   | acceptability       | Misc.              
| Matthews corr                | 0.60471                      |\n| SST-2  | sentiment           | Movie reviews      | Accuracy                     | 0.93005                      |\n| MNLI   | NLI                 | Misc.              | Matched acc./Mismatched acc. | 0.84238/0.84815              |\n| QNLI   | QA/NLI              | Wikipedia          | Accuracy                     | 0.90445                      |\n| RTE    | NLI                 | News, Wikipedia    | Accuracy                     | 0.70397                      |\n| WNLI   | coreference         | Books              | Accuracy                     | 0.40845                      |\n| MRPC   | paraphrase          | News               | Accuracy/F1                  | 0.81913/0.87022              |\n| QQP    | paraphrase          | social QA question | Accuracy/F1                  | 0.86087/0.81055              |\n| STS-B  | sentence similarity | Misc.              | Pearson/Spearman corr.       | 0.85797/0.85824              |\n"
  },
  {
    "path": "examples/transformer/models/GPT/docs/structured_pruning.md",
    "content": "# GPT模型结构化稀疏\n\n本项目对语言模型 GPT 进行结构化稀疏（以下简称稀疏）。在 GPT 模型中，我们对 fused-qkv、out-linear、ffn1 和 ffn2 四层的权重进行了通道稀疏，其中，fused-qkv 和 ffn1 是在输出通道进行稀疏，out-linear 和 ffn2 是在输入通道进行稀疏。如果您需要自定义稀疏的层和通道，可以通过重写 ppfleetx/utils/compression_helper.py 中的 get_pruned_params() 函数实现。\n\n\n### 环境依赖和数据准备\n环境依赖和数据准备请参考[GPT训练文档](./README.md)。\n\n特别的，本示例需要依赖 PaddleSlim develop版本。安装命令如下：\n\n```shell\ngit clone https://github.com/PaddlePaddle/PaddleSlim.git & cd PaddleSlim\npip install -r requirements.txt\npython setup.py install\n```\n\n\n### 预训练模型准备\n稀疏训练需加载[GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型。\n\n**预训练模型下载命令**\n```shell\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar xf GPT_345M.tar.gz\n```\n\n### 稀疏训练\n\n- [345M模型稀疏训练](../pretrain/configs/prune_gpt_345M_single_card.yaml)\n\n快速启动：\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\nexport CUDA_VISIBLE_DEVICES=0\npython pretrain/run.py \\\n    -c ./pretrain/configs/prune_gpt_345M_single_card.yaml \\\n    -o Global.max_steps=100000 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o Optimizer.weight_decay=0.0 \\\n    -o Optimizer.lr.max_lr=2.5e-5 \\\n    -o Optimizer.lr.min_lr=5.0e-6 \\\n    -o Compress.pretrained='./PaddleFleetX_GPT_345M_220826'\n    \n```\n\n### 模型验证\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\n# 下载验证数据\nwget https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl\n\nexport CUDA_VISIBLE_DEVICES=0\npython offline-eval/run.py \\\n    -c ./offline-eval/configs/eval_pruned_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Global.save_load.ckpt_dir='./output' \\\n    -o Offline_Eval.eval_path=./lambada_test.jsonl \\\n    -o Offline_Eval.cloze_eval=True\n```\n\n### 模型导出\n```shell\ncd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下，则忽略\n\nexport CUDA_VISIBLE_DEVICES=0\npython 
generation/export.py \\\n    -c ./generation/configs/generation_pruned_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Global.save_load.ckpt_dir='./output'\n```\n"
  },
  {
    "path": "examples/transformer/models/GPT/finetune/configs/finetune_gpt_345M_single_card_glue.yaml",
    "content": "_base_: ./finetune_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 32\n  micro_batch_size: 32\n  \n  run_mode: epoch\n  num_train_epochs: 3\n  accumulate_steps:\n  logging_freq: 10\n  eval_freq: 1\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\", \"reduce_mean\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  name: \"GPT\"\n  num_classes: 2\n  pretrained: './ckpt/PaddleFleetX_GPT_345M_220826/model'\n  fuse_attn_qkv: True\n  fused_linear: False\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  \n  loss:\n    train:\n      name: 'CrossEntropyLoss'\n    eval:\n      name: 'CrossEntropyLoss'\n  \n  metric:\n    eval:\n      name: 'Accuracy'\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n    \nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.0\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1e-6\n  multi_precision: True\n  lr:\n    name: LinearDecayWithWarmup\n    warmup: 0.1\n    learning_rate: 2e-5\n  tensor_fusion: False\n    \n    \nData:\n  Train:\n    dataset:\n      name: SST2\n      root: ./dataset/SST-2/\n      split: 'train'\n      max_length: 128\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32\n      shuffle: True\n      drop_last: True\n    loader:\n      num_workers: 4\n      return_list: False\n  \n  Eval:\n    dataset:\n      name: SST2\n      root: ./dataset/SST-2/\n 
     split: 'dev'\n      max_length: 128\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32\n      shuffle: False\n      drop_last: False\n    loader:\n      num_workers: 4\n      return_list: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/finetune/configs/finetune_gpt_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 42\n\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n  run_mode: epoch\n  max_steps: -1\n  eval_freq: 1\n  eval_iters: -1\n  test_iters: -1\n  save_load:\n    save_steps: -1\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/finetune/impls.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\nimport numpy as np\n\nimport paddle\nimport paddle.distributed as dist\n\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.distributed.apis import env\nimport ppfleetx.models.language_model.gpt as gpt\nfrom ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer\nfrom examples.transformer.models.GPT.pretrain.impls import fit_impl as pretrain_fit_impl\n\nMODEL_CLASSES = {\n    \"GPT\": (GPTTokenizer, \"gpt2\"),\n    \"GPT-cn\": (GPTChineseTokenizer, \"gpt-cpm-large-cn\"),\n}\n\n\ndef _get_model_size(l, h, v, s):\n    P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h))\n    logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0))\n\n\ndef build_model(config):\n    nranks = dist.get_world_size()\n    model_setting = copy.deepcopy(config.Model)\n\n    loss_config = model_setting.pop(\"loss\", None)\n    metric_config = model_setting.pop(\"metric\", None)\n    pretrained = model_setting.pop(\"pretrained\")\n    num_classes = model_setting.pop(\"num_classes\", 2)\n    assert pretrained is not None\n\n    l = model_setting['num_layers']\n    h = model_setting['hidden_size']\n    v = model_setting['vocab_size']\n    num_heads = model_setting['num_attention_heads']\n    s = config.Data.Train.dataset.max_length\n    _get_model_size(l, h, v, s)\n\n    model_name = 
model_setting.pop(\"name\")\n    tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]\n    tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n    if nranks == 1:\n        model = gpt.GPTForSequenceClassification(\n            gpt.GPTModel(**model_setting), num_classes)\n    else:\n        raise NotImplementedError\n\n    pretrained_path = pretrained + \".pdparams\"\n    assert os.path.exists(pretrained_path), f'{pretrained_path} is not exists!'\n    model_dict = paddle.load(pretrained_path)\n\n    # Note(GuoxiaWang): Guess whether to convert fused vs non-fused parameters.\n    # 'q_proj' vs 'qkv_proj'\n    def is_fused(model_state):\n        for key in model_state:\n            if 'qkv_proj' in key:\n                return True\n        return False\n\n    def split_params(model_state, num_layers):\n        for idx in range(num_layers):\n            qkv_b = model_state.pop(\n                f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias')\n            qkv_w = model_state.pop(\n                f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight')\n\n            qkv_b = qkv_b.reshape((num_heads, 3, -1))\n            qkv_w = qkv_w.reshape((h, num_heads, 3, -1))\n\n            q_w, k_w, v_w = np.split(qkv_w, 3, axis=2)\n            q_w = q_w.reshape((h, -1))\n            k_w = k_w.reshape((h, -1))\n            v_w = v_w.reshape((h, -1))\n\n            q_b, k_b, v_b = np.split(qkv_b, 3, axis=1)\n            q_b = q_b.reshape((-1))\n            k_b = k_b.reshape((-1))\n            v_b = v_b.reshape((-1))\n\n            model_state[\n                f'gpt.decoder.layers.{idx}.self_attn.q_proj.bias'] = q_b\n            model_state[\n                f'gpt.decoder.layers.{idx}.self_attn.q_proj.weight'] = q_w\n\n            model_state[\n                f'gpt.decoder.layers.{idx}.self_attn.k_proj.bias'] = k_b\n            model_state[\n                f'gpt.decoder.layers.{idx}.self_attn.k_proj.weight'] = k_w\n\n            model_state[\n             
   f'gpt.decoder.layers.{idx}.self_attn.v_proj.bias'] = v_b\n            model_state[\n                f'gpt.decoder.layers.{idx}.self_attn.v_proj.weight'] = v_w\n\n        return model_state\n\n    def fuse_params(model_state, num_layers):\n        for idx in range(num_layers):\n            q_b = model_state.pop(\n                f'gpt.decoder.layers.{idx}.self_attn.q_proj.bias')\n            q_w = model_state.pop(\n                f'gpt.decoder.layers.{idx}.self_attn.q_proj.weight')\n\n            k_b = model_state.pop(\n                f'gpt.decoder.layers.{idx}.self_attn.k_proj.bias')\n            k_w = model_state.pop(\n                f'gpt.decoder.layers.{idx}.self_attn.k_proj.weight')\n\n            v_b = model_state.pop(\n                f'gpt.decoder.layers.{idx}.self_attn.v_proj.bias')\n            v_w = model_state.pop(\n                f'gpt.decoder.layers.{idx}.self_attn.v_proj.weight')\n\n            q_w = q_w.reshape((h, num_heads, -1))\n            k_w = k_w.reshape((h, num_heads, -1))\n            v_w = v_w.reshape((h, num_heads, -1))\n\n            qkv_w = np.stack([q_w, k_w, v_w], axis=2)\n            qkv_w = qkv_w.reshape((h, -1))\n\n            q_b = q_b.reshape((num_heads, -1))\n            k_b = k_b.reshape((num_heads, -1))\n            v_b = v_b.reshape((num_heads, -1))\n            qkv_b = np.stack([q_b, k_b, v_b], axis=1)\n            qkv_b = qkv_b.reshape((-1))\n\n            model_state[\n                f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight'] = qkv_w\n            model_state[\n                f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias'] = qkv_b\n        return model_state\n\n    fused = is_fused(model.state_dict())\n    load_fused = is_fused(model_dict)\n\n    if fused is True and load_fused is False:\n        model_dict = fuse_params(model_dict, l)\n    elif fused is False and load_fused is True:\n        model_dict = split_params(model_dict, l)\n\n    for name, param in model.state_dict().items():\n        if 
name in model_dict and param.dtype != model_dict[name].dtype:\n            model_dict[name] = model_dict[name].cast(param.dtype)\n\n    model.set_state_dict(model_dict)\n    logger.info(f'Load pretrained weight from {pretrained_path}')\n\n    # build loss fn\n    assert loss_config is not None\n    assert 'train' in loss_config and 'eval' in loss_config\n\n    train_loss = copy.deepcopy(loss_config.train)\n    train_loss_cls = train_loss.pop('name')\n    train_loss_fn = eval(f'paddle.nn.loss.{train_loss_cls}')(**train_loss)\n\n    eval_loss = copy.deepcopy(loss_config.eval)\n    eval_loss_cls = eval_loss.pop('name')\n    eval_loss_fn = eval(f'paddle.nn.loss.{eval_loss_cls}')(**eval_loss)\n\n    return model, tokenizer, train_loss_fn, eval_loss_fn\n\n\ndef fit_impl(config, batch, forward_func, **kwargs):\n    kwargs['model'].train()\n    loss = pretrain_fit_impl(config, batch, forward_func, **kwargs)\n\n    return loss\n\n\n@paddle.no_grad()\ndef eval_impl(config, batch, model, loss_fn, eval_metric):\n    model.eval()\n\n    use_fp16 = config.Global.mix_precision.enable\n    black_list = config.Global.mix_precision.custom_black_list\n    white_list = config.Global.mix_precision.custom_white_list\n\n    with paddle.amp.auto_cast(\n            use_fp16,\n            custom_black_list=black_list,\n            custom_white_list=white_list,\n            level='O2'):\n        input_ids, labels = batch\n\n        input_ids.stop_gradient = True\n        labels.stop_gradient = True\n\n        logits = model(input_ids)\n        loss = loss_fn(logits, labels)\n        correct = eval_metric.compute(logits, labels)\n        eval_metric.update(correct)\n\n    return loss\n"
  },
  {
    "path": "examples/transformer/models/GPT/finetune/run.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../')))\n\nfrom ppfleetx.distributed.apis import env, strategy, io\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.utils import device, log\nfrom ppfleetx.models.language_model import metrics\nfrom examples.transformer.utils import config as cfg\nfrom examples.transformer.utils import components as cpn\n\nimport impls\n\nif __name__ == \"__main__\":\n    # parse config from yaml\n    args = cfg.parse_args()\n    config = cfg.get_config(args.config, overrides=args.override, show=False)\n\n    paddle.set_device(config.Global.device)\n\n    # init distributed env\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        raise RuntimeError(\"Only support single-card finetune for GPT model.\")\n        env.init_dist_env(config)\n\n    env.set_seed(config.Global.seed)\n    cfg.print_config(config)\n\n    # build dataloader for training/eval\n    dataset = cpn.build_dataset(config.Data.Train.dataset)\n    sampler = cpn.build_batch_sampler(config.Data.Train.sampler, dataset)\n    train_data_loader = cpn.build_dataloader(config.Data.Train.loader, dataset,\n                             
                sampler)\n\n    dataset = cpn.build_dataset(config.Data.Eval.dataset)\n    sampler = cpn.build_batch_sampler(config.Data.Eval.sampler, dataset)\n    valid_data_loader = cpn.build_dataloader(config.Data.Eval.loader, dataset,\n                                             sampler)\n\n    # build GPT model\n    model, tokenizer, train_loss_fn, eval_loss_fn = impls.build_model(config)\n\n    if config.Global.mix_precision.enable:\n        scaler = paddle.amp.GradScaler(\n            init_loss_scaling=config.Global.mix_precision.scale_loss)\n        # Note: Save dtype is the same as model dtype. Also can set save_dtype='float32' when \n        # training with pure fp16 strategy, but will cause the rise of memory.\n        model = paddle.amp.decorate(models=model, level='O2')\n    else:\n        scaler = None\n\n    # build metric\n    model_setting = copy.deepcopy(config.Model)\n    metric_config = model_setting.pop(\"metric\", None)\n\n    assert metric_config is not None and 'eval' in metric_config\n\n    if 'train' in metric_config:\n        train_metric = copy.deepcopy(metric_config.train)\n        train_metric_cls = train_metric.pop('name')\n        train_metric = eval(\"metrics.{}\".format(train_metric_cls))(\n            **train_metric)\n\n    eval_metric = copy.deepcopy(metric_config.eval)\n    eval_metric_cls = eval_metric.pop('name')\n    eval_metric = eval(\"metrics.{}\".format(eval_metric_cls))(**eval_metric)\n\n    best_metric = 0.0\n\n    # build lr and optim\n    config.Optimizer.lr.update({\n        'epochs': config.Global.num_train_epochs,\n        'step_each_epoch': len(train_data_loader),\n        'total_steps': config.Global.max_steps,\n    })\n\n    if 'multi_precision' in config.Optimizer:\n        assert config.Optimizer.pop('multi_precision') \\\n            == config.Global.mix_precision.enable\n\n    lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr)\n    optimizer = cpn.build_optimizer(\n        config.Optimizer,\n       
 model,\n        lr_scheduler,\n        multi_precision=config.Global.mix_precision.enable)\n\n    # call fleet wrapper\n    if nranks > 1:\n        model, optimizer, scaler = strategy.wrap_with_fleet(\n            config.Distributed, model, optimizer, scaler)\n\n    # load pretrained checkpoints\n    load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1}\n    if config.Global.save_load.ckpt_dir is not None:\n        io.load(config.Global.save_load.ckpt_dir, model, optimizer, 'train',\n                load_recovery)\n\n    # build profiler\n    if config.get('Profiler', {}).get('enable', False):\n        profiler = cpn.build_profiler(config.Profiler)\n    else:\n        profiler = None\n\n    # start training\n    assert config.Global.get('run_mode',\n                             'epoch') == 'epoch', 'run_mode must be epoch'\n\n    train_start = log.get_timestamp()\n\n    if load_recovery['rng_state'] != -1:\n        paddle.set_cuda_rng_state(load_recovery['rng_state'])\n\n    for epoch_index in range(load_recovery['epoch'],\n                             config.Global.num_train_epochs):\n        train_epoch_start = log.get_timestamp()\n\n        # time count\n        train_losses = []\n        train_step_start = log.get_timestamp()\n\n        # Note(GuoxiaWang): Do not use len(train_data_loader()),\n        # it will cause a memory leak.\n        total_train_batch = len(train_data_loader)\n        total_eval_batch = len(\n            valid_data_loader) if valid_data_loader is not None else 0\n        for step, batch in enumerate(train_data_loader):\n            if epoch_index == load_recovery['epoch']:\n                if step <= load_recovery['step']:\n                    continue\n\n            model.train()\n            fit_kwargs = {\n                \"model\": model,\n                \"scaler\": scaler,\n                \"optimizer\": optimizer,\n                \"loss_fn\": train_loss_fn,\n            }\n\n            def forward_func(batch, model, 
loss_fn):\n                input_ids, labels = batch\n                input_ids.stop_gradient = True\n                labels.stop_gradient = True\n\n                logits = model(input_ids)\n                loss = loss_fn(logits, labels)\n\n                return loss\n\n            loss = impls.fit_impl(config, batch, forward_func, **fit_kwargs)\n            train_losses.append(loss)\n\n            # training step log\n            if (step + 1) % config.Global.logging_freq == 0:\n                train_step_cost = log.get_timestamp() - train_step_start\n                numpy_losses = [float(loss) for loss in train_losses]\n\n                train_cost = train_step_cost \\\n                    if step == 0 else train_step_cost / config.Global.logging_freq\n                speed = 1. / train_cost\n                default_global_tokens_num = config.Global.global_batch_size * \\\n                    config.Data.Train.dataset.max_length\n                ips_total = speed * default_global_tokens_num\n                ips = ips_total / env.get_data_world_size()\n\n                logger.info(\n                    \"[train] epoch: [%d/%d], step: [%d/%d], learning rate: %.7f, loss: %.9f, avg_batch_cost: \" \\\n                    \"%.5f sec, speed: %.2f step/s, ips_total: %.0f tokens/s, ips: %.0f tokens/s\"\n                    % (epoch_index, config.Global.num_train_epochs, step, total_train_batch, optimizer.get_lr(),\n                    sum(numpy_losses) / len(numpy_losses), train_cost, speed, ips_total, ips))\n\n                train_step_start = log.get_timestamp()\n                train_losses = []\n\n            if lr_scheduler is not None:\n                lr_scheduler.step()\n\n            optimizer.clear_grad()\n\n            # save model/optim states in 'step' mode\n            if step > 0 and config.Global.save_load.save_steps > 0 and \\\n                step % config.Global.save_load.save_steps == 0:\n                device.synchronize()\n                
io.save(\n                    config.Global.save_load.output_dir,\n                    model,\n                    optimizer,\n                    step=step,\n                    epoch=epoch_index,\n                    sharding_stage=config.Distributed.sharding.sharding_stage)\n\n            if profiler:\n                profiler.step()\n\n        # training epoch log\n        train_epoch_cost = log.get_timestamp() - train_epoch_start\n        logger.info(\"[Training] epoch: %d, total time: %.5f sec\" %\n                    (epoch_index, train_epoch_cost))\n\n        eval_epoch_start = log.get_timestamp()\n\n        # start eval in 'epoch' mode\n        eval_step_start = log.get_timestamp()\n        eval_losses = []\n        total_eval_batch = len(valid_data_loader)\n\n        for eval_step, batch in enumerate(valid_data_loader):\n            loss = impls.eval_impl(config, batch, model, eval_loss_fn,\n                                   eval_metric)\n\n            eval_losses.append(float(loss))\n\n            if eval_step % config.Global.logging_freq == 0:\n                eval_step_cost = log.get_timestamp() - eval_step_start\n\n                speed = 1. 
/ eval_step_cost\n                logger.info(\n                    \"[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s\"\n                    % (epoch_index, eval_step, sum(eval_losses) /\n                       len(eval_losses), eval_step_cost, speed))\n\n                eval_step_start = log.get_timestamp()\n                eval_losses = []\n\n        eval_epoch_cost = log.get_timestamp() - eval_epoch_start\n\n        # eval epoch log\n        res = eval_metric.accumulate()\n        eval_metric.reset()\n\n        if isinstance(eval_metric, metrics.AccuracyAndF1):\n            msg = \"acc: %.5f, precision: %.5f, recall: %.5f, f1: %.5f, acc and f1: %.5f\" % (\n                res[0], res[1], res[2], res[3], res[4])\n            metric = res[4]\n        elif isinstance(eval_metric, metrics.Mcc):\n            msg = \"mcc: %.5f\" % (res[0])\n            metric = res[0]\n        elif isinstance(eval_metric, metrics.PearsonAndSpearman):\n            msg = \"pearson: %.5f, spearman: %.5f, pearson and spearman: %.5f\" % (\n                res[0], res[1], res[2])\n            metric = res[2]\n        else:\n            msg = \"acc: %.5f\" % (res)\n            metric = res\n\n        if metric > best_metric:\n            best_metric = metric\n\n        logger.info(\n            \"[Eval] epoch: %d, total time: %.5f sec, %s, best_metric: %.5f\" %\n            (epoch_index, eval_epoch_cost, msg, best_metric))\n\n        # save model/optim states in 'epoch' mode\n        if config.Global.save_load.save_epoch > 0 and \\\n            epoch_index % config.Global.save_load.save_epoch == 0:\n            device.synchronize()\n            io.save(\n                config.Global.save_load.output_dir,\n                model,\n                optimizer,\n                step=len(train_data_loader),\n                epoch=epoch_index,\n                sharding_stage=config.Distributed.sharding.sharding_stage)\n\n    # training end log\n    
logger.info(\n        \"The training process is complete and total cost of time for training is : {}\".\n        format(\n            log.convert_timestamp_to_data(log.get_timestamp() - train_start)))\n\n    if profiler:\n        cpn.profiler_done(profiler, config.Profiler)\n"
  },
  {
    "path": "examples/transformer/models/GPT/finetune/run_task.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0\n\n# Single-Sentence Tasks\nif [ $1 == \"CoLA\" ]\nthen\n    python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=CoLA \\\n      -o Data.Train.dataset.root=./dataset/cola_public/ \\\n      -o Data.Eval.dataset.name=CoLA \\\n      -o Data.Eval.dataset.root=./dataset/cola_public/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.metric.train.name=Mcc \\\n      -o Model.metric.eval.name=Mcc \\\n      -o Model.num_classes=2\nelif [ $1 == \"SST2\" ]\nthen\n    python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=SST2 \\\n      -o Data.Train.dataset.root=./dataset/SST-2/ \\\n      -o Data.Eval.dataset.name=SST2 \\\n      -o Data.Eval.dataset.root=./dataset/SST-2/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=2\n# Similarity and Paraphrase Tasks\nelif [ $1 == \"MRPC\" ]\nthen\n    python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Global.num_train_epochs=5 \\\n      -o Data.Train.dataset.name=MRPC \\\n      -o Data.Train.dataset.root=./dataset/MRPC/ \\\n      -o Data.Eval.dataset.name=MRPC \\\n      -o Data.Eval.dataset.root=./dataset/MRPC/ \\\n      -o Data.Eval.dataset.split=test \\\n      
-o Model.num_classes=2 \\\n      -o Model.metric.train.name=AccuracyAndF1 \\\n      -o Model.metric.eval.name=AccuracyAndF1\nelif [ $1 == \"QQP\" ]\nthen\n    python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=QQP \\\n      -o Data.Train.dataset.root=./dataset/QQP/ \\\n      -o Data.Eval.dataset.name=QQP \\\n      -o Data.Eval.dataset.root=./dataset/QQP/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=2 \\\n      -o Model.metric.train.name=AccuracyAndF1 \\\n      -o Model.metric.eval.name=AccuracyAndF1\nelif [ $1 == \"STSB\" ]\nthen\n    python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=STSB \\\n      -o Data.Train.dataset.root=./dataset/STS-B/ \\\n      -o Data.Eval.dataset.name=STSB \\\n      -o Data.Eval.dataset.root=./dataset/STS-B/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=1 \\\n      -o Model.metric.train.name=PearsonAndSpearman \\\n      -o Model.metric.eval.name=PearsonAndSpearman \\\n      -o Model.loss.train.name=MSELoss \\\n      -o Model.loss.eval.name=MSELoss\n# Inference Tasks\nelif [ $1 == \"MNLI\" ]\nthen\n    python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=MNLI \\\n      -o Data.Train.dataset.root=./dataset/multinli_1.0 \\\n      -o Data.Eval.dataset.name=MNLI \\\n      -o Data.Eval.dataset.root=./dataset/multinli_1.0 \\\n      -o Data.Eval.dataset.split=${2:-\"dev_matched\"} \\\n      -o Model.num_classes=3\nelif [ $1 == \"QNLI\" ]\nthen\n    python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=QNLI \\\n      -o Data.Train.dataset.root=./dataset/QNLI/ \\\n      -o Data.Eval.dataset.name=QNLI \\\n      -o Data.Eval.dataset.root=./dataset/QNLI/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o 
Model.num_classes=2\nelif [ $1 == \"RTE\" ]\nthen\n    python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=RTE \\\n      -o Data.Train.dataset.root=./dataset/RTE/ \\\n      -o Data.Eval.dataset.name=RTE \\\n      -o Data.Eval.dataset.root=./dataset/RTE/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=2\nelif [ $1 == \"WNLI\" ]\nthen\n    python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Global.num_train_epochs=5 \\\n      -o Data.Train.dataset.name=WNLI \\\n      -o Data.Train.dataset.root=./dataset/WNLI/ \\\n      -o Data.Eval.dataset.name=WNLI \\\n      -o Data.Eval.dataset.root=./dataset/WNLI/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=2\nelse\n   echo \"Task name not recognized, please input CoLA, SST2, MRPC, QQP, STSB, MNLI, QNLI, RTE, WNLI.\"\nfi\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/configs/generation_gpt_345M_dp8.yaml",
    "content": "_base_: ./generation_gpt_base.yaml\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n\nDistributed:\n  dp_degree: \n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/configs/generation_gpt_345M_single_card.yaml",
    "content": "_base_: ./generation_gpt_base.yaml\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/configs/generation_gpt_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps:\n  logging_freq: 1\n  eval_freq: 500\n  eval_iters: 10\n  test_iters:\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 1000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  name: \"GPT\"\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n  fused_linear: False\n  fuse_attn_qkv: True\n  sequence_parallel: False\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/configs/generation_pruned_gpt_345M_single_card.yaml",
    "content": "_base_: ./generation_gpt_base.yaml\n\nCompress:\n  Prune:\n    enable: True\n    criterion: l1_norm\n    ratio: 0.125\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/configs/generation_qat_gpt_345M_single_card.yaml",
    "content": "_base_: ./generation_gpt_base.yaml\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n  use_topp_sampling: True\n  inference: True\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/configs/generation_qat_gpt_6.7B_single_card.yaml",
    "content": "_base_: ./generation_gpt_base.yaml\n\nModel:\n  vocab_size: 50304\n  hidden_size: 4096\n  num_layers: 32\n  num_attention_heads: 32\n  ffn_hidden_size: 16384\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n  use_topp_sampling: True\n  inference: True\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/configs/inference_gpt_345M_dp8.yaml",
    "content": "_base_: ./generation_gpt_345M_dp8.yaml\n\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n\n\nDistributed:\n  dp_degree: \n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/configs/inference_gpt_345M_single_card.yaml",
    "content": "_base_: ./generation_gpt_345M_single_card.yaml\n\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n\n\nDistributed:\n  dp_degree: \n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/export.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\nfrom paddle.static import InputSpec\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../')))\n\nfrom ppfleetx.distributed.apis import env, strategy, io\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.utils import device, log\nfrom ppfleetx.utils.export import export_inference_model\nfrom examples.transformer.utils import qat\nfrom examples.transformer.utils import config as cfg\nfrom examples.transformer.utils import components as cpn\n\nimport impls\n\nif __name__ == \"__main__\":\n    # parse config from yaml\n    args = cfg.parse_args()\n    config = cfg.get_config(args.config, overrides=args.override, show=False)\n\n    paddle.set_device(config.Global.device)\n\n    # init distributed env\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        env.init_dist_env(config)\n\n    env.set_seed(config.Global.seed)\n\n    cfg.process_configs(config)\n    cfg.print_config(config)\n\n    if config.Global.mix_precision.enable:\n        logger.info(\"NOTE: disable mix_precision in export mode\")\n\n    # build GPT model\n    model, _ = impls.build_model(config)\n\n    # export\n    model.eval()\n    input_spec = [\n        
InputSpec(\n            shape=[None, None], name=\"input_ids\", dtype='int64')\n    ]\n\n    output_dir = config.Global.save_load.output_dir\n    dp_rank = 0 if nranks == 1 else env.get_hcg().get_data_parallel_rank()\n    save_dir = os.path.join(output_dir, \"rank_{}\".format(dp_rank))\n\n    quanter = None\n    quant_mode = False\n\n    if 'Compress' in config:\n        mode = 'compress'\n        compress_configs = config['Compress']\n\n        if \"Quantization\" in compress_configs:\n            quant_mode = True\n\n        model, quanter = qat.compress_model(config, model, input_spec)\n\n    # load pretrained checkpoints\n    if config.Global.save_load.ckpt_dir is not None:\n        io.load(\n            config.Global.save_load.ckpt_dir,\n            model,\n            optimizer=None,\n            mode='export',\n            load_recovery=None)\n\n    if not quant_mode:\n        export_inference_model(model, input_spec, save_dir, 'model')\n    else:\n        logger.info(\"export quantized model.\")\n        export_inference_model(\n            model,\n            input_spec,\n            save_dir,\n            'model',\n            export_quant_model=True,\n            quanter=quanter)\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/impls.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\nimport numpy as np\n\nimport paddle\nimport paddle.distributed as dist\n\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.distributed.apis import env\nimport ppfleetx.models.language_model.gpt as gpt\nfrom ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer\nfrom examples.transformer.models.GPT.pretrain.impls import fit_impl as pretrain_fit_impl\n\nMODEL_CLASSES = {\n    \"GPT\": (GPTTokenizer, \"gpt2\"),\n    \"GPT-cn\": (GPTChineseTokenizer, \"gpt-cpm-large-cn\"),\n}\n\n\ndef adjust_length_to_model(length, max_sequence_length):\n    if length < 0 or length > max_sequence_length:\n        length = max_sequence_length\n    return length\n\n\ndef build_model(config):\n    nranks = dist.get_world_size()\n    generation_cfgs = config.Generation\n\n    model_setting = copy.deepcopy(config.Model)\n    if 'Compress' in config and 'Quantization' in config.Compress:\n        quant_setting = copy.deepcopy(config.Compress.Quantization)\n        skip_tensor_map = quant_setting.get('skip_tensor_map', {})\n        freeze_embedding = quant_setting.get('freeze_embedding', False)\n        model_setting['skip_tensor_map'] = skip_tensor_map\n        model_setting['freeze_embedding'] = freeze_embedding\n\n    model_name = model_setting.pop(\"name\")\n    tokenizer_class, pretrained_name = 
MODEL_CLASSES[model_name]\n    tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n    if nranks == 1:\n        model = gpt.GPTForGeneration(\n            gpt.GPTModel(**model_setting), generation_cfgs)\n    else:\n        assert nranks == config.Distributed.dp_degree, \\\n            \"only support single card and data parallel in generation task.\"\n        model = gpt.GPTForGenerationHybrid(\n            gpt.GPTModelHybrid(**model_setting), generation_cfgs)\n\n    generation_cfgs['max_dec_len'] = adjust_length_to_model(\n        generation_cfgs['max_dec_len'], 512)\n\n    generation_cfgs['bos_token_id'] = tokenizer.eos_token_id\n    generation_cfgs['eos_token_id'] = tokenizer.eos_token_id\n    generation_cfgs['pad_token_id'] = tokenizer.eos_token_id\n\n    return model, tokenizer\n\n\ndef left_padding(inputs, pad_id, padding=\"longest\"):\n    assert \"input_ids\" in inputs, \"input_ids should be in inputs!\"\n    max_length = 0\n    for ids in inputs[\"input_ids\"]:\n        max_length = max(max_length, len(ids))\n\n    def extend_max_lenth(value, max_length, to_pad_id):\n        return [to_pad_id] * (max_length - len(value)) + value\n\n    def extend_filed(name, max_length, to_pad_id):\n        values = inputs[name]\n        res = []\n        for index, value in enumerate(values):\n            res.append(extend_max_lenth(value, max_length, to_pad_id))\n        inputs[name] = res\n\n    extend_filed(\"input_ids\", max_length, pad_id)\n    if \"attention_mask\" in inputs:\n        extend_filed(\"attention_mask\", max_length, 0)\n    if \"position_ids\" in inputs:\n        extend_filed(\"position_ids\", max_length, 0)\n\n    return inputs\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/inference.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../')))\n\nfrom ppfleetx.distributed.apis import env, strategy, io\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.utils import device, log\nfrom ppfleetx.core.engine import InferenceEngine, TensorRTConfig\nfrom examples.transformer.utils import config as cfg\nfrom examples.transformer.utils import components as cpn\n\nimport impls\n\nif __name__ == \"__main__\":\n    # parse config from yaml\n    args = cfg.parse_args()\n    config = cfg.get_config(args.config, overrides=args.override, show=False)\n\n    paddle.set_device(config.Global.device)\n\n    # init distributed env\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        env.init_dist_env(config)\n\n    env.set_seed(config.Global.seed)\n    cfg.process_configs(config)\n\n    # build model\n    model, tokenizer = impls.build_model(config)\n    model.eval()\n\n    if 'Inference' in config:\n        inference_configs = config['Inference']\n        inference_engine = None\n    else:\n        raise RuntimeError(f'No Inference in config')\n\n    input_text = 'Hi, GPT2. 
Tell me who Jack Ma is.'\n    input_ids = [tokenizer.encode(input_text)]\n\n    if inference_engine is None:\n        # parse TensorRT config\n        tensorrt_config = None\n        if 'TensorRT' in inference_configs:\n            tensorrt_config = TensorRTConfig(**inference_configs['TensorRT'])\n\n        inference_engine = InferenceEngine(inference_configs['model_dir'],\n                                           inference_configs['mp_degree'],\n                                           tensorrt_config)\n\n    outs = inference_engine.predict([input_ids])\n\n    ids = list(outs.values())[0]\n    out_ids = [int(x) for x in ids[0]]\n    result = tokenizer.decode(out_ids)\n    result = input_text + result\n\n    print('Prompt:', input_text)\n    print('Generation:', result)\n"
  },
  {
    "path": "examples/transformer/models/GPT/generation/run.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\nfrom paddle.static import InputSpec\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../')))\n\nfrom ppfleetx.distributed.apis import env, strategy, io\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.utils import device, log\nfrom examples.transformer.utils import qat\nfrom examples.transformer.utils import config as cfg\nfrom examples.transformer.utils import components as cpn\n\nimport impls\n\nif __name__ == \"__main__\":\n    # parse config from yaml\n    args = cfg.parse_args()\n    config = cfg.get_config(args.config, overrides=args.override, show=False)\n\n    paddle.set_device(config.Global.device)\n\n    # init distributed env\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        env.init_dist_env(config)\n\n    env.set_seed(config.Global.seed)\n    cfg.process_configs(config)\n\n    # build model\n    model, tokenizer = impls.build_model(config)\n\n    if 'Compress' in config:\n        input_spec = [\n            InputSpec(\n                shape=[None, None], name=\"input_ids\", dtype='int64')\n        ]\n        model, quanter = qat.compress_model(config, model, input_spec)\n\n    model.eval()\n    
cfg.print_config(config)\n\n    # call fleet wrapper\n    if nranks > 1:\n        model, _, _ = strategy.wrap_with_fleet(\n            config.Distributed, model, optimizer=None, scaler=None)\n\n    # load pretrained checkpoints\n    if config.Global.save_load.ckpt_dir is not None:\n        io.load(\n            config.Global.save_load.ckpt_dir,\n            model,\n            optimizer=None,\n            mode='generation',\n            load_recovery=None)\n\n    # build profiler\n    if config.get('Profiler', {}).get('enable', False):\n        profiler = cpn.build_profiler(config.Profiler)\n    else:\n        profiler = None\n\n    input_text = 'Hi, GPT2. Tell me who Jack Ma is.'\n    input_ids = tokenizer.encode(input_text)\n    inputs = {'input_ids': [input_ids]}\n\n    inputs = impls.left_padding(inputs, tokenizer.eos_token_id)\n    input_ids = inputs['input_ids']\n\n    if len(input_ids) == 0:\n        input_ids = None\n    else:\n        # [1, seq_len]\n        input_ids = paddle.to_tensor(input_ids, dtype='int64')\n\n    ids, scores = model(input_ids=input_ids)\n\n    result = []\n    for i, generated_ids in enumerate(ids):\n        generated_ids = generated_ids.numpy().tolist()\n        # Decode text\n        text = tokenizer.convert_ids_to_string(generated_ids)\n        sequence = input_text + text\n        result.append(sequence)\n\n    print(f'Prompt: {input_text}')\n    print(f'Generation: {result[0]}')\n\n    if profiler:\n        cpn.profiler_done(profiler, config.Profiler)\n"
  },
  {
    "path": "examples/transformer/models/GPT/offline-eval/configs/eval_gpt_345M_single_card.yaml",
    "content": "_base_: ./eval_gpt_base.yaml\n\n\nOffline_Eval:\n  eval_path: ./wikitext-103/wiki.valid.tokens\n  cloze_eval: False\n  overlapping_eval: 32\n  batch_size: 8\n  max_seq_len: 1024\n  logging_freq: 10\n"
  },
  {
    "path": "examples/transformer/models/GPT/offline-eval/configs/eval_gpt_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps:\n  logging_freq: 1\n  eval_freq: 500\n  eval_iters: 10\n  test_iters:\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 1000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  name: \"GPT\"\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n  fused_linear: False\n  fuse_attn_qkv: True\n  sequence_parallel: False\n\n\nData:\n  Eval:\n    dataset:\n      name: LM_Eval_Dataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n      overlapping_eval: \n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\n\nOffline_Eval:\n  eval_path: ./wikitext-103/wiki.valid.tokens\n  cloze_eval: False\n  overlapping_eval: 32\n  batch_size: 8\n  max_seq_len: 1024\n  logging_freq: 10\n"
  },
  {
    "path": "examples/transformer/models/GPT/offline-eval/configs/eval_pruned_gpt_345M_single_card.yaml",
    "content": "_base_: ./eval_gpt_base.yaml\n\n\nModel:\n  hidden_dropout_prob: 0.0\n  attention_probs_dropout_prob: 0.0\n\n\nCompress:\n  Prune:\n    enable: True\n    criterion: l1_norm\n    ratio: 0.125\n\n\nOffline_Eval:\n  eval_path: ./lambada_test.jsonl\n  cloze_eval: True\n  overlapping_eval: 32\n  batch_size: 8\n  max_seq_len: 1024\n  logging_freq: 10\n"
  },
  {
    "path": "examples/transformer/models/GPT/offline-eval/configs/eval_qat_gpt_345M_single_card.yaml",
    "content": "_base_: ./eval_gpt_base.yaml\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n    skip_tensor_map: \n      block_3: ['linear2']\n      block_5: ['linear1']\n      block_6: ['linear2']\n      block_7: ['linear2']\n      block_10: ['linear2']\n      block_20: ['linear2']\n      block_21: ['linear2']\n\n\nOffline_Eval:\n  eval_path: ./wikitext-103/wiki.valid.tokens\n  cloze_eval: False\n  overlapping_eval: 32\n  batch_size: 8\n  max_seq_len: 1024\n  logging_freq: 10\n"
  },
  {
    "path": "examples/transformer/models/GPT/offline-eval/impls.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\nimport numpy as np\nimport json\nimport re\nimport math\n\nimport paddle\nimport paddle.distributed as dist\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.distributed.apis import env\nfrom ppfleetx.models.language_model import gpt\nfrom ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer\n\nMODEL_CLASSES = {\n    \"GPT\": (GPTTokenizer, \"gpt2\"),\n    \"GPT-cn\": (GPTChineseTokenizer, \"gpt-cpm-large-cn\"),\n}\n\n\ndef build_model(config):\n    nranks = dist.get_world_size()\n    model_setting = copy.deepcopy(config.Model)\n\n    if 'Compress' in config and 'Quantization' in config.Compress:\n        quant_setting = copy.deepcopy(config.Compress.Quantization)\n        model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map',\n                                                             {})\n        model_setting['freeze_embedding'] = quant_setting.get(\n            'freeze_embedding', False)\n\n    model_name = model_setting.pop(\"name\")\n    tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]\n    tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n    if nranks == 1:\n        model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting))\n    else:\n        raise RuntimeError(\n            \"Only single-card offline eval is supported in 
GPTModel now.\")\n\n    return model, tokenizer\n\n\n@paddle.no_grad()\ndef eval_impl(config, batch, model):\n    model.eval()\n\n    use_fp16 = config.Global.mix_precision.enable\n    black_list = config.Global.mix_precision.custom_black_list\n    white_list = config.Global.mix_precision.custom_white_list\n\n    with paddle.amp.auto_cast(\n            use_fp16,\n            custom_black_list=black_list,\n            custom_white_list=white_list,\n            level='O2'):\n\n        tokens, loss_mask, attention_mask, position_ids, labels = batch\n        preds = model(tokens, position_ids, attention_mask)\n\n        if not config.Offline_Eval.cloze_eval:\n            masked_lm_loss = paddle.nn.functional.cross_entropy(\n                preds, labels, reduction=\"none\")\n            loss = paddle.sum(masked_lm_loss * loss_mask)\n\n            return loss\n        else:\n            outputs = paddle.argmax(preds, -1)\n            acc = paddle.cast(outputs == labels, 'float32')\n            acc = paddle.where(\n                paddle.cast(loss_mask, 'bool'), acc, paddle.ones_like(acc))\n            acc = paddle.sum(paddle.prod(acc, -1))\n\n            return acc\n\n\nclass LM_Eval_Dataset(paddle.io.Dataset):\n    def __init__(self,\n                 tokens,\n                 max_seq_len,\n                 eos_token_id,\n                 overlapping_eval=None,\n                 **kwargs):\n        self.tokens = tokens\n        self.seq_len = max_seq_len\n        self.pad_idx = eos_token_id\n        self.overlapping_eval = overlapping_eval\n        if self.overlapping_eval is None:\n            self.overlapping_eval = self.seq_len\n        self.overlapping_eval = max(1, self.overlapping_eval)\n\n        self.total_targets = len(self.tokens) - 1\n        # remove first sequence tokens\n        targets = max(self.total_targets - self.overlapping_eval, 0)\n        self.total_sequences = max(\n            math.ceil(targets / self.overlapping_eval) + 1, 1)\n\n    def 
__len__(self):\n        return self.total_sequences\n\n    def _construct_sample(self, tokens):\n        tokens = np.array(tokens).astype(\"int64\").tolist()\n        labels = tokens[1:]\n        tokens = tokens[:-1]\n        seq_length = len(tokens)\n        # attention mask for the attention calulate\n        attention_mask = np.tri(seq_length, seq_length).reshape(\n            (1, seq_length, seq_length))\n\n        # the pad and eos tokens do not contribute the loss\n        loss_mask = np.ones(seq_length, dtype=\"float32\")\n        loss_mask[np.where(np.array(tokens) == self.pad_idx)] = 0.0\n        position_ids = np.arange(0, seq_length, dtype=\"int64\")\n\n        # -INF mask value as default\n        # attention_mask = (attention_mask - 1.0) * 1e9\n        # Bool mask of attention\n        attention_mask = attention_mask.astype(\"float32\")\n        return [tokens, loss_mask, attention_mask, position_ids, labels]\n\n    def __getitem__(self, idx):\n        start_idx = idx * self.overlapping_eval\n        end_idx = start_idx + self.seq_len\n        tokens = self.tokens[start_idx:end_idx + 1]\n        num_tokens = len(tokens)\n        if num_tokens < self.seq_len + 1:\n            num_pad = (self.seq_len + 1 - num_tokens)\n            tokens += [self.pad_idx] * num_pad\n        [tokens, loss_mask, attention_mask, position_ids,\n         labels] = self._construct_sample(tokens)\n        if self.overlapping_eval != self.seq_len and idx != 0:\n            loss_mask[:-self.overlapping_eval] *= 0\n\n        return [tokens, loss_mask, attention_mask, position_ids, labels]\n\n\nclass Lambada_Eval_Dataset(paddle.io.Dataset):\n    def __init__(self, tokens, labels, max_seq_len, eos_token_id, **kwargs):\n        self.pad_idx = eos_token_id\n        self.seq_len = max_seq_len\n        self.tokens = tokens\n        self.labels = labels\n\n    def __len__(self):\n        return len(self.tokens)\n\n    def _construct_sample(self, tokens):\n        tokens = 
np.array(tokens).astype(\"int64\").tolist()\n        labels = tokens[1:]\n        tokens = tokens[:-1]\n\n        seq_length = len(tokens)\n        # attention mask for the attention calulate\n        attention_mask = np.tri(seq_length, seq_length).reshape(\n            (1, seq_length, seq_length))\n\n        # the pad and eos tokens do not contribute the loss\n        position_ids = np.arange(0, seq_length, dtype=\"int64\")\n\n        # -INF mask value as default\n        #attention_mask = (attention_mask - 1.0) * 1e9\n        # Bool mask of attention\n        attention_mask = attention_mask.astype(\"float32\")\n        return [tokens, attention_mask, position_ids, labels]\n\n    def __getitem__(self, idx):\n        tokens = self.tokens[idx][:self.seq_len]\n        labels = self.labels[idx]\n        tokens = tokens + labels\n        num_tokens = len(tokens)\n        if num_tokens < self.seq_len + 1:\n            num_pad = (self.seq_len + 1 - num_tokens)\n            tokens += [self.pad_idx] * num_pad\n        loss_mask = np.zeros(self.seq_len, dtype=\"float32\")\n        loss_mask[num_tokens - len(labels) - 1:num_tokens - 1] = 1.\n        [tokens, attention_mask, position_ids,\n         labels] = self._construct_sample(tokens)\n        return [tokens, loss_mask, attention_mask, position_ids, labels]\n\n\ndef wikitext_detokenizer(string):\n    # contractions\n    string = string.replace(\"s '\", \"s'\")\n    string = re.sub(r\"/' [0-9]/\", r\"/'[0-9]/\", string)\n\n    # number separators\n    string = string.replace(\" @-@ \", \"-\")\n    string = string.replace(\" @,@ \", \",\")\n    string = string.replace(\" @.@ \", \".\")\n\n    # punctuation\n    string = string.replace(\" : \", \": \")\n    string = string.replace(\" ; \", \"; \")\n    string = string.replace(\" . \", \". \")\n    string = string.replace(\" ! \", \"! \")\n    string = string.replace(\" ? \", \"? 
\")\n    string = string.replace(\" , \", \", \")\n\n    # double brackets\n    string = re.sub(r\"\\(\\s*([^\\)]*?)\\s*\\)\", r\"(\\1)\", string)\n    string = re.sub(r\"\\[\\s*([^\\]]*?)\\s*\\]\", r\"[\\1]\", string)\n    string = re.sub(r\"{\\s*([^}]*?)\\s*}\", r\"{\\1}\", string)\n    string = re.sub(r\"\\\"\\s*([^\\\"]*?)\\s*\\\"\", r'\"\\1\"', string)\n    string = re.sub(r\"'\\s*([^']*?)\\s*'\", r\"'\\1'\", string)\n\n    # miscellaneous\n    string = string.replace(\"= = = =\", \"====\")\n    string = string.replace(\"= = =\", \"===\")\n    string = string.replace(\"= =\", \"==\")\n    string = string.replace(\" \" + chr(176) + \" \", chr(176))\n    string = string.replace(\" \\n\", \"\\n\")\n    string = string.replace(\"\\n \", \"\\n\")\n    string = string.replace(\" N \", \" 1 \")\n    string = string.replace(\" 's\", \"'s\")\n\n    return string\n\n\ndef get_tokens(tokenizer, text, strict=True):\n    if not strict:\n        tokens = tokenizer.encode(text)\n        return tokens[:-1], [tokens[-1]]\n    last_token = text.split()[-1]\n    start_idx = text.rfind(last_token)\n    beginning_tokens = tokenizer.encode(text[:start_idx].strip())\n    last_token = tokenizer.encode(' ' + last_token)\n    return beginning_tokens, last_token\n"
  },
  {
    "path": "examples/transformer/models/GPT/offline-eval/run.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\nimport json\nimport math\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\nfrom paddle.static import InputSpec\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../')))\n\nfrom ppfleetx.distributed.apis import env, strategy, io\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.utils import device, log\nfrom ppfleetx.models.language_model import gpt\nfrom examples.transformer.utils import qat\nfrom examples.transformer.utils import config as cfg\nfrom examples.transformer.utils import components as cpn\nimport impls\n\nif __name__ == \"__main__\":\n    # parse config from yaml\n    args = cfg.parse_args()\n    config = cfg.get_config(args.config, overrides=args.override, show=False)\n\n    paddle.set_device(config.Global.device)\n\n    # init distributed env\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        env.init_dist_env(config)\n\n    env.set_seed(config.Global.seed)\n\n    # process configs\n    eval_cfgs = config.Offline_Eval\n    config.Data.Eval.pop(\"sampler\", None)\n    config.Data.Eval.loader.collate_fn = \"gpt_collate_fn\"\n    config.Data.Eval.loader.batch_size = eval_cfgs.batch_size\n    config.Data.Eval.dataset.input_dir = eval_cfgs.eval_path\n    
config.Data.Eval.dataset.max_seq_len = eval_cfgs.max_seq_len\n    config.Global.logging_freq = eval_cfgs.logging_freq\n\n    if not eval_cfgs.cloze_eval:\n        config.Data.Eval.dataset.name = \"LM_Eval_Dataset\"\n        config.Data.Eval.dataset.overlapping_eval = eval_cfgs.overlapping_eval\n    else:\n        config.Data.Eval.dataset.name = \"Lambada_Eval_Dataset\"\n\n    cfg.print_config(config)\n\n    # build GPT model\n    model, tokenizer = impls.build_model(config)\n\n    if 'Compress' in config:\n        input_spec = [\n            InputSpec(\n                shape=[None, None], name=\"tokens\", dtype='int64'), InputSpec(\n                    shape=[None, None], name=\"ids\", dtype='int64')\n        ]\n        model, quanter = qat.compress_model(config, model, input_spec)\n\n    if config.Global.mix_precision.enable:\n        scaler = paddle.amp.GradScaler(\n            init_loss_scaling=config.Global.mix_precision.scale_loss)\n        # Note: Save dtype is the same as model dtype. 
Also can set save_dtype='float32' when \n        # training with pure fp16 strategy, but will cause the rise of memory.\n        model = paddle.amp.decorate(models=model, level='O2')\n    else:\n        scaler = None\n\n    # load pretrained checkpoints\n    load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1}\n    if config.Global.save_load.ckpt_dir is not None:\n        io.load(\n            config.Global.save_load.ckpt_dir,\n            model,\n            optimizer=None,\n            mode='eval',\n            load_recovery=load_recovery)\n\n    # build dataset for eval\n    if not eval_cfgs.cloze_eval:\n        with open(eval_cfgs.eval_path, \"rb\") as reader:\n            entire_data = reader.read().decode('utf-8')\n\n        num_original_tokens = len(entire_data.strip().split(\" \"))\n        entire_data = impls.wikitext_detokenizer(entire_data)\n        tokenized_data = tokenizer.encode(entire_data)\n        num_tokenized_tokens = len(tokenized_data)\n        print('Original Tokens: %d, Detokenized tokens: %d' %\n              (num_original_tokens, num_tokenized_tokens))\n\n        dataset = impls.LM_Eval_Dataset(\n            tokens=tokenized_data,\n            max_seq_len=eval_cfgs.max_seq_len,\n            overlapping_eval=eval_cfgs.overlapping_eval,\n            eos_token_id=tokenizer.eos_token_id)\n    else:\n        tokenized_data = []\n        tokenized_label = []\n\n        with open(eval_cfgs.eval_path, 'r') as f:\n            for line in f.readlines():\n                text = json.loads(line)['text']\n                tokens, labels = impls.get_tokens(tokenizer, text)\n                tokenized_data.append(tokens)\n                tokenized_label.append(labels)\n\n        dataset = impls.Lambada_Eval_Dataset(\n            tokens=tokenized_data,\n            labels=tokenized_label,\n            max_seq_len=eval_cfgs.max_seq_len,\n            eos_token_id=tokenizer.eos_token_id)\n\n        num_examples = len(dataset)\n\n    # build dataloader for 
eval\n    valid_data_loader = cpn.build_dataloader(\n        config.Data.Eval.loader, dataset, batch_sampler=None)\n\n    # build profiler\n    if config.get('Profiler', {}).get('enable', False):\n        profiler = cpn.build_profiler(config.Profiler)\n    else:\n        profiler = None\n\n    # start eval\n    model.eval()\n    total_score = 0\n    score_name = \"loss\" if not eval_cfgs.cloze_eval else \"number correct\"\n    eval_start = log.get_timestamp()\n\n    if load_recovery['rng_state'] != -1:\n        paddle.set_cuda_rng_state(load_recovery['rng_state'])\n\n    for epoch_index in range(config.Global.num_train_epochs):\n        eval_epoch_start = log.get_timestamp()\n\n        eval_step_start = log.get_timestamp()\n        eval_losses = []\n        total_eval_batch = len(valid_data_loader)\n\n        for eval_step, batch in enumerate(valid_data_loader):\n            loss = impls.eval_impl(config, batch, model)\n            eval_losses.append(float(loss))\n\n            if eval_step > 0 and eval_step % config.Global.logging_freq == 0:\n                eval_step_cost = log.get_timestamp() - eval_step_start\n                speed = config.Global.logging_freq / eval_step_cost\n                eval_loss = sum(eval_losses) / len(eval_losses)\n\n                if not eval_cfgs.cloze_eval:\n                    total_score += eval_loss * config.Global.logging_freq / (\n                        num_tokenized_tokens - 1)\n                else:\n                    total_score += eval_loss * config.Global.logging_freq\n\n                logger.info(\n                    \"[eval] epoch: %d, batch: %d, %s: %.9f, speed: %.2f step/s\"\n                    % (epoch_index, eval_step, score_name, total_score, speed))\n\n                eval_step_start = log.get_timestamp()\n                eval_losses = []\n\n            if eval_step >= config.Global.max_steps:\n                break\n\n        eval_epoch_cost = log.get_timestamp() - eval_epoch_start\n        logger.info(\n  
          \"[eval] epoch {} : evaluting process is complete and cost {}\".\n            format(epoch_index, log.convert_timestamp_to_data(\n                eval_epoch_cost)))\n\n        string = '[eval] epoch {} : validation results on {} | '.format(\n            epoch_index, eval_cfgs.eval_path)\n\n        if not eval_cfgs.cloze_eval:\n            total_loss = float(total_score)\n            ppl = math.exp(min(20, total_loss))\n            token_ratio = (num_tokenized_tokens - 1) / (\n                num_original_tokens - 1)\n            adjusted_ppl = math.exp(min(20, total_loss * token_ratio))\n\n            string += 'avg loss: {:.4E} | '.format(total_loss)\n            string += 'ppl: {:.4E} | '.format(ppl)\n            string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)\n            string += 'token ratio: {} |'.format(token_ratio)\n        else:\n            num_correct = float(total_score)\n            acc = float(num_correct / num_examples)\n\n            string += 'number correct: {:.4E} | '.format(num_correct)\n            string += 'total examples: {:.4E} | '.format(num_examples)\n            string += 'avg accuracy: {:.4E}'.format(acc)\n\n        logger.info(string)\n\n    # evaluting end log\n    logger.info(\n        \"The evaluting process is complete and total cost of time for evaluting is : {}\".\n        format(\n            log.convert_timestamp_to_data(log.get_timestamp() - eval_start)))\n\n    del valid_data_loader\n\n    if profiler:\n        cpn.profiler_done(profiler, config.Profiler)\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/export_qat_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  fused_linear: True\n  \n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_1.3B_dp8.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 2048\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: \n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 8\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_1.3B_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 2048\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: \n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_175B_mp8_pp16.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 1536\n  micro_batch_size: 1\n\n\nModel:\n  vocab_size: 51200\n  hidden_size: 12288\n  num_layers: 96\n  num_attention_heads: 96\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity: 'core_attn'\n  no_recompute_layers:\n  virtual_pp_degree: 1\n  sequence_parallel: True\n  fused_linear: True\n  \n\nDistributed:\n  dp_degree:\n  mp_degree: 8\n  pp_degree: 16\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_6.7B_sharding16.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n  logging_freq: 10\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 4096\n  num_layers: 32\n  num_attention_heads: 32\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  fused_linear: True\n\n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 16\n    sharding_stage: 2\n    sharding_offload: False\n    reduce_overlap: True\n    broadcast_overlap: True\n\n\nOptimizer:\n  tensor_fusion: True\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps:\n  logging_freq: 1\n  eval_freq: 500\n  eval_iters: 10\n  test_iters:\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"O2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 1000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  name: \"GPT\"\n  fused_linear: False\n  fuse_attn_qkv: True\n  scale_qk_by_layer_num: True\n  sequence_parallel: False\n  no_recompute_layers:\n  vocab_size_divisible_unit: 128\n  fused_softmax_with_triangular: True\n\n\nData:\n  Train:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [969, 30, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n  \n  Eval:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [969, 30, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n\n\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 360000\n    warmup_rate: 0.01\n    max_lr: 5.0e-5\n    min_lr: 1.0e-5\n    use_increments: True\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n  tensor_fusion: False\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\nDistributed:\n  
fuse_sequence_parallel_allreduce: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_cn_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  name: \"GPT-cn\"\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/prune_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n  save_load:\n    save_steps: 1000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.0\n  attention_probs_dropout_prob: 0.0\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    comm_overlap: False\n\n\nOptimizer:\n  weight_decay: 0.0\n  lr:\n    decay_steps: 90000\n    warmup_rate: 0.00\n    max_lr: 2.5e-5\n    min_lr: 5.0e-6\n    \n\nCompress:\n  pretrained:\n  Prune:\n    enable: True\n    criterion: l1_norm\n    ratio: 0.125\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/qat_gpt_345M_mp8.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 1\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  fused_linear: True\n  \n\nDistributed:\n  dp_degree:\n  mp_degree: 8\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n    freeze_embedding: True\n    skip_tensor_map: \n      block_3: ['linear2']\n      block_5: ['linear1']\n      block_6: ['linear2']\n      block_7: ['linear2']\n      block_10: ['linear2']\n      block_20: ['linear2']\n      block_21: ['linear2']\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/qat_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  fused_linear: True\n  \n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n    freeze_embedding: True\n    skip_tensor_map: \n      block_3: ['linear2']\n      block_5: ['linear1']\n      block_6: ['linear2']\n      block_7: ['linear2']\n      block_10: ['linear2']\n      block_20: ['linear2']\n      block_21: ['linear2']\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/configs/qat_gpt_6.7B_sharding16.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n  logging_freq: 10\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 4096\n  num_layers: 32\n  num_attention_heads: 32\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  fused_linear: True\n\n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 16\n    sharding_stage: 2\n    sharding_offload: False\n    reduce_overlap: True\n    broadcast_overlap: True\n\n\nOptimizer:\n  tensor_fusion: True\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/export.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\nfrom paddle.static import InputSpec\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../')))\n\nfrom ppfleetx.distributed.apis import env, strategy, io\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.utils import device, log\nfrom ppfleetx.utils.export import export_inference_model\nfrom examples.transformer.utils import qat\nfrom examples.transformer.utils import config as cfg\nfrom examples.transformer.utils import components as cpn\n\nimport impls\n\nif __name__ == \"__main__\":\n    # parse config from yaml\n    args = cfg.parse_args()\n    config = cfg.get_config(args.config, overrides=args.override, show=False)\n\n    paddle.set_device(config.Global.device)\n\n    # init distributed env\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        env.init_dist_env(config)\n\n    env.set_seed(config.Global.seed)\n\n    cfg.process_configs(config)\n    cfg.print_config(config)\n\n    if config.Global.mix_precision.enable:\n        logger.info(\"NOTE: disable mix_precision in export mode\")\n\n    # build GPT model\n    model, _, _ = impls.build_model(config)\n\n    # export\n    model.eval()\n    input_spec = [\n       
 InputSpec(\n            shape=[None, None], name=\"tokens\", dtype='int64'), InputSpec(\n                shape=[None, None], name=\"ids\", dtype='int64')\n    ]\n\n    output_dir = config.Global.save_load.output_dir\n    dp_rank = 0 if nranks == 1 else env.get_hcg().get_data_parallel_rank()\n    save_dir = os.path.join(output_dir, \"rank_{}\".format(dp_rank))\n\n    quanter = None\n    quant_mode = False\n\n    if 'Compress' in config:\n        mode = 'compress'\n        compress_configs = config['Compress']\n\n        if \"Quantization\" in compress_configs:\n            quant_mode = True\n\n        model, quanter = qat.compress_model(config, model, input_spec)\n\n    # load pretrained checkpoints\n    if config.Global.save_load.ckpt_dir is not None:\n        io.load(\n            config.Global.save_load.ckpt_dir,\n            model,\n            optimizer=None,\n            mode='export',\n            load_recovery=None)\n\n    if not quant_mode:\n        export_inference_model(model, input_spec, save_dir, 'model')\n    else:\n        logger.info(\"export quantized model.\")\n        export_inference_model(\n            model,\n            input_spec,\n            save_dir,\n            'model',\n            export_quant_model=True,\n            quanter=quanter)\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/impls.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\n\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.optimizer.lr import LRScheduler\nfrom paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../')))\n\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.distributed.apis import env, amp\nimport ppfleetx.models.language_model.gpt as gpt\nfrom ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters\nfrom ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer\nfrom ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import register_sequence_parallel_allreduce_hooks\n\nMODEL_CLASSES = {\n    \"GPT\": (GPTTokenizer, \"gpt2\"),\n    \"GPT-cn\": (GPTChineseTokenizer, \"gpt-cpm-large-cn\"),\n}\n\n\ndef _get_model_size(l, h, v, s):\n    P = 0\n    # embedding\n    P += (v + s) * h\n    # attention\n    P += (4 * h * h + 4 * h) * l\n    # layer_norm of decoder\n    P += (2 * (2 * h)) * l\n    # FFN Layer\n    P += (8 * h * h + 5 * h) * l\n    # layer_norm of transformer\n    P += 2 * h\n    logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0))\n\n\ndef _vocab_size_with_padding(vocab_size, div_unit, mp_degree):\n    padded_size = 
vocab_size\n    multiple = div_unit * mp_degree\n    while (padded_size % multiple) != 0:\n        padded_size += 1\n    logger.warning(' > padded vocab (size: {}) with {} dummy tokens '\n                   '(new size: {})'.format(vocab_size, padded_size -\n                                           vocab_size, padded_size))\n    return padded_size\n\n\ndef build_model(config):\n    nranks = dist.get_world_size()\n    model_setting = copy.deepcopy(config.Model)\n\n    if 'Compress' in config and 'Quantization' in config.Compress:\n        quant_setting = copy.deepcopy(config.Compress.Quantization)\n        model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map',\n                                                             {})\n        model_setting['freeze_embedding'] = quant_setting.get(\n            'freeze_embedding', False)\n\n    model_name = model_setting.pop(\"name\")\n    tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]\n    tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n    model_setting['vocab_size'] = _vocab_size_with_padding(\n        model_setting.get('vocab_size', tokenizer.vocab_size),\n        model_setting.pop('vocab_size_divisible_unit', 128),\n        config.Distributed.get('mp_degree', 1))\n\n    l = model_setting['num_layers']\n    h = model_setting['hidden_size']\n    v = model_setting['vocab_size']\n    s = config.Data.Train.dataset.max_seq_len\n    _get_model_size(l, h, v, s)\n\n    if nranks == 1:\n        model_setting.pop(\"sequence_parallel\")\n        model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting))\n    else:\n        model_setting['num_partitions'] = config.Distributed.mp_degree\n        if config.Distributed.pp_degree == 1:\n            model_setting.pop(\"virtual_pp_degree\", None)\n            model = gpt.GPTForPretrainingHybrid(\n                gpt.GPTModelHybrid(**model_setting))\n        else:\n            model = gpt.GPTForPretrainingPipe(**model_setting)\n\n    if 
config.Model.sequence_parallel:\n        register_sequence_parallel_allreduce_hooks(\n            model, config.Global.accumulate_steps,\n            config.Distributed.fuse_sequence_parallel_allreduce)\n\n    if nranks == 1:\n        loss_fn = gpt.GPTPretrainingCriterion()\n    else:\n        loss_fn = gpt.GPTPretrainingCriterionHybird(\n            sequence_parallel=config.Model.sequence_parallel)\n\n    return model, tokenizer, loss_fn\n\n\ndef model_forward_backward(config, batch, forward_func, **kwargs):\n    acc_steps = config.Global.accumulate_steps\n    amp_enable = config.Global.mix_precision.enable\n    amp_dtype = config.Global.mix_precision.dtype\n    amp_level = config.Global.mix_precision.level\n    black_list = config.Global.mix_precision.custom_black_list\n    white_list = config.Global.mix_precision.custom_white_list\n\n    # train with pipeline strategy\n    if config.Distributed.pp_degree > 1:\n        tokens, position_ids, labels, loss_mask = batch\n        batch = [(tokens, position_ids), (labels, loss_mask)]\n\n        batches = [batch]\n\n        with paddle.amp.auto_cast(\n                amp_enable,\n                custom_black_list=black_list,\n                custom_white_list=white_list,\n                dtype=amp_dtype,\n                level=amp_level):\n\n            batch = kwargs['model']._prepare_training(\n                batch, kwargs['optimizer'], None)\n            loss = kwargs['model'].forward_backward_pipeline(batch,\n                                                             kwargs['scaler'])\n\n        return loss\n\n    # train with non-pipeline strategy\n    if acc_steps == 1:\n        batches = [batch]\n    else:\n        split_batches = [paddle.split(b, acc_steps) for b in batch]\n        batches = []\n        for i in range(len(split_batches[0])):\n            micro_batch = [split_batch[i] for split_batch in split_batches]\n            batches.append(micro_batch)\n\n    # gradient merge strategy\n    final_loss = 
None\n    for micro_batch in batches:\n        with paddle.amp.auto_cast(\n                amp_enable,\n                custom_black_list=black_list,\n                custom_white_list=white_list,\n                dtype=amp_dtype,\n                level=amp_level):\n\n            # forward in training step\n            loss = forward_func(micro_batch, kwargs['model'],\n                                kwargs['loss_fn'])\n\n        loss_bw = kwargs['scaler'].scale(\n            loss) if amp_enable and amp_dtype == \"float16\" else loss\n        loss_bw = loss_bw / acc_steps if acc_steps > 1 else loss_bw\n        loss_bw.backward()\n\n        detach_loss = loss.detach()\n        if final_loss is None:\n            final_loss = detach_loss\n        else:\n            final_loss = paddle.add(final_loss, detach_loss)\n\n    final_loss = final_loss / acc_steps if acc_steps > 1 else final_loss\n\n    return final_loss\n\n\ndef optim_update_params(config, **kwargs):\n    hcg = env.get_hcg()\n    amp_enable = config.Global.mix_precision.enable\n    amp_dtype = config.Global.mix_precision.dtype\n\n    dp_degree = config.Distributed.dp_degree\n    sharding_stage = config.Distributed.sharding.sharding_stage\n\n    if config.Model.use_recompute and isinstance(kwargs['model'],\n                                                 paddle.DataParallel):\n        if not hasattr(kwargs['optimizer'], \"all_fused_tensors\") or kwargs[\n                'optimizer'].all_fused_tensors is None:\n            fused_allreduce_gradients(list(kwargs['model'].parameters()), None)\n        else:\n            dp_group = hcg.get_data_parallel_group()\n            all_reduce_parameters(kwargs['optimizer'].all_fused_tensors,\n                                  dp_group)\n    elif isinstance(kwargs['model'], amp.MixPrecisionLayer) \\\n        and dist.get_world_size() > 1 and dist.get_world_size() == dp_degree:\n        fused_allreduce_gradients(list(kwargs['model'].parameters()), None)\n\n    if 
sharding_stage == 3 and dp_degree > 1:\n        dp_group = hcg.get_data_parallel_group()\n        fused_allreduce_gradients(kwargs['model'].parameters(), hcg)\n\n        for p in kwargs['model'].parameters():\n            if hasattr(p, \"bw_storage\"):\n                assert p.grad is None, \"This case shouldn't happen.\"\n                p.bw_storage.scale_(1.0 / dp_group.nranks)\n                dist.all_reduce(p.bw_storage, group=dp_group)\n\n    if amp_enable and amp_dtype == 'float16':\n        kwargs['scaler'].step(kwargs['optimizer'])\n        kwargs['scaler'].update()\n    else:\n        kwargs['optimizer'].step()\n\n\ndef fit_impl(config, batch, forward_func, **kwargs):\n    kwargs['model'].train()\n\n    if config.Distributed.pp_degree == 1:\n        if config.Model.use_recompute and isinstance(kwargs['model'],\n                                                     paddle.DataParallel):\n            with kwargs['model'].no_sync():\n                loss = model_forward_backward(config, batch, forward_func,\n                                              **kwargs)\n        else:\n            loss = model_forward_backward(config, batch, forward_func,\n                                          **kwargs)\n    else:\n        loss = model_forward_backward(config, batch, forward_func, **kwargs)\n\n    optim_update_params(config, **kwargs)\n\n    return loss\n\n\n@paddle.no_grad()\ndef eval_impl(config, batch, model, loss_fn):\n    model.eval()\n\n    amp_enable = config.Global.mix_precision.enable\n    amp_dtype = config.Global.mix_precision.dtype\n    amp_level = config.Global.mix_precision.level\n    black_list = config.Global.mix_precision.custom_black_list\n    white_list = config.Global.mix_precision.custom_white_list\n\n    with paddle.amp.auto_cast(\n            amp_enable,\n            custom_black_list=black_list,\n            custom_white_list=white_list,\n            dtype=amp_dtype,\n            level=amp_level):\n        tokens, position_ids, labels, 
loss_mask = batch\n\n        if config.Distributed.pp_degree == 1:\n            tokens, position_ids, labels, loss_mask = batch\n            preds = model(tokens, position_ids)\n            preds = paddle.cast(preds, dtype=\"float32\")\n            loss = loss_fn(preds, labels, loss_mask)\n        else:\n            batch = [(tokens, position_ids), (labels, loss_mask)]\n            loss = model.eval_batch(batch, compute_loss=True)\n\n    return loss\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain/run.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\nfrom paddle.static import InputSpec\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../')))\n\nfrom ppfleetx.distributed.apis import env, strategy, io, amp\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.utils import device, log\nfrom examples.transformer.utils import qat\nfrom examples.transformer.utils import config as cfg\nfrom examples.transformer.utils import components as cpn\n\nimport impls\n\nif __name__ == \"__main__\":\n    # parse config from yaml\n    args = cfg.parse_args()\n    config = cfg.get_config(args.config, overrides=args.override, show=False)\n\n    paddle.set_device(config.Global.device)\n\n    # init distributed env\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        env.init_dist_env(config)\n\n    env.set_seed(config.Global.seed)\n\n    cfg.process_configs(config)\n    cfg.print_config(config)\n\n    # Note: Only for GPTDataset\n    dataset_kwargs = {\n        \"seed\": config.Global.seed,\n        \"model_type\": config.Model.name,\n    }\n    sampler_kwargs = {\"batch_size\": config.Global.local_batch_size, }\n\n    # build dataloader for training/eval\n    
dataset_kwargs.update({\"mode\": \"Train\"})\n    dataset = cpn.build_dataset(config.Data.Train.dataset, **dataset_kwargs)\n    sampler = cpn.build_batch_sampler(config.Data.Train.sampler, dataset,\n                                      **sampler_kwargs)\n    train_data_loader = cpn.build_dataloader(config.Data.Train.loader, dataset,\n                                             sampler)\n\n    dataset_kwargs.update({\"mode\": \"Eval\"})\n    dataset = cpn.build_dataset(config.Data.Eval.dataset, **dataset_kwargs)\n    sampler = cpn.build_batch_sampler(config.Data.Eval.sampler, dataset,\n                                      **sampler_kwargs)\n    valid_data_loader = cpn.build_dataloader(config.Data.Eval.loader, dataset,\n                                             sampler)\n\n    # build GPT model\n    model, tokenizer, loss_fn = impls.build_model(config)\n\n    if 'Compress' in config:\n        input_spec = [\n            InputSpec(\n                shape=[None, None], name=\"tokens\", dtype='int64'), InputSpec(\n                    shape=[None, None], name=\"ids\", dtype='int64')\n        ]\n        model, quanter = qat.compress_model(config, model, input_spec)\n\n    amp_config = config.Global.mix_precision\n    amp_enable = amp_config['enable']\n    amp_dtype = amp_config.get('dtype', 'float16')\n    amp_level = amp_config.get('level', 'O2')\n    amp_use_main_grad = amp_config.get('use_main_grad', False)\n    amp_scale_loss = amp_config.get('scale_loss', 32768)\n\n    if amp_enable:\n        if amp_dtype == \"float16\":\n            scaler = paddle.amp.GradScaler(init_loss_scaling=amp_scale_loss)\n        elif amp_dtype == \"bfloat16\":\n            scaler = paddle.amp.GradScaler(\n                init_loss_scaling=1, use_dynamic_loss_scaling=False)\n\n        # Note: Save dtype is the same as model dtype. 
Also can set save_dtype='float32' when \n        # training with pure fp16 strategy, but will cause the rise of memory.\n        model = paddle.amp.decorate(\n            models=model, level=amp_level, dtype=amp_dtype)\n    else:\n        scaler = None\n\n    config.Optimizer.lr.update({\n        'epochs': config.Global.num_train_epochs,\n        'step_each_epoch': len(train_data_loader),\n        'total_steps': config.Global.max_steps,\n    })\n\n    use_increments = config.Optimizer.lr.pop('use_increments', False)\n\n    # build lr and optim\n    lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr)\n    optimizer = cpn.build_optimizer(\n        config.Optimizer,\n        model,\n        lr_scheduler,\n        multi_precision=config.Global.mix_precision.enable)\n\n    if amp_enable and amp_dtype in [\n            'float16', 'bfloat16'\n    ] and amp_level == 'O2' and amp_use_main_grad:\n        model = amp.MixPrecisionLayer(model, dtype=amp_dtype)\n        optimizer = amp.MixPrecisionOptimizer(optimizer)\n        scaler = amp.MixPrecisionScaler(scaler)\n\n    # call fleet wrapper\n    if nranks > 1:\n        model, optimizer, scaler = strategy.wrap_with_fleet(\n            config.Distributed, model, optimizer, scaler)\n\n    # load pretrained checkpoints\n    load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1}\n    if config.Global.save_load.ckpt_dir is not None:\n        io.load(config.Global.save_load.ckpt_dir, model, optimizer, 'train',\n                load_recovery)\n\n    # build profiler\n    if config.get('Profiler', {}).get('enable', False):\n        profiler = cpn.build_profiler(config.Profiler)\n    else:\n        profiler = None\n\n    # start training\n    train_start = log.get_timestamp()\n\n    if load_recovery['rng_state'] != -1:\n        paddle.set_cuda_rng_state(load_recovery['rng_state'])\n\n    for epoch_index in range(load_recovery['epoch'],\n                             config.Global.num_train_epochs):\n        train_epoch_start = 
log.get_timestamp()\n\n        # time count\n        train_losses = []\n        train_step_start = log.get_timestamp()\n\n        # Note(GuoxiaWang): Do not use len(train_data_loader()),\n        # it will cause a memory leak.\n        total_train_batch = len(train_data_loader)\n        total_train_step = config.Global.max_steps\n        total_eval_batch = len(\n            valid_data_loader) if valid_data_loader is not None else 0\n        valid_data_loader = valid_data_loader(\n        ) if valid_data_loader is not None else None\n        eval_finished_step = 0\n        for step, batch in enumerate(train_data_loader()):\n            if epoch_index == load_recovery['epoch']:\n                if step < load_recovery['step']:\n                    continue\n\n            model.train()\n            fit_kwargs = {\n                \"model\": model,\n                \"loss_fn\": loss_fn,\n                \"scaler\": scaler,\n                \"optimizer\": optimizer,\n            }\n\n            def forward_func(batch, model, loss_fn):\n                tokens, position_ids, labels, loss_mask = batch\n\n                loss_mask.stop_gradient = True\n                labels.stop_gradient = True\n                position_ids.stop_gradient = True\n\n                preds = model(tokens, position_ids)\n                loss = loss_fn(preds, labels, loss_mask)\n\n                return loss\n\n            loss = impls.fit_impl(config, batch, forward_func, **fit_kwargs)\n            train_losses.append(loss)\n\n            if lr_scheduler is not None:\n                if scaler is None or scaler._found_inf == 0:\n                    lr_scheduler.step(epoch=config.Global.global_batch_size\n                                      if use_increments else None)\n\n            # training step log\n            if (step + 1) % config.Global.logging_freq == 0:\n                train_step_cost = log.get_timestamp() - train_step_start\n                numpy_losses = [float(loss) for loss in 
train_losses]\n\n                train_cost = train_step_cost \\\n                    if step == 0 else train_step_cost / config.Global.logging_freq\n                speed = 1. / train_cost\n                default_global_tokens_num = config.Global.global_batch_size * \\\n                    config.Data.Train.dataset.max_seq_len\n                ips_total = speed * default_global_tokens_num\n                ips = ips_total / env.get_data_world_size()\n\n                loss_scale_str = \" loss_scale: %.9f,\" % (\n                    scaler._scale.numpy()[0]) if scaler is not None else \"\"\n\n                logger.info(\n                    \"[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, \" \\\n                    \"ips_total: %.0f tokens/s, ips: %.0f tokens/s,%s learning rate: %.5e, found_inf: %d\"\n                    % (epoch_index, config.Global.num_train_epochs, step, total_train_step, sum(numpy_losses) / len(numpy_losses), train_cost, speed, ips_total, ips, loss_scale_str, optimizer.get_lr(), scaler._found_inf if scaler is not None else 0))\n\n                train_step_start = log.get_timestamp()\n                train_losses = []\n\n            optimizer.clear_grad()\n\n            # start eval\n            if step > 0 and config.Global.eval_freq > 0 and step % config.Global.eval_freq == 0:\n                eval_losses = []\n                eval_step_start = log.get_timestamp()\n\n                for eval_step, batch in enumerate(valid_data_loader):\n                    eval_finished_step += 1\n                    loss = impls.eval_impl(config, batch, model, loss_fn)\n                    eval_losses.append(loss)\n\n                    if eval_step >= config.Global.eval_iters - 1:\n                        break\n\n                eval_step_cost = log.get_timestamp() - eval_step_start\n                eval_loss = sum(eval_losses) / len(eval_losses)\n                eval_cost = eval_step_cost / 
config.Global.logging_freq\n\n                logger.info(\n                    \"[eval] epoch: %d, batch: %d/%d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s\"\n                    % (epoch_index, eval_step, eval_finished_step,\n                       float(eval_loss), eval_cost, 1. / eval_cost))\n\n            if step > 0 and config.Global.save_load.save_steps > 0 and \\\n                step % config.Global.save_load.save_steps == 0:\n                device.synchronize()\n                io.save(\n                    config.Global.save_load.output_dir,\n                    model,\n                    optimizer,\n                    step=step,\n                    epoch=epoch_index,\n                    sharding_stage=config.Distributed.sharding.sharding_stage)\n\n            if step >= config.Global.max_steps:\n                break\n\n            if profiler:\n                profiler.step()\n\n        # training epoch log\n        train_epoch_cost = log.get_timestamp() - train_epoch_start\n        logger.info(\"[Training] epoch: %d, total time: %.5f sec\" %\n                    (epoch_index, train_epoch_cost))\n\n    # training end log\n    logger.info(\n        \"The training process is complete and total cost of time for training is : {}\".\n        format(\n            log.convert_timestamp_to_data(log.get_timestamp() - train_start)))\n\n    if profiler:\n        cpn.profiler_done(profiler, config.Profiler)\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_345M_single_card.yaml",
    "content": "_base_: ./pretrain_moe_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 2\n  max_steps: 20000\n  logging_freq: 10\n  mix_precision:\n    enable: True\n\nData:\n  Train:\n    dataset:\n      split: [98,2,0]\n    loader:\n      num_workers: 0\n  Eval:\n    dataset:\n      split: [98,2,0]\n\nModel:\n  vocab_size: 50304\n  hidden_size: 768\n  num_layers: 12\n  num_attention_heads: 12\n  ffn_hidden_size: 3072\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.014\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  num_experts: 2\n  expert_interval: 2\n  topk: 1\n  moe_use_residual: False #True\n  moe_train_capacity_factor: 1.0\n  moe_eval_capacity_factor: 1.0\n  moe_min_capacity: 4\n  moe_token_dropping: True\n  balance_loss_weight: 0.01\n  enable_expert_tensor_parallelism: False\n\n\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1234\n\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps:\n  logging_freq: 1\n  eval_freq: 1000\n  eval_iters: 10\n  test_iters:\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 1000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  name: \"GPT\"\n  fused_linear: False\n  fuse_attn_qkv: True\n  sequence_parallel: False\n  no_recompute_layers:\n\n\nData:\n  Train:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n  \n  Eval:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n\n\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.1\n  beta1: 0.9\n  beta2: 0.95\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 20000\n    warmup_rate: 0.01\n    max_lr: 4.5e-4\n    min_lr: 4.5e-6\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n  tensor_fusion: False\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\nDistributed:\n  fuse_sequence_parallel_allreduce: False\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain_moe/impls.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\n\nimport numpy as np\n\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.optimizer.lr import LRScheduler\nfrom paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients\n\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.distributed.apis import env\nimport ppfleetx.models.language_model.gpt as gpt\nfrom ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters\nfrom ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer\nfrom ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import register_sequence_parallel_allreduce_hooks\n\nMODEL_CLASSES = {\n    \"GPT\": (GPTTokenizer, \"gpt2\"),\n    \"GPT-cn\": (GPTChineseTokenizer, \"gpt-cpm-large-cn\"),\n}\n\n\ndef _get_model_size(l, h, v, s, ne, ei):\n    assert len(ne) == 1 or len(ne) == l // ei, \\\n            'num_experts must be either a single value or a list of the same length as the number of MoE layers'\n    P = 0\n    # embedding\n    P += (v + s) * h\n    logger.info(f'vs: {v} {s}')\n    moe_mode = True\n    if len(ne) == 1:\n        if ne[0] == 1:\n            moe_mode = False\n        ne = ne * (l // ei)\n    for i in range(l):\n        # attention\n        P += 4 * h * h + 4 * h\n        # layer_norm of decoder\n        P += 2 * (2 * h)\n        # MoE Layer\n      
  if ((i + 1) % ei == 0) and moe_mode:\n            nei = ne[i // ei]\n            # gate\n            P += (h * nei + nei)\n            # experts\n            P += nei * (8 * h * h + 5 * h)\n        # FFN Layer\n        else:\n            P += 8 * h * h + 5 * h\n    # layer_norm of transformer\n    P += 2 * h\n    logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0))\n\n\ndef build_model(config):\n    nranks = dist.get_world_size()\n    model_setting = copy.deepcopy(config.Model)\n\n    if 'Compress' in config and 'Quantization' in config.Compress:\n        quant_setting = copy.deepcopy(config.Compress.Quantization)\n        model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map',\n                                                             {})\n        model_setting['freeze_embedding'] = quant_setting.get(\n            'freeze_embedding', False)\n\n    l = model_setting['num_layers']\n    h = model_setting['hidden_size']\n    v = model_setting['vocab_size']\n    s = model_setting['max_position_embeddings']\n    ne = model_setting['num_experts']\n    ei = model_setting['expert_interval']\n    _get_model_size(l, h, v, s, ne, ei)\n\n    model_name = model_setting.pop(\"name\")\n    tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]\n    tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n    model_setting.pop(\"balance_loss_weight\")\n    if nranks == 1:\n        model_setting.pop(\"sequence_parallel\")\n        model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting))\n    else:\n        model_setting['num_partitions'] = config.Distributed.mp_degree\n        if config.Distributed.pp_degree == 1:\n            model_setting.pop(\"virtual_pp_degree\", None)\n            model = gpt.GPTForPretrainingHybrid(\n                gpt.GPTModelHybrid(**model_setting))\n        else:\n            model = gpt.GPTForPretrainingPipe(**model_setting)\n\n    if config.Model.sequence_parallel:\n        
register_sequence_parallel_allreduce_hooks(\n            model, config.Global.accumulate_steps,\n            config.Distributed.fuse_sequence_parallel_allreduce)\n\n    if nranks == 1:\n        loss_fn = gpt.GPTPretrainingCriterion()\n    else:\n        loss_fn = gpt.GPTPretrainingCriterionHybird(\n            sequence_parallel=config.Model.sequence_parallel)\n\n    return model, tokenizer, loss_fn\n\n\ndef model_forward_backward(config, batch, forward_func, **kwargs):\n    acc_steps = config.Global.accumulate_steps\n    use_fp16 = config.Global.mix_precision.enable\n    black_list = config.Global.mix_precision.custom_black_list\n    white_list = config.Global.mix_precision.custom_white_list\n\n    # HACK： add 'expand' to black_list (put_along_axis_)\n    black_list.append('expand_v2')\n    # train with pipeline strategy\n    if config.Distributed.pp_degree > 1:\n        tokens, position_ids, labels, loss_mask = batch\n        batch = [(tokens, position_ids), (labels, loss_mask)]\n\n        batches = [batch]\n\n        with paddle.amp.auto_cast(\n                use_fp16,\n                custom_black_list=black_list,\n                custom_white_list=white_list,\n                level='O2'):\n\n            batch = kwargs['model']._prepare_training(\n                batch, kwargs['optimizer'], None)\n            loss = kwargs['model'].forward_backward_pipeline(batch,\n                                                             kwargs['scaler'])\n\n        return loss\n\n    # train with non-pipeline strategy\n    if acc_steps == 1:\n        batches = [batch]\n    else:\n        split_batches = [paddle.split(b, acc_steps) for b in batch]\n        batches = []\n        for i in range(len(split_batches[0])):\n            micro_batch = [split_batch[i] for split_batch in split_batches]\n            batches.append(micro_batch)\n\n    # gradient merge strategy\n    final_loss = None\n    for micro_batch in batches:\n        with paddle.amp.auto_cast(\n                
use_fp16,\n                custom_black_list=black_list,\n                custom_white_list=white_list,\n                level='O2'):\n\n            # forward in training step\n            loss = forward_func(micro_batch, kwargs['model'],\n                                kwargs['loss_fn'])\n\n        # calculate auxiliary loss to balance experts' load\n        if max(config.Model.\n               num_experts) > 1 and config.Model.balance_loss_weight:\n            aux_loss_list = [\n                l.moe_mlp.fleetx_moe.get_loss()\n                for l in kwargs['model'].gpt.decoder.layers\n                if l.moe_mlp is not None\n            ]\n            bal_loss = paddle.concat(aux_loss_list)\n            if bal_loss.dtype == paddle.float16:\n                bal_loss = paddle.cast(bal_loss, dtype=paddle.float32)\n            bal_loss = bal_loss.mean()\n            loss += bal_loss * config.Model.balance_loss_weight\n        loss_bw = kwargs['scaler'].scale(loss) if use_fp16 else loss\n        loss_bw = loss_bw / acc_steps if acc_steps > 1 else loss_bw\n        loss_bw.backward()\n\n        detach_loss = loss.detach()\n        if final_loss is None:\n            final_loss = detach_loss\n        else:\n            final_loss = paddle.add(final_loss, detach_loss)\n\n    final_loss = final_loss / acc_steps if acc_steps > 1 else final_loss\n\n    return final_loss\n\n\ndef optim_update_params(config, **kwargs):\n    hcg = env.get_hcg()\n    use_fp16 = config.Global.mix_precision.enable\n\n    dp_degree = config.Distributed.dp_degree\n    sharding_stage = config.Distributed.sharding.sharding_stage\n\n    if config.Model.use_recompute and isinstance(kwargs['model'],\n                                                 paddle.DataParallel):\n        if not hasattr(kwargs['optimizer'], \"all_fused_tensors\") or kwargs[\n                'optimizer'].all_fused_tensors is None:\n            fused_allreduce_gradients(list(kwargs['model'].parameters()), None)\n        else:\n  
          dp_group = hcg.get_data_parallel_group()\n            all_reduce_parameters(kwargs['optimizer'].all_fused_tensors,\n                                  dp_group)\n\n    if sharding_stage == 3 and dp_degree > 1:\n        dp_group = hcg.get_data_parallel_group()\n        fused_allreduce_gradients(kwargs['model'].parameters(), hcg)\n\n        for p in kwargs['model'].parameters():\n            if hasattr(p, \"bw_storage\"):\n                assert p.grad is None, \"This case shouldn't happen.\"\n                p.bw_storage.scale_(1.0 / dp_group.nranks)\n                dist.all_reduce(p.bw_storage, group=dp_group)\n\n    if use_fp16:\n        kwargs['scaler'].step(kwargs['optimizer'])\n        kwargs['scaler'].update()\n    else:\n        kwargs['optimizer'].step()\n\n\ndef fit_impl(config, batch, forward_func, **kwargs):\n    kwargs['model'].train()\n\n    if config.Distributed.pp_degree == 1:\n        if config.Model.use_recompute and isinstance(kwargs['model'],\n                                                     paddle.DataParallel):\n            with kwargs['model'].no_sync():\n                loss = model_forward_backward(config, batch, forward_func,\n                                              **kwargs)\n        else:\n            loss = model_forward_backward(config, batch, forward_func,\n                                          **kwargs)\n    else:\n        loss = model_forward_backward(config, batch, forward_func, **kwargs)\n\n    optim_update_params(config, **kwargs)\n\n    return loss\n\n\n@paddle.no_grad()\ndef eval_impl(config, batch, model, loss_fn):\n    model.eval()\n\n    use_fp16 = config.Global.mix_precision.enable\n    black_list = config.Global.mix_precision.custom_black_list\n    white_list = config.Global.mix_precision.custom_white_list\n\n    with paddle.amp.auto_cast(\n            use_fp16,\n            custom_black_list=black_list,\n            custom_white_list=white_list,\n            level='O2'):\n        tokens, 
position_ids, labels, loss_mask = batch\n\n        if config.Distributed.pp_degree == 1:\n            tokens, position_ids, labels, loss_mask = batch\n            preds = model(tokens, position_ids)\n            preds = paddle.cast(preds, dtype=\"float32\")\n            loss = loss_fn(preds, labels, loss_mask)\n        else:\n            batch = [(tokens, position_ids), (labels, loss_mask)]\n            loss = model.eval_batch(batch, compute_loss=True)\n\n    return loss\n"
  },
  {
    "path": "examples/transformer/models/GPT/pretrain_moe/run.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\nfrom paddle.static import InputSpec\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.insert(1, os.path.abspath(os.path.join(__dir__, '../../../../../')))\n\nfrom ppfleetx.distributed.apis import env, strategy, io\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.utils import device, log\nfrom examples.transformer.utils import config as cfg\nfrom examples.transformer.utils import components as cpn\n\nimport impls\n\nif __name__ == \"__main__\":\n    # parse config from yaml\n    args = cfg.parse_args()\n    config = cfg.get_config(args.config, overrides=args.override, show=True)\n\n    # HACK: use certain device\n    paddle.set_device(config.Global.device + ':3')\n\n    # init distributed env\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        env.init_dist_env(config)\n\n    env.set_seed(config.Global.seed)\n\n    cfg.process_configs(config)\n    cfg.print_config(config)\n\n    # Note: Only for GPTDataset\n    dataset_kwargs = {\n        \"seed\": config.Global.seed,\n        \"model_type\": config.Model.name,\n    }\n    sampler_kwargs = {\"batch_size\": config.Global.local_batch_size, }\n\n    # build dataloader for training/eval\n    dataset_kwargs.update({\"mode\": 
\"Train\"})\n    dataset = cpn.build_dataset(config.Data.Train.dataset, **dataset_kwargs)\n    sampler = cpn.build_batch_sampler(config.Data.Train.sampler, dataset,\n                                      **sampler_kwargs)\n    train_data_loader = cpn.build_dataloader(config.Data.Train.loader, dataset,\n                                             sampler)\n\n    dataset_kwargs.update({\"mode\": \"Eval\"})\n    dataset = cpn.build_dataset(config.Data.Eval.dataset, **dataset_kwargs)\n    sampler = cpn.build_batch_sampler(config.Data.Eval.sampler, dataset,\n                                      **sampler_kwargs)\n    valid_data_loader = cpn.build_dataloader(config.Data.Eval.loader, dataset,\n                                             sampler)\n\n    # build GPT model\n    model, tokenizer, loss_fn = impls.build_model(config)\n\n    if 'Compress' in config:\n        from examples.transformer.utils import qat\n        input_spec = [\n            InputSpec(\n                shape=[None, None], name=\"tokens\", dtype='int64'), InputSpec(\n                    shape=[None, None], name=\"ids\", dtype='int64')\n        ]\n        model, quanter = qat.compress_model(config, model, input_spec)\n\n    if config.Global.mix_precision.enable:\n        scaler = paddle.amp.GradScaler(\n            init_loss_scaling=config.Global.mix_precision.scale_loss)\n        # Note: Save dtype is the same as model dtype. 
Also can set save_dtype='float32' when \n        # training with pure fp16 strategy, but will cause the rise of memory.\n        model = paddle.amp.decorate(models=model, level='O2')\n    else:\n        scaler = None\n\n    config.Optimizer.lr.update({\n        'epochs': config.Global.num_train_epochs,\n        'step_each_epoch': len(train_data_loader),\n        'total_steps': config.Global.max_steps,\n    })\n\n    # build lr and optim\n    lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr)\n    optimizer = cpn.build_optimizer(\n        config.Optimizer,\n        model,\n        lr_scheduler,\n        multi_precision=config.Global.mix_precision.enable)\n\n    # call fleet wrapper\n    if nranks > 1:\n        model, optimizer, scaler = strategy.wrap_with_fleet(\n            config.Distributed, model, optimizer, scaler)\n\n    # load pretrained checkpoints\n    load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1}\n    if config.Global.save_load.ckpt_dir is not None:\n        io.load(config.Global.save_load.ckpt_dir, model, optimizer, 'train',\n                load_recovery)\n\n    # build profiler\n    if config.get('Profiler', {}).get('enable', False):\n        profiler = cpn.build_profiler(config.Profiler)\n    else:\n        profiler = None\n\n    # start training\n    train_start = log.get_timestamp()\n\n    if load_recovery['rng_state'] != -1:\n        paddle.set_cuda_rng_state(load_recovery['rng_state'])\n\n    for epoch_index in range(load_recovery['epoch'],\n                             config.Global.num_train_epochs):\n        train_epoch_start = log.get_timestamp()\n\n        # time count\n        train_losses = []\n        train_step_start = log.get_timestamp()\n\n        # Note(GuoxiaWang): Do not use len(train_data_loader()),\n        # it will cause a memory leak.\n        total_train_batch = len(train_data_loader)\n        total_eval_batch = len(\n            valid_data_loader) if valid_data_loader is not None else 0\n        for step, 
batch in enumerate(train_data_loader):\n            if epoch_index == load_recovery['epoch']:\n                if step <= load_recovery['step']:\n                    continue\n\n            model.train()\n            fit_kwargs = {\n                \"model\": model,\n                \"loss_fn\": loss_fn,\n                \"scaler\": scaler,\n                \"optimizer\": optimizer,\n            }\n\n            def forward_func(batch, model, loss_fn):\n                tokens, position_ids, labels, loss_mask = batch\n\n                loss_mask.stop_gradient = True\n                labels.stop_gradient = True\n                position_ids.stop_gradient = True\n\n                preds = model(tokens, position_ids)\n                loss = loss_fn(preds, labels, loss_mask)\n\n                return loss\n\n            loss = impls.fit_impl(config, batch, forward_func, **fit_kwargs)\n            train_losses.append(loss)\n\n            # training step log\n            if (step + 1) % config.Global.logging_freq == 0:\n                train_step_cost = log.get_timestamp() - train_step_start\n                numpy_losses = [loss.numpy()[0] for loss in train_losses]\n\n                train_cost = train_step_cost \\\n                    if step == 0 else train_step_cost / config.Global.logging_freq\n                speed = 1. 
/ train_cost\n                default_global_tokens_num = config.Global.global_batch_size * \\\n                    config.Data.Train.dataset.max_seq_len\n                ips_total = speed * default_global_tokens_num\n                ips = ips_total / env.get_data_world_size()\n\n                logger.info(\n                    \"[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, \" \\\n                    \"ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e\"\n                    % (epoch_index, step, sum(numpy_losses) / len(numpy_losses), train_cost, speed, ips_total, ips, optimizer.get_lr()))\n\n                train_step_start = log.get_timestamp()\n                train_losses = []\n\n            if lr_scheduler is not None:\n                lr_scheduler.step()\n\n            optimizer.clear_grad()\n\n            # start eval\n            if step > 0 and config.Global.eval_freq > 0 and step % config.Global.eval_freq == 0:\n                eval_losses = []\n                eval_step_start = log.get_timestamp()\n\n                for eval_step, batch in enumerate(valid_data_loader):\n                    loss = impls.eval_impl(config, batch, model, loss_fn)\n                    eval_losses.append(loss)\n\n                    if eval_step >= config.Global.eval_iters - 1:\n                        break\n\n                eval_step_cost = log.get_timestamp() - eval_step_start\n                eval_loss = sum(eval_losses) / len(eval_losses)\n                eval_cost = eval_step_cost / config.Global.logging_freq\n\n                logger.info(\n                    \"[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s\"\n                    % (epoch_index, eval_step, eval_loss.numpy()[0], eval_cost,\n                       1. 
/ eval_cost))\n\n            if step > 0 and config.Global.save_load.save_steps > 0 and \\\n                step % config.Global.save_load.save_steps == 0:\n                device.synchronize()\n                io.save(\n                    config.Global.save_load.output_dir,\n                    model,\n                    optimizer,\n                    step=step,\n                    epoch=epoch_index,\n                    sharding_stage=config.Distributed.sharding.sharding_stage)\n\n            if step >= config.Global.max_steps:\n                break\n\n            if profiler:\n                profiler.step()\n\n        # training epoch log\n        train_epoch_cost = log.get_timestamp() - train_epoch_start\n        logger.info(\"[Training] epoch: %d, total time: %.5f sec\" %\n                    (epoch_index, train_epoch_cost))\n\n    # training end log\n    logger.info(\n        \"The training process is complete and total cost of time for training is : {}\".\n        format(\n            log.convert_timestamp_to_data(log.get_timestamp() - train_start)))\n\n    if profiler:\n        cpn.profiler_done(profiler, config.Profiler)\n"
  },
  {
    "path": "examples/transformer/utils/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "examples/transformer/utils/components.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\nimport random\nimport numpy as np\n\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.optimizer.lr import LRScheduler\nfrom paddle.profiler import SummaryView\n\nfrom ppfleetx.data import dataset, sampler, utils\nfrom ppfleetx.distributed.apis import env\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.optims import optimizer, grad_clip, lr_scheduler\n\n\ndef build_dataset(config_dataset, **config_kwargs):\n    # build dataset\n    if config_dataset is not None:\n        config_dataset = copy.deepcopy(config_dataset)\n        dataset_name = config_dataset.pop('name')\n        config_dataset.update(config_kwargs)\n        dataset = eval(\"dataset.{}\".format(dataset_name))(**config_dataset)\n\n        logger.debug(\"build dataset({}) success...\".format(dataset))\n    else:\n        dataset = None\n\n    return dataset\n\n\ndef build_batch_sampler(config_sampler, dataset, **config_kwargs):\n    # build sampler\n    if config_sampler is not None:\n        config_sampler = copy.deepcopy(config_sampler)\n        sampler_name = config_sampler.pop(\"name\")\n        config_sampler.update(config_kwargs)\n        batch_sampler = eval(\"sampler.{}\".format(sampler_name))(\n            dataset, **config_sampler)\n\n        logger.debug(\"build batch_sampler({}) success...\".format(\n       
     batch_sampler))\n    else:\n        batch_sampler = None\n\n    return batch_sampler\n\n\ndef build_dataloader(config_loader,\n                     dataset,\n                     batch_sampler=None,\n                     **config_kwargs):\n    collate_fn = None\n\n    if config_loader is not None:\n        config_loader = copy.deepcopy(config_loader)\n        config_loader.update(config_kwargs)\n\n        collate_fn_cfg = config_loader.pop('collate_fn', None)\n        if isinstance(collate_fn_cfg, str):\n            collate_fn = getattr(\n                utils, collate_fn_cfg) if collate_fn_cfg is not None else None\n        elif isinstance(collate_fn_cfg, dict):\n            collate_fn_class_name = collate_fn_cfg.pop(\"name\")\n            collate_fn = eval(\"utils.{}\".format(collate_fn_class_name))(\n                **collate_fn_cfg)\n\n            logger.debug(\"build collate_fn({}) success...\".format(collate_fn))\n\n    def worker_init_fn(worker_id):\n        \"\"\" set seed in subproces for dataloader when num_workers > 0\"\"\"\n        np.random.seed(env.get_dp_seed() + worker_id)\n        random.seed(env.get_dp_seed() + worker_id)\n\n    data_loader = paddle.io.DataLoader(\n        dataset=dataset,\n        batch_sampler=batch_sampler,\n        collate_fn=collate_fn,\n        worker_init_fn=worker_init_fn,\n        **config_loader)\n\n    logger.debug(\"build data_loader({}) success...\".format(data_loader))\n    return data_loader\n\n\ndef build_lr_scheduler(lr_config):\n    if 'name' in lr_config:\n        lr_name = lr_config.pop('name')\n        lr = eval(\"lr_scheduler.{}\".format(lr_name))(**lr_config)\n        if isinstance(lr, LRScheduler):\n            return lr\n        else:\n            return lr()\n    else:\n        lr = lr_config.learning_rate\n\n    logger.debug(\"build lr ({}) success..\".format(lr))\n    return lr\n\n\ndef build_grad_clip(grad_clip_config):\n    if grad_clip_config is not None:\n        grad_clip_name = 
grad_clip_config.pop('name', 'ClipGradByGlobalNorm')\n        grad_clip = eval(\"grad_clip.{}\".format(grad_clip_name))(\n            **grad_clip_config)\n        return grad_clip\n    else:\n        return None\n\n\ndef build_optimizer(config, model, lr_scheduler=None, multi_precision=False):\n    config = copy.deepcopy(config)\n    if lr_scheduler is not None:\n        config.pop('lr')\n\n    grad_clip_config = config.pop('grad_clip', None)\n    grad_clip = build_grad_clip(grad_clip_config)\n\n    optim_name = config.pop('name')\n    optim = eval(\"optimizer.{}\".format(optim_name))(\n        learning_rate=lr_scheduler,\n        parameters=model.parameters(),\n        grad_clip=grad_clip,\n        multi_precision=multi_precision,\n        **config)\n\n    logger.debug(\"build optimizer ({}) success..\".format(optim))\n    return optim\n\n\ndef build_profiler(profiler_config):\n    profiler = None\n\n    if profiler_config.get('enable', False):\n        scheduler = profiler_config.get('scheduler', None)\n        profiler_log = profiler_config.get('profiler_log', './profiler_log')\n        record_shapes = profiler_config.get('record_shapes', True)\n        profile_memory = profiler_config.get('profile_memory', True)\n        profiler = paddle.profiler.Profiler(\n            targets=[\n                paddle.profiler.ProfilerTarget.CPU,\n                paddle.profiler.ProfilerTarget.GPU\n            ],\n            scheduler=scheduler,\n            on_trace_ready=paddle.profiler.export_chrome_tracing(profiler_log),\n            record_shapes=record_shapes,\n            profile_memory=profile_memory)\n        profiler.start()\n        logger.warning(\"Profiler is enabled, do not enable it in production.\")\n\n    return profiler\n\n\ndef profiler_done(profiler, profiler_config):\n    if not profiler:\n        return\n\n    logger.info(\"Profiler finished, prepare to print summary...\")\n\n    profiler.stop()\n\n    _print_summary(profiler, profiler_config)\n    
profiler_log = profiler_config.get('profiler_log', './profiler_log')\n    logger.info(\n        \"For more information please install visualdl and run it with following command:\"\n    )\n    logger.info(\n        \"-------------------------------------------------------------------------------\"\n    )\n    logger.info(f\"visualdl --host 0.0.0.0 --logdir {profiler_log}\")\n    logger.info(\n        \"-------------------------------------------------------------------------------\"\n    )\n\n\ndef _print_summary(profiler, profiler_config):\n    views_dict = {\n        SummaryView.DeviceView: 'device',\n        SummaryView.OverView: 'overview',\n        SummaryView.ModelView: 'model',\n        SummaryView.DistributedView: 'dist',\n        SummaryView.KernelView: 'kernel',\n        SummaryView.OperatorView: 'op',\n        SummaryView.MemoryView: 'mem',\n        SummaryView.MemoryManipulationView: 'memcpy',\n        SummaryView.UDFView: 'udf',\n    }\n\n    default_views = [\n        SummaryView.OverView,\n        SummaryView.ModelView,\n        SummaryView.KernelView,\n        SummaryView.OperatorView,\n    ]\n\n    def gen_views(cfg):\n        # print all summary view if detailed=True\n        if profiler_config.get('detailed', False):\n            return None\n\n        views = []\n        # override default view with user defined value if detailed=False\n        for view in SummaryView:\n            v = profiler_config.get('summary', {}).get(views_dict[view], None)\n            if v is True or (v is None and view in default_views):\n                views.append(view)\n\n        return views or None\n\n    profiler.summary(\n        sorted_by=paddle.profiler.SortedKeys.GPUTotal,\n        views=gen_views(profiler_config))\n"
  },
  {
    "path": "examples/transformer/utils/config.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport logging\nimport os\nimport sys\nimport copy\nimport argparse\nimport codecs\nimport yaml\nimport numpy as np\n\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.fluid import core\nfrom paddle.fluid.reader import use_pinned_memory\n\nfrom ppfleetx.distributed.apis import env\nfrom ppfleetx.utils.log import logger, advertise\nfrom ppfleetx.utils import check\n\n__all__ = ['get_config', 'print_config']\n\n\nclass AttrDict(dict):\n    def __getattr__(self, key):\n        return self[key]\n\n    def __setattr__(self, key, value):\n        if key in self.__dict__:\n            self.__dict__[key] = value\n        else:\n            self[key] = value\n\n    def __copy__(self):\n        cls = self.__class__\n        result = cls.__new__(cls)\n        result.__dict__.update(self.__dict__)\n        return result\n\n    def __deepcopy__(self, memo):\n        cls = self.__class__\n        result = cls.__new__(cls)\n        memo[id(self)] = result\n        for k, v in self.__dict__.items():\n            setattr(result, k, copy.deepcopy(v, memo))\n        for k, v in self.items():\n            setattr(result, k, copy.deepcopy(v, memo))\n        return result\n\n    def setdefault(self, k, default=None):\n        if k not in self or self[k] is None:\n            self[k] = default\n            return default\n        else:\n   
         return self[k]\n\n\ndef create_attr_dict(yaml_config):\n    from ast import literal_eval\n    for key, value in yaml_config.items():\n        if type(value) is dict:\n            yaml_config[key] = value = AttrDict(value)\n        if isinstance(value, str):\n            try:\n                value = literal_eval(value)\n            except BaseException:\n                pass\n        if isinstance(value, AttrDict):\n            create_attr_dict(yaml_config[key])\n        else:\n            yaml_config[key] = value\n\n\ndef parse_config(cfg_file):\n    \"\"\"Load a config file into AttrDict\"\"\"\n\n    def _update_dic(dic, base_dic):\n        '''Update config from dic based base_dic\n        '''\n        base_dic = base_dic.copy()\n        dic = dic.copy()\n\n        if dic.get('_inherited_', True) == False:\n            dic.pop('_inherited_')\n            return dic\n\n        for key, val in dic.items():\n            if isinstance(val, dict) and key in base_dic:\n                base_dic[key] = _update_dic(val, base_dic[key])\n            else:\n                base_dic[key] = val\n        dic = base_dic\n        return dic\n\n    def _parse_from_yaml(path):\n        '''Parse a yaml file and build config'''\n\n        with codecs.open(path, 'r', 'utf-8') as file:\n            dic = yaml.load(file, Loader=yaml.FullLoader)\n\n        if '_base_' in dic:\n            cfg_dir = os.path.dirname(path)\n            base_path = dic.pop('_base_')\n            base_path = os.path.join(cfg_dir, base_path)\n            base_dic = _parse_from_yaml(base_path)\n            dic = _update_dic(dic, base_dic)\n        return dic\n\n    yaml_dict = _parse_from_yaml(cfg_file)\n    yaml_config = AttrDict(yaml_dict)\n\n    create_attr_dict(yaml_config)\n    return yaml_config\n\n\ndef print_dict(d, delimiter=0):\n    \"\"\"\n    Recursively visualize a dict and\n    indenting acrrording by the relationship of keys.\n    \"\"\"\n    placeholder = \"-\" * 60\n    for k, v in 
sorted(d.items()):\n        if isinstance(v, dict):\n            logger.info(\"{}{} : \".format(delimiter * \" \", k))\n            print_dict(v, delimiter + 4)\n        elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):\n            logger.info(\"{}{} : \".format(delimiter * \" \", k))\n            for value in v:\n                print_dict(value, delimiter + 4)\n        else:\n            logger.info(\"{}{} : {}\".format(delimiter * \" \", k, v))\n        if k.isupper():\n            logger.info(placeholder)\n\n\ndef print_config(config):\n    \"\"\"\n    visualize configs\n    Arguments:\n        config: configs\n    \"\"\"\n    advertise()\n    print_dict(config)\n\n\ndef check_config(config):\n    \"\"\"\n    Check config\n    \"\"\"\n    # global_batch_size = config.get(\"\")\n\n    global_config = config.get('Global')\n    check.check_version()\n    device = global_config.get('device', 'gpu')\n    device = device.lower()\n    if device in ['gpu', 'xpu', 'rocm', 'npu', \"cpu\"]:\n        check.check_device(device)\n    else:\n        raise ValueError(\n            f\"device({device}) is not in ['gpu', 'xpu', 'rocm', 'npu', 'cpu'],\\n\"\n            \"Please ensure the config option Global.device is one of these devices\"\n        )\n\n\ndef override(dl, ks, v):\n    \"\"\"\n    Recursively replace dict of list\n    Args:\n        dl(dict or list): dict or list to be replaced\n        ks(list): list of keys\n        v(str): value to be replaced\n    \"\"\"\n\n    def str2num(v):\n        try:\n            return eval(v)\n        except Exception:\n            return v\n\n    assert isinstance(dl, (list, dict)), (\"{} should be a list or a dict\")\n    assert len(ks) > 0, ('lenght of keys should larger than 0')\n    if isinstance(dl, list):\n        k = str2num(ks[0])\n        if len(ks) == 1:\n            assert k < len(dl), ('index({}) out of range({})'.format(k, dl))\n            dl[k] = str2num(v)\n        else:\n            
override(dl[k], ks[1:], v)\n    else:\n        if len(ks) == 1:\n            # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))\n            if not ks[0] in dl:\n                print('A new field ({}) detected!'.format(ks[0], dl))\n            dl[ks[0]] = str2num(v)\n        else:\n            if ks[0] not in dl.keys():\n                dl[ks[0]] = {}\n                print(\"A new Series field ({}) detected!\".format(ks[0], dl))\n            override(dl[ks[0]], ks[1:], v)\n\n\ndef override_config(config, options=None):\n    \"\"\"\n    Recursively override the config\n    Args:\n        config(dict): dict to be replaced\n        options(list): list of pairs(key0.key1.idx.key2=value)\n            such as: [\n                'topk=2',\n                'VALID.transforms.1.ResizeImage.resize_short=300'\n            ]\n    Returns:\n        config(dict): replaced config\n    \"\"\"\n    if options is not None:\n        for opt in options:\n            assert isinstance(opt, str), (\n                \"option({}) should be a str\".format(opt))\n            assert \"=\" in opt, (\n                \"option({}) should contain a =\"\n                \"to distinguish between key and value\".format(opt))\n            pair = opt.split('=')\n            assert len(pair) == 2, (\"there can be only a = in the option\")\n            key, value = pair\n            keys = key.split('.')\n            override(config, keys, value)\n    return config\n\n\ndef get_config(fname, overrides=None, show=False):\n    \"\"\"\n    Read config from file\n    \"\"\"\n    assert os.path.exists(fname), (\n        'config file({}) is not exist'.format(fname))\n    config = parse_config(fname)\n    override_config(config, overrides)\n\n    process_dist_config(config)\n    process_global_configs(config)\n    create_attr_dict(AttrDict(config))\n\n    if show:\n        print_config(config)\n    check_config(config)\n    return config\n\n\ndef parse_args():\n    parser = 
argparse.ArgumentParser(\"train script\")\n    parser.add_argument(\n        '-c',\n        '--config',\n        type=str,\n        default='configs/config.yaml',\n        help='config file path')\n    parser.add_argument(\n        '-o',\n        '--override',\n        action='append',\n        default=[],\n        help='config options to be overridden')\n    args = parser.parse_args()\n    return args\n\n\ndef is_fused_matmul_bias_supported():\n    if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():\n        return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue')\n    else:\n        return False\n\n\ndef process_dist_config(configs):\n    \"\"\"\n    process distributed strategy for hybrid parallel\n    \"\"\"\n    nranks = dist.get_world_size()\n\n    config = configs['Distributed']\n\n    config.setdefault(\"hcg\", \"HybridCommunicateGroup\")\n    mp_degree = config.setdefault(\"mp_degree\", 1)\n    pp_degree = config.setdefault(\"pp_degree\", 1)\n    pp_recompute_interval = config.setdefault(\"pp_recompute_interval\", 1)\n\n    # sharding default\n    sharding_config = config['sharding']\n    sharding_degree = sharding_config.setdefault(\"sharding_degree\", 1)\n    sharding_stage = sharding_config.setdefault('sharding_stage', 2)\n    sharding_offload = sharding_config.setdefault('sharding_offload', False)\n    reduce_overlap = sharding_config.setdefault('reduce_overlap', False)\n    broadcast_overlap = sharding_config.setdefault('broadcast_overlap', False)\n\n    other_degree = mp_degree * pp_degree * sharding_degree\n\n    assert nranks % other_degree == 0, \"unreasonable config of dist_strategy.\"\n    dp_degree = config.setdefault(\"dp_degree\", nranks // other_degree)\n    assert nranks % dp_degree == 0, \"unreasonable config of dist_strategy.\"\n    assert nranks == dp_degree * other_degree, \\\n        \"Mismatched config using {} cards with dp_degree[{}],\" \\\n            \"mp_degree[{}], pp_degree[{}] and 
sharding_degree[{}]\".format(nranks, \\\n                dp_degree, mp_degree, pp_degree, sharding_degree)\n\n    if sharding_config['sharding_degree'] > 1 and reduce_overlap:\n        if sharding_config['sharding_stage'] == 3 or sharding_config[\n                'sharding_offload']:\n            sharding_config['reduce_overlap'] = False\n            logger.warning(\n                \"reduce overlap only valid for sharding stage 2 without offload\"\n            )\n\n    if sharding_config['sharding_degree'] > 1 and broadcast_overlap:\n        if sharding_config['sharding_stage'] == 3 or sharding_config[\n                'sharding_offload']:\n            sharding_config['broadcast_overlap'] = False\n            logger.warning(\n                \"broadcast overlap only valid for sharding stage 2 without offload\"\n            )\n\n    if broadcast_overlap and configs['Global']['logging_freq'] == 1:\n        logger.warning(\n            \"Set logging_freq to 1 will disable broadcast_overlap. 
\"\n            \"If you want to overlap the broadcast, please increase the logging_freq.\"\n        )\n        sharding_config['broadcast_overlap'] = False\n\n    if sharding_config['sharding_degree'] > 1:\n        if getattr(sharding_config, 'broadcast_overlap', False):\n            logger.warning(\n                \"Enable broadcast overlap for sharding will not use pin memory for dataloader\"\n            )\n            use_pinned_memory(False)\n\n    if 'fuse_sequence_parallel_allreduce' not in config:\n        config['fuse_sequence_parallel_allreduce'] = False\n\n\ndef process_global_configs(config):\n    \"\"\"\n    process global configs for hybrid parallel\n    \"\"\"\n    dp_degree = config['Distributed']['dp_degree']\n    pp_degree = config['Distributed']['pp_degree']\n    sharding_degree = config['Distributed']['sharding']['sharding_degree']\n\n    config['Global']['enable_partial_send_recv'] = True\n    if 'sequence_parallel' in config['Model'] and pp_degree > 1:\n        if config['Model']['sequence_parallel']:\n            config['Global']['enable_partial_send_recv'] = False\n            logger.warning(\n                \"if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, \" \\\n                \"config.Global.enable_partial_send_recv will be set False.\"\n            )\n\n    global_cfg = config['Global']\n\n    # Set environment variable\n    flags = global_cfg.get(\"flags\", {})\n    paddle.set_flags(flags)\n    for k, v in flags.items():\n        logger.info(\"Environment variable {} is set {}.\".format(k, v))\n\n    if global_cfg['global_batch_size'] is None and global_cfg[\n            'local_batch_size'] is None:\n        raise ValueError(\n            \"global_batch_size or local_batch_size should be set.\")\n    elif global_cfg['global_batch_size'] is not None and global_cfg[\n            'local_batch_size'] is not None:\n        assert global_cfg['global_batch_size'] // global_cfg['local_batch_size'] == 
(dp_degree * sharding_degree), \"global_batch_size[{}] should be divided by local_batch_size[{}] \"\\\n            \"when dp_degree is [{}] and sharding_degree is [{}]\".format(global_cfg['global_batch_size'],\n            global_cfg['local_batch_size'], dp_degree, sharding_degree)\n    elif global_cfg['global_batch_size'] is not None and global_cfg[\n            'local_batch_size'] is None:\n        assert global_cfg['global_batch_size'] % (dp_degree * sharding_degree) == 0, \\\n            \"global_batch_size[{}] should be divided by dp_degree[{}] times sharding_degree[{}]\"\\\n            .format(global_cfg['global_batch_size'], dp_degree, sharding_degree)\n        global_cfg['local_batch_size'] = global_cfg['global_batch_size'] // (\n            dp_degree * sharding_degree)\n    else:\n        global_cfg['global_batch_size'] = global_cfg[\n            'local_batch_size'] * dp_degree * sharding_degree\n    assert global_cfg['local_batch_size'] % global_cfg['micro_batch_size'] == 0\n\n    # save_load\n    global_cfg['save_load'] = global_cfg.get('save_load', {})\n    save_load_cfg = global_cfg.save_load\n    save_steps = save_load_cfg.get('save_steps', None)\n    save_epoch = save_load_cfg.get('save_epoch', None)\n    if save_steps is None or save_steps == -1:\n        save_load_cfg[\n            'save_steps'] = sys.maxsize if sys.version > '3' else sys.maxint\n\n    if save_epoch is None or save_epoch == -1:\n        save_load_cfg['save_epoch'] = 1\n\n    save_load_cfg['output_dir'] = save_load_cfg.get('output_dir', './output')\n    save_load_cfg['ckpt_dir'] = save_load_cfg.get('ckpt_dir', None)\n\n    # mix_precision\n    global_cfg['mix_precision'] = global_cfg.get('mix_precision', {})\n    amp_cfg = global_cfg.mix_precision\n\n    amp_cfg['enable'] = amp_cfg.get('enable', False)\n    amp_cfg['scale_loss'] = amp_cfg.get('scale_loss', 32768)\n    amp_cfg['custom_black_list'] = amp_cfg.get('custom_black_list', None)\n    amp_cfg['custom_white_list'] = 
amp_cfg.get('custom_white_list', None)\n\n    global_cfg['max_steps'] = global_cfg.get('max_steps', 500000)\n    global_cfg['eval_freq'] = global_cfg.get('eval_freq', -1)\n    global_cfg['eval_iters'] = global_cfg.get('eval_iters', 0)\n    global_cfg['logging_freq'] = global_cfg.get('logging_freq', 1)\n    global_cfg['num_train_epochs'] = global_cfg.get('num_train_epochs', 1)\n    global_cfg['test_iters'] = global_cfg['eval_iters'] * 10 \\\n            if global_cfg.get('test_iters', None) is None else global_cfg['test_iters']\n    global_cfg[\n        'accumulate_steps'] = global_cfg.local_batch_size // global_cfg.micro_batch_size\n\n\ndef process_model_configs(config):\n    \"\"\"\n    process model configs for hybrid parallel\n    \"\"\"\n    configs = config['Model']\n    if configs['ffn_hidden_size'] is None:\n        configs['ffn_hidden_size'] = 4 * configs['hidden_size']\n\n    if configs['use_recompute']:\n        if not configs['recompute_granularity']:\n            configs['recompute_granularity'] = 'full'\n        if not configs['no_recompute_layers']:\n            configs['no_recompute_layers'] = []\n        else:\n            assert isinstance(configs['no_recompute_layers'],\n                              list), \"no_recompute_layers should be a list\"\n            for i in configs['no_recompute_layers']:\n                assert isinstance(\n                    i, int\n                ), \"all values in no_recompute_layers should be an integer\"\n            assert min(configs['no_recompute_layers']) >= 0, \\\n                \"the min value in no_recompute_layers should >= 0\"\n            assert max(configs['no_recompute_layers']) < configs['num_layers'], \\\n                \"the max value in no_recompute_layers should < num_layers\"\n            configs['no_recompute_layers'] = sorted(\n                list(set(configs['no_recompute_layers'])))\n\n    if configs['fused_linear'] and not is_fused_matmul_bias_supported():\n        
configs['fused_linear'] = False\n        logging.warning(\n            \"The flag fused_linear only valid for cuda version higher than 11.6, \"\n            \"but the paddle is compiled with cuda \" + paddle.version.cuda())\n\n    pp_degree = config.Distributed.pp_degree\n\n    if pp_degree > 1:\n        configs['virtual_pp_degree'] = 1 \\\n            if configs.get('virtual_pp_degree', None) is None \\\n            else configs['virtual_pp_degree']\n        virtual_pp_degree = configs['virtual_pp_degree']\n        num_layers = configs.num_layers\n\n        if not (num_layers % (virtual_pp_degree * pp_degree)) == 0:\n            assert virtual_pp_degree == 1, \"virtual pp doesn't support uneven layer split.\"\n            logger.warning(\n                \"The num_layers of the model is not divisible by pp_degree.\" \\\n                \"Receive num_layers: {}, pp_degree: {}.\".format(num_layers, pp_degree))\n        else:\n            assert (num_layers %\n                (virtual_pp_degree * pp_degree)) == 0, \\\n                \"The num_layers of the model should be divisible of pp_degree * virtual_pp_degree.\" \\\n                \"Receive num_layers: {}, pp_degree: {}, virtual_pp_degree: {}.\".format(\n                num_layers, pp_degree, virtual_pp_degree)\n\n        if virtual_pp_degree > 1:\n            local_batch_size = config.Global.local_batch_size\n            micro_batch_size = config.Global.micro_batch_size\n            acc_steps = local_batch_size // micro_batch_size\n            assert acc_steps % pp_degree == 0, \"num of microbatches {} should be divisible of pp_degree {} when \" \\\n                                               \"using interleave pipeline\".format(acc_steps, pp_degree)\n\n        if virtual_pp_degree > 2:\n            logger.warning(\n                \"Setting virtual_pp_degree > 2 may harm the throughput of the pipeline parallel.\"\n            )\n    else:\n        if configs.get('virtual_pp_degree', None):\n            
logger.warning(\"virtual_pp_degree is unuseful.\")\n\n\ndef process_optim_configs(config):\n    \"\"\"\n    process optim configs for hybrid parallel\n    \"\"\"\n    if 'Optimizer' not in config.keys():\n        return\n\n    nranks = dist.get_world_size()\n    dp_degree = config['Distributed']['dp_degree']\n    sharding_degree = config['Distributed']['sharding']['sharding_degree']\n    if config['Optimizer']['tensor_fusion']:\n        assert nranks == dp_degree * sharding_degree, \\\n            \"tensor_fusion only support single card train or data/sharding parallel train\"\n\n    if config['Optimizer']['lr']['decay_steps'] is None:\n        config['Optimizer']['lr']['decay_steps'] = config['Engine'][\n            'max_steps']\n    config['Optimizer']['lr']['decay_steps'] *= config['Global'][\n        'global_batch_size']\n\n\ndef process_data_configs(config):\n    \"\"\"\n    process data configs for hybrid parallel\n    \"\"\"\n    if 'Data' not in config.keys():\n        return\n\n    cfg_global = config['Global']\n    cfg_data = config['Data']\n\n    mode_to_num_samples = {\n        \"Train\":\n        cfg_global['global_batch_size'] * config['Global']['max_steps'],\n        \"Eval\": cfg_global['global_batch_size'] *\n        (config['Global']['max_steps'] // config['Global']['eval_freq'] + 1) *\n        config['Global']['eval_iters'],\n        \"Test\":\n        cfg_global['global_batch_size'] * config['Global']['test_iters'],\n    }\n\n    for mode in (\"Train\", \"Eval\", \"Test\"):\n        if mode in cfg_data.keys():\n            cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[\n                mode]\n\n\ndef process_inference_configs(config):\n    \"\"\"\n    process inference configs for hybrid parallel\n    \"\"\"\n    if 'Inference' not in config.keys():\n        return\n\n    configs = config['Inference']\n\n    if configs['model_dir'] is None:\n        configs['model_dir'] = config['Global']['save_load']['output_dir']\n\n    if 
configs['mp_degree'] is None:\n        configs['mp_degree'] = config['Distributed']['mp_degree']\n\n\ndef process_configs(config):\n    process_data_configs(config)\n    process_model_configs(config)\n    process_optim_configs(config)\n    process_inference_configs(config)\n\n    return config\n"
  },
  {
    "path": "examples/transformer/utils/qat.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\n\nfrom ppfleetx.distributed.apis import io\nfrom ppfleetx.utils.compression_helper import prune_model, quant_model\n\n\ndef compress_model(config, model, input_spec):\n    quanter, quant_configs = None, None\n    prune_configs, compress_configs = None, None\n\n    if 'Compress' in config:\n        compress_configs = config['Compress']\n        if \"Prune\" in compress_configs:\n            prune_configs = compress_configs[\"Prune\"]\n        if \"Quantization\" in compress_configs:\n            quant_configs = compress_configs[\"Quantization\"]\n\n        # Load pretrained model before compression\n        if 'pretrained' in compress_configs and compress_configs[\n                'pretrained'] is not None:\n            ckpt_dir = compress_configs['pretrained']\n            io.load(\n                ckpt_dir,\n                model,\n                optimizer=None,\n                mode='quant',\n                load_recovery=None)\n\n            # Avoid loading again\n            config.Global.save_load.ckpt_dir = None\n\n        if prune_configs is not None and prune_configs.enable:\n            prune_model(model, prune_configs, input_spec)\n\n        # NOTE(minghaoBD): We haven't fully tested Prune+Quantization, so an \"else if\" is put here for separation.\n        elif quant_configs is not None and 
quant_configs.enable:\n            model, quanter = quant_model(model, quant_configs)\n\n    return model, quanter\n"
  },
  {
    "path": "ppfleetx/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml",
    "content": "_base_: ./imagen_base.yaml\n\nGlobal:\n  global_batch_size:\n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nModel:\n  name: imagen_397M_text2im_64\n  text_encoder_name: projects/imagen/t5/t5-11b\n  text_embed_dim: 1024 \n  timesteps: 1000 \n  channels: 3\n  cond_drop_prob: 0.1\n  noise_schedules: cosine\n  pred_objectives: noise\n  lowres_noise_schedule: linear\n  lowres_sample_noise_level: 0.2\n  per_sample_random_aug_noise_level: False\n  condition_on_text: True\n  auto_normalize_img: True\n  p2_loss_weight_gamma: 0.5\n  dynamic_thresholding: True,\n  dynamic_thresholding_percentile: 0.95\n  only_train_unet_number: 1 \n  use_recompute: False\n  recompute_granularity:\n\nData:\n  Train:\n    dataset:\n      name: ImagenDataset\n      input_path: ./projects/imagen/filelist/laion_400M/train\n      shuffle: True\n      image_format: base64 \n      image_size: 64 \n      text_max_len: 128 \n      filter_image_resolution: 64\n    loader:\n      num_workers: 8\n      shuffle: True\n      batch_size: 16 \n      drop_last: True\n      collate_fn: imagen_collate_fn\n\nLoss:\n  name: mse_loss\n  p2_loss_weight_k: 1.0\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/multimodal/imagen/imagen_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nEngine:\n  max_steps: 2500000\n  num_train_epochs: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  eval_freq: 10000000\n  eval_iters: 10000000\n  mix_precision:\n    enable: False \n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 10000\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  module: \"ImagenModule\"\n  name: \"Imagen\"\n  fused_linear: False\n\n# data loader for train\nData:\n  Train:\n    dataset:\n      name: ImagenDataset\n      input_path: ./projects/imagen/filelist/laion_400M/train\n      shuffle: True\n      image_format: base64 \n      image_size: 64 \n      text_max_len: 128 \n      filter_image_resolution: 64\n    loader:\n      num_workers: 8\n      shuffle: True\n      batch_size: 16 \n      drop_last: True\n      collate_fn: imagen_collate_fn\n  \n\nFused:\n  tensor_fusion: False\n\n\nOptimizer:\n  name: Adam\n  weight_decay: 0.\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 2500000\n    warmup_rate: 0.025 \n    max_lr: 1.0e-4\n    min_lr: 0.0\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n"
  },
  {
    "path": "ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml",
    "content": "_base_: ./imagen_base.yaml\n\nGlobal:\n  global_batch_size:\n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nModel:\n  name: imagen_SR1024\n  text_encoder_name: None \n  text_embed_dim: \n  timesteps: 1000 \n  channels: 3\n  cond_drop_prob: 0.1\n  noise_schedules: cosine\n  pred_objectives: noise\n  lowres_cond: True \n  lowres_noise_schedule: linear\n  lowres_sample_noise_level: 0.2\n  per_sample_random_aug_noise_level: False\n  condition_on_text: False \n  auto_normalize_img: True\n  p2_loss_weight_gamma: 0.5\n  dynamic_thresholding: True,\n  dynamic_thresholding_percentile: 0.95\n  only_train_unet_number: 1 \n  is_sr: True\n  use_recompute: True \n  recompute_granularity:\n\nEngine:\n  max_steps: 2500000\n  num_train_epochs: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  eval_freq: 10000000\n  eval_iters: 10000000\n  mix_precision:\n    enable: False \n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    fp16_dtype: \"bfloat16\"\n  save_load:\n    save_steps: 10000\n    output_dir: ./output\n    ckpt_dir:\n\nData:\n  Train:\n    dataset:\n      name: ImagenDataset\n      input_path: ./projects/imagen/filelist/laion_400M/train\n      shuffle: True\n      image_format: base64 \n      image_size: 1024 \n      text_max_len: 128 \n      filter_image_resolution: 1024 \n      sr: True\n    loader:\n      num_workers: 8\n      shuffle: True\n      batch_size: 1 \n      drop_last: True\n      collate_fn: imagen_collate_fn\n  \n\nLoss:\n  name: mse_loss\n  p2_loss_weight_k: 1.0\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml",
    "content": "_base_: ./imagen_base.yaml\n\nGlobal:\n  global_batch_size:\n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nModel:\n  name: imagen_SR256\n  text_encoder_name: None  # We do not use text conditoin during training.\n  text_embed_dim: \n  timesteps: 1000 \n  channels: 3\n  cond_drop_prob: 0.1\n  noise_schedules: cosine\n  pred_objectives: noise\n  lowres_cond: True \n  lowres_noise_schedule: linear\n  lowres_sample_noise_level: 0.2\n  per_sample_random_aug_noise_level: False\n  condition_on_text: False \n  auto_normalize_img: True\n  p2_loss_weight_gamma: 0.5\n  dynamic_thresholding: True,\n  dynamic_thresholding_percentile: 0.95\n  only_train_unet_number: 1 \n  is_sr: True\n  use_recompute: True \n  recompute_granularity:\n\nData:\n  Train:\n    dataset:\n      name: ImagenDataset\n      input_path: ./projects/imagen/filelist/laion_400M/train\n      shuffle: True\n      image_format: base64 \n      image_size: 256 \n      text_max_len: 128 \n      filter_image_resolution: 256 \n      sr: True\n    loader:\n      num_workers: 8\n      shuffle: True\n      batch_size: 6 \n      drop_last: True\n      collate_fn: imagen_collate_fn\n  \n\nLoss:\n  name: mse_loss\n  p2_loss_weight_k: 1.0\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_DebertaV2.yaml",
    "content": "_base_: ./imagen_base.yaml\n\nGlobal:\n  global_batch_size:\n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nModel:\n  name: imagen_text2im_64_debertav2\n  text_encoder_name: projects/imagen/cache/deberta-v-xxlarge\n  text_embed_dim: 1536\n  timesteps: 1000 \n  channels: 3\n  cond_drop_prob: 0.1\n  noise_schedules: cosine\n  pred_objectives: noise\n  lowres_noise_schedule: linear\n  lowres_sample_noise_level: 0.2\n  per_sample_random_aug_noise_level: False\n  condition_on_text: True\n  auto_normalize_img: True\n  p2_loss_weight_gamma: 0.5\n  dynamic_thresholding: True,\n  dynamic_thresholding_percentile: 0.95\n  only_train_unet_number: 1 \n  use_recompute: False\n  recompute_granularity:\n\nData:\n  Train:\n    dataset:\n      name: ImagenDataset\n      input_path: ./projects/imagen/filelist/laion_400M/train\n      shuffle: True\n      image_format: base64 \n      image_size: 64 \n      text_max_len: 128 \n      filter_image_resolution: 64\n    loader:\n      num_workers: 8\n      shuffle: True\n      batch_size: 8 \n      drop_last: True\n      collate_fn: imagen_collate_fn\n\nLoss:\n  name: mse_loss\n  p2_loss_weight_k: 1.0\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml",
    "content": "_base_: ./imagen_base.yaml\n\nGlobal:\n  global_batch_size:\n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nModel:\n  name: imagen_text2im_64\n  text_encoder_name: projects/imagen/t5/t5-11b\n  text_embed_dim: 1024 \n  timesteps: 1000 \n  channels: 3\n  cond_drop_prob: 0.1\n  noise_schedules: cosine\n  pred_objectives: noise\n  lowres_noise_schedule: linear\n  lowres_sample_noise_level: 0.2\n  per_sample_random_aug_noise_level: False\n  condition_on_text: True\n  auto_normalize_img: True\n  p2_loss_weight_gamma: 0.5\n  dynamic_thresholding: True,\n  dynamic_thresholding_percentile: 0.95\n  only_train_unet_number: 1 \n  use_recompute: True \n  recompute_granularity:\n\nData:\n  Train:\n    dataset:\n      name: ImagenDataset\n      input_path: ./projects/imagen/filelist/laion_400M/train\n      shuffle: True\n      image_format: base64 \n      image_size: 64 \n      text_max_len: 128 \n      filter_image_resolution: 64\n    loader:\n      num_workers: 8\n      shuffle: True\n      batch_size: 8 \n      drop_last: True\n      collate_fn: imagen_collate_fn\n\nLoss:\n  name: mse_loss\n  p2_loss_weight_k: 1.0\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml",
    "content": "_base_: ./finetune_ernie_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 40000\n  hidden_size: 1024\n  num_hidden_layers: 24\n  num_attention_heads: 16\n  intermediate_size: \n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 4\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: True\n  use_recompute: False\n\n\nDistributed:\n  dp_degree: \n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/auto/finetune_ernie_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n  binary_head: True\n  \n  global_batch_size: \n  local_batch_size: 16\n  micro_batch_size: 16\n\n\nEngine:\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps: 1\n  logging_freq: 1\n  eval_freq: 500000\n  eval_iters: 10\n  test_iters: -1\n  mix_precision:\n    level: \n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 50000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  module: \"ErnieSeqClsModuleAuto\"\n  name: \"Ernie\"\n  hidden_size: 768\n  num_hidden_layers: 12\n  num_attention_heads: 12\n  intermediate_size: 3072\n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 2\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: False\n  use_recompute: False  \n\n\nData:\n  Train:\n    collate_fn: \n      name: DataCollatorWithPadding\n    dataset:\n      name: ErnieSeqClsDataset\n      dataset_type: chnsenticorp_v2\n      tokenizer_type: ernie-1.0-base-zh-cw\n      max_seq_len: 512\n\n  Eval:\n    collate_fn: \n      name: DataCollatorWithPadding\n    dataset:\n      name: ErnieSeqClsDataset\n      dataset_type: chnsenticorp_v2\n      tokenizer_type: ernie-1.0-base-zh-cw\n      max_seq_len: 512\n\n\nOptimizer:\n  name: AdamW\n  weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 990000\n    warmup_rate: 0.01\n    max_lr: 0.0001\n    min_lr: 5e-05\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n  binary_head: True\n  \n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nEngine:\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps: 1\n  logging_freq: 1\n  eval_freq: 500000\n  eval_iters: 10\n  test_iters: -1\n  mix_precision:\n    enable: False\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 50000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  module: \"ErnieModuleAuto\"\n  name: \"Ernie\"\n  hidden_size: 768\n  num_hidden_layers: 12\n  num_attention_heads: 12\n  intermediate_size: 3072\n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 2\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: False\n  use_recompute: False  \n\n\nData:\n  Train:\n    sample_split: 4\n    collate_fn: \n      name: ErnieCollateData\n      micro_batch_size: \n    dataset:\n      name: ErnieDataset\n      input_dir: ./data\n      tokenizer_type: ernie-1.0-base-zh-cw\n      split: [949, 50, 1]\n      mode: Train\n      max_seq_length: 512\n      masked_lm_prob: 0.15\n      short_seq_prob: 0.1\n      seed: 1024\n      share_folder: False\n      favor_longer_ngram: False\n      max_ngrams: 3\n\n  Eval:\n    sample_split: 4\n    collate_fn: \n      name: ErnieCollateData\n      micro_batch_size: 1\n    dataset:\n      name: ErnieDataset\n      input_dir: ./data\n      tokenizer_type: ernie-1.0-base-zh-cw\n      split: [949, 50, 1]\n      mode: Eval\n      max_seq_length: 512\n      masked_lm_prob: 0.15\n      short_seq_prob: 0.1\n      seed: 1024\n      share_folder: False\n      favor_longer_ngram: False\n      max_ngrams: 3\n\n\nOptimizer:\n  name: AdamW\n  
weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 990000\n    warmup_rate: 0.01\n    max_lr: 0.0001\n    min_lr: 0.00001\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base_345M_single_card.yaml",
    "content": "_base_: ./pretrain_ernie_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 40000\n  hidden_size: 1024\n  num_hidden_layers: 24\n  num_attention_heads: 16\n  intermediate_size: \n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 4\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: True\n  use_recompute: False\n\n\nData:\n  Train:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n  Eval:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml",
    "content": "_base_: ./finetune_ernie_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 40000\n  hidden_size: 1024\n  num_hidden_layers: 24\n  num_attention_heads: 16\n  intermediate_size: \n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 4\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: True\n  use_recompute: False\n\n\nDistributed:\n  dp_degree: \n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/finetune_ernie_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n  binary_head: True\n  \n  global_batch_size: \n  local_batch_size: 16\n  micro_batch_size: 16\n\n\nEngine:\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps: 1\n  logging_freq: 1\n  eval_freq: 500000\n  eval_iters: 10\n  test_iters: -1\n  mix_precision:\n    enable: False\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 50000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  module: \"ErnieSeqClsModule\"\n  name: \"Ernie\"\n  hidden_size: 768\n  num_hidden_layers: 12\n  num_attention_heads: 12\n  intermediate_size: 3072\n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 2\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: False\n  use_recompute: False  \n\n\nData:\n  Train:\n    dataset:\n      name: ErnieSeqClsDataset\n      dataset_type: chnsenticorp_v2\n      tokenizer_type: ernie-1.0-base-zh-cw\n      max_seq_len: 512\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 0\n      return_list: False\n      collate_fn: \n        name: DataCollatorWithPadding\n  \n  Eval:\n    dataset:\n      name: ErnieSeqClsDataset\n      dataset_type: chnsenticorp_v2\n      tokenizer_type: ernie-1.0-base-zh-cw\n      max_seq_len: 512\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 0\n      return_list: False\n      collate_fn: \n        name: DataCollatorWithPadding\n\n\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 990000\n 
   warmup_rate: 0.01\n    max_lr: 5e-05\n    min_lr: 1e-05\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n  tensor_fusion: False\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/inference_ernie_345M_single_card.yaml",
    "content": "_base_: ./finetune_ernie_345M_single_card.yaml\n\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n\n\nDistributed:\n  dp_degree: \n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/pretrain_ernie_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n  binary_head: True\n  \n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nEngine:\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps: 1\n  logging_freq: 1\n  eval_freq: 500000\n  eval_iters: 10\n  test_iters: -1\n  mix_precision:\n    enable: False\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 50000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  module: \"ErnieModule\"\n  name: \"Ernie\"\n  hidden_size: 768\n  num_hidden_layers: 12\n  num_attention_heads: 12\n  intermediate_size: 3072\n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 2\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: False\n  use_recompute: False  \n\n\nData:\n  Train:\n    dataset:\n      name: ErnieDataset\n      input_dir: ./data\n      tokenizer_type: ernie-1.0-base-zh-cw\n      split: [949, 50, 1]\n      mode: Train\n      max_seq_length: 512\n      masked_lm_prob: 0.15\n      short_seq_prob: 0.1\n      seed: 1024\n      share_folder: False\n      favor_longer_ngram: False\n      max_ngrams: 3\n\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 0\n      return_list: False\n      collate_fn: \n        name: ErnieCollateData\n        micro_batch_size: \n  \n  Eval:\n    dataset:\n      name: ErnieDataset\n      input_dir: ./data\n      tokenizer_type: ernie-1.0-base-zh-cw\n      split: [949, 50, 1]\n      mode: Eval\n      max_seq_length: 512\n      masked_lm_prob: 0.15\n      short_seq_prob: 0.1\n      seed: 1024\n      share_folder: False\n      favor_longer_ngram: False\n      max_ngrams: 3\n\n    
sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: \n        name: ErnieCollateData\n        micro_batch_size: 1\n\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 990000\n    warmup_rate: 0.01\n    max_lr: 0.0001\n    min_lr: 0.00001\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n  tensor_fusion: False\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml",
    "content": "_base_: ./pretrain_ernie_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 512\n  micro_batch_size: 1\n\n\nModel:\n  vocab_size: 40000\n  hidden_size: 12288\n  num_hidden_layers: 96\n  num_attention_heads: 96\n  intermediate_size: \n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 4\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: True\n  use_recompute: True\n\n\nData:\n  Train:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n  Eval:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 8\n  pp_degree: 16\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/pretrain_ernie_base_345M_single_card.yaml",
    "content": "_base_: ./pretrain_ernie_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 40000\n  hidden_size: 1024\n  num_hidden_layers: 24\n  num_attention_heads: 16\n  intermediate_size: \n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 4\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: True\n  use_recompute: False\n\n\nData:\n  Train:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n  Eval:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml",
    "content": "_base_: ./pretrain_ernie_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 1\n\n\nModel:\n  vocab_size: 40000\n  hidden_size: 768\n  num_hidden_layers: 8\n  num_attention_heads: 16\n  intermediate_size: \n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 4\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: True\n  use_recompute: False\n\nData:\n  Train:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n  Eval:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n\n\nDistributed:\n  dp_degree: 2\n  mp_degree: 2\n  pp_degree: 2\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/pretrain_ernie_base_6.7B_sharding16.yaml",
    "content": "_base_: ./pretrain_ernie_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 512\n  micro_batch_size: 1\n\n\nModel:\n  vocab_size: 40000\n  hidden_size: 4096\n  num_hidden_layers: 32\n  num_attention_heads: 32\n  intermediate_size: \n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 4\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: True\n  use_recompute: True\n\nData:\n  Train:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n  Eval:\n    dataset:\n      tokenizer_type: ernie-1.0-base-zh-cw\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 8\n  pp_degree: 16\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml",
    "content": "_base_: ./pretrain_ernie_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 18000\n  hidden_size: 1024\n  num_hidden_layers: 24\n  num_attention_heads: 16\n  intermediate_size: 3072\n  hidden_act: \"relu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 2\n  initializer_range: 0.02\n  pad_token_id: 0\n  use_recompute: False\n\n\nData:\n  Train:\n    dataset:\n      tokenizer_type: ernie-1.0-large-zh-cw\n  Eval:\n    dataset:\n      tokenizer_type: ernie-1.0-large-zh-cw\n\n\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/ernie/qat_ernie_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n  binary_head: True\n  \n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nEngine:\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps: 1\n  logging_freq: 1\n  eval_freq: 500000\n  eval_iters: 10\n  test_iters: -1\n  mix_precision:\n    enable: False\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 50000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  module: \"ErnieModule\"\n  name: \"Ernie\"\n  hidden_size: 768\n  num_hidden_layers: 12\n  num_attention_heads: 12\n  intermediate_size: 3072\n  hidden_act: \"gelu\"\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 512\n  type_vocab_size: 2\n  initializer_range: 0.02\n  pad_token_id: 0\n  task_type_vocab_size: 3\n  task_id: 0\n  use_task_id: False\n  use_recompute: False  \n\n\nData:\n  Train:\n    dataset:\n      name: ErnieDataset\n      input_dir: ./data\n      tokenizer_type: ernie-1.0-base-zh-cw\n      split: [949, 50, 1]\n      mode: Train\n      max_seq_length: 512\n      masked_lm_prob: 0.15\n      short_seq_prob: 0.1\n      seed: 1024\n      share_folder: False\n      favor_longer_ngram: False\n      max_ngrams: 3\n\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 0\n      return_list: False\n      collate_fn: \n        name: ErnieCollateData\n        micro_batch_size: \n  \n  Eval:\n    dataset:\n      name: ErnieDataset\n      input_dir: ./data\n      tokenizer_type: ernie-1.0-base-zh-cw\n      split: [949, 50, 1]\n      mode: Eval\n      max_seq_length: 512\n      masked_lm_prob: 0.15\n      short_seq_prob: 0.1\n      seed: 1024\n      share_folder: False\n      favor_longer_ngram: False\n      max_ngrams: 3\n\n    
sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: \n        name: ErnieCollateData\n        micro_batch_size: 1\n\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 990000\n    warmup_rate: 0.01\n    max_lr: 0.0001\n    min_lr: 0.00001\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n  tensor_fusion: False\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/export_gpt_fp16_single_card.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\nEngine:\n  max_steps: -1\n  num_train_epochs: -1\n  eval_freq: -1\n  eval_iters: -1\n  test_iters: -1\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"o2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\", \"where\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    use_fp16_guard: False\n  save_load:\n    output_dir:\n    ckpt_dir:\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/generation_gpt_175B_mp8.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\n\nEngine:\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"o2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\", \"where\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    use_fp16_guard: False\n\n\nGeneration:\n  top_k: 1\n  top_p: 0.9\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 8\n  use_topp_sampling: True\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n  use_topp_sampling: True\n  early_finish: True\n\n\nModel:\n  module: GPTGenerationModuleAuto\n  vocab_size: 51200\n  hidden_size: 12288\n  num_layers: 96\n  num_attention_heads: 96\n  ffn_hidden_size: 49152\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 1\n  initializer_range: 0.02\n  use_recompute: False\n  fuse_attn_qkv: True\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 8\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_mp2.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\n\nEngine:\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"o2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\", \"where\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    use_fp16_guard: False\n\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n  use_topp_sampling: True\n  early_finish: True\n\n\nModel:\n  module: GPTGenerationModuleAuto\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  fuse_attn_qkv: True\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 2\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\n\nEngine:\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"o2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\", \"where\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    use_fp16_guard: False\n\n\nGeneration:\n  top_k: 0\n  top_p: 0.9\n  use_topp_sampling: True\n  inference: True\n  temperature: 1.0\n  min_dec_len: 8\n  max_dec_len: 8\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n  early_finish: True\n\n\nModel:\n  module: GPTGenerationModuleAuto\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  fuse_attn_qkv: True\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/generation_gpt_6.7B_mp1.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\n\nEngine:\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"o2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\", \"where\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    use_fp16_guard: False\n\n\nGeneration:\n  top_k: 0\n  top_p: 0.9\n  use_topp_sampling: True\n  inference: True\n  temperature: 1.0\n  min_dec_len: 8\n  max_dec_len: 8\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n  early_finish: True\n\n\nModel:\n  module: GPTGenerationModuleAuto\n  vocab_size: 51200\n  hidden_size: 4096\n  num_layers: 32\n  num_attention_heads: 32\n  ffn_hidden_size: 16384\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  fuse_attn_qkv: True\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 2048\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: \n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  fuse_attn_qkv: True\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n\n\nDistributed:\n  dp_degree: 8\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8_tuning.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 2048\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: \n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  fuse_attn_qkv: True\n  use_recompute: True\n  recompute_granularity: \"full_attn\"\n  no_recompute_layers:\n\n\nDistributed:\n  dp_degree: 8\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n\n\nTuning:\n  enable: True\n  tuning_recompute: True\n  profile_start_step: 1\n  profile_end_step: 5\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 2048\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: \n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  fuse_attn_qkv: True\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  fuse_attn_qkv: True\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 4096\n  num_layers: 32\n  num_attention_heads: 32\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  fuse_attn_qkv: True\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n\n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 16\n    sharding_stage: 2\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nEngine:\n  max_steps: 500000\n  num_train_epochs: 1\n  eval_freq: 1\n  eval_iters: 10\n  test_iters:\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"o2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    use_fp16_guard: False\n  save_load:\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  module: \"GPTModuleAuto\"\n  name: \"GPT\"\n  fuse_attn_qkv: False\n\n\nData:\n  Train:\n    collate_fn: gpt_collate_fn\n    sample_split: 2\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n\n  Eval:\n    collate_fn: gpt_collate_fn\n    sample_split: 2\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n\n\nOptimizer:\n  name: AdamW\n  weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 360000\n    warmup_rate: 0.01\n    max_lr: 5.0e-5\n    min_lr: 1.0e-5\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/auto/qat_generation_gpt_345M_mp2.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\n\nEngine:\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"o2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\", \"where\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    use_fp16_guard: False\n\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n\n\nModel:\n  module: GPTGenerationModuleAuto\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  fuse_attn_qkv: True\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 2\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n\n\nQuantization:\n  enable: True\n  channel_wise_abs_max: False\n  weight_bits: 8\n  activation_bits: 8\n  onnx_format: True\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/eval_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_345M_single_card.yaml\n\n\nModel:\n  module: GPTEvalModule\n\n\nOffline_Eval:\n  eval_path: ./wikitext-103/wiki.valid.tokens\n  cloze_eval: False\n  overlapping_eval: 32\n  batch_size: 8\n  max_seq_len: 1024\n  logging_freq: 10\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/eval_pruned_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_345M_single_card.yaml\n\n\nEngine:\n  save_load:\n    ckpt_dir:\n\n\nModel:\n  module: GPTEvalModule\n  hidden_dropout_prob: 0.0\n  attention_probs_dropout_prob: 0.0\n\n\nCompress:\n  Prune:\n    enable: True\n    criterion: l1_norm\n    ratio: 0.125\n\n\nOffline_Eval:\n  eval_path: ./lambada_test.jsonl\n  cloze_eval: True\n  overlapping_eval: 32\n  batch_size: 8\n  max_seq_len: 1024\n  logging_freq: 10\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/eval_qat_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_345M_single_card.yaml\n\n\nModel:\n  module: GPTEvalModule\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n    skip_tensor_map: \n      block_3: ['linear2']\n      block_5: ['linear1']\n      block_6: ['linear2']\n      block_7: ['linear2']\n      block_10: ['linear2']\n      block_20: ['linear2']\n      block_21: ['linear2']\n\nOffline_Eval:\n  eval_path: ./wikitext-103/wiki.valid.tokens\n  cloze_eval: False\n  overlapping_eval: 32\n  batch_size: 8\n  max_seq_len: 1024\n  logging_freq: 10\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/export_qat_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  fused_linear: True\n  \n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml",
    "content": "_base_: ./finetune_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 32\n  micro_batch_size: 32\n  \n\nEngine:\n  run_mode: epoch\n  num_train_epochs: 3\n  accumulate_steps:\n  logging_freq: 10\n  eval_freq: 1\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\", \"reduce_mean\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  module: \"GPTFinetuneModule\"\n  name: \"GPT\"\n  num_classes: 2\n  pretrained: './ckpt/PaddleFleetX_GPT_345M_220826/model'\n  fuse_attn_qkv: True\n  fused_linear: False\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  \n  loss:\n    train:\n      name: 'CrossEntropyLoss'\n    eval:\n      name: 'CrossEntropyLoss'\n  \n  metric:\n    eval:\n      name: 'Accuracy'\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n    \nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.0\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1e-6\n  multi_precision: True\n  lr:\n    name: LinearDecayWithWarmup\n    warmup: 0.1\n    learning_rate: 2e-5\n  tensor_fusion: False\n    \n    \nData:\n  Train:\n    dataset:\n      name: SST2\n      root: ./dataset/SST-2/\n      split: 'train'\n      max_length: 128\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32\n      shuffle: True\n      drop_last: True\n    loader:\n      num_workers: 4\n      return_list: False\n  \n  Eval:\n    dataset:\n     
 name: SST2\n      root: ./dataset/SST-2/\n      split: 'dev'\n      max_length: 128\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32\n      shuffle: False\n      drop_last: False\n    loader:\n      num_workers: 4\n      return_list: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/finetune_gpt_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 42\n\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n  \nEngine:\n  run_mode: epoch\n  max_steps: -1\n  eval_freq: 1\n  eval_iters: -1\n  test_iters: -1\n  save_load:\n    save_steps: -1\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\nModel:\n  use_flash_attn: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/generation_gpt_345M_dp8.yaml",
    "content": "_base_: ./pretrain_gpt_345M_single_card.yaml\n\nModel:\n  module: GPTGenerationModule\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n\nDistributed:\n  dp_degree: \n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/generation_gpt_345M_mp1.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\n\nEngine:\n  mix_precision:\n    level:\n\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n\n\nModel:\n  module: GPTGenerationModuleAuto\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  fuse_attn_qkv: True\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/generation_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_345M_single_card.yaml\n\nModel:\n  module: GPTGenerationModule\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/generation_gpt_6.7B_single_mp1.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\n\nEngine:\n  mix_precision:\n    level: \"o2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\", \"where\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    use_fp16_guard: False\n\n\nGeneration:\n  top_k: 0\n  top_p: 0.9\n  use_topp_sampling: True\n  inference: True\n  temperature: 1.0\n  min_dec_len: 8\n  max_dec_len: 8\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n\n\nModel:\n  module: GPTGenerationModuleAuto\n  vocab_size: 51200\n  hidden_size: 4096\n  num_layers: 32\n  num_attention_heads: 32\n  ffn_hidden_size: 16384\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  fuse_attn_qkv: True\n\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/generation_pruned_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_345M_single_card.yaml\n\nModel:\n  module: GPTGenerationModule\n\nCompress:\n  Prune:\n    enable: True\n    criterion: l1_norm\n    ratio: 0.125\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/generation_qat_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_345M_single_card.yaml\n\nModel:\n  module: GPTGenerationModule\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n  use_topp_sampling: True\n  inference: True\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/generation_qat_gpt_6.7B_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_6.7B_single_card.yaml\n\nModel:\n  module: GPTGenerationModule\n\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n  use_topp_sampling: True\n  inference: True\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/inference_gpt_345M_dp8.yaml",
    "content": "_base_: ./generation_gpt_345M_dp8.yaml\n\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n\n\nDistributed:\n  dp_degree: \n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nData:\n  Test:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/inference_gpt_345M_single_card.yaml",
    "content": "_base_: ./generation_gpt_345M_single_card.yaml\n\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n\n\nDistributed:\n  dp_degree: \n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nData:\n  Test:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 2048\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: \n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 8\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 2048\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: \n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/pretrain_gpt_13B_dp8.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  seed: 1234\n\n  global_batch_size: 480\n  local_batch_size: \n  micro_batch_size: 4\n\n\nEngine:\n  max_steps: 200000\n  eval_freq: 1000\n  eval_iters: 10\n  save_load:\n    save_steps: 500\n\n\nModel:\n  vocab_size: 50432\n  hidden_size: 5120\n  num_layers: 40\n  num_attention_heads: 40\n  ffn_hidden_size: \n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 4096\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity: 'full'\n  no_recompute_layers:\n\n\nData:\n  Train:\n    dataset:\n      max_seq_len: 4096\n  \n  Eval:\n    dataset:\n      max_seq_len: 4096\n\n\nDistributed:\n  dp_degree:\n  mp_degree: 2\n  pp_degree: 8\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nOptimizer:\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 160000\n    warmup_rate: 0.001\n    max_lr: 1.0e-4\n    min_lr: 1.0e-5\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/pretrain_gpt_175B_mp8_pp16.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 1536\n  micro_batch_size: 1\n\n\nModel:\n  vocab_size: 51200\n  hidden_size: 12288\n  num_layers: 96\n  num_attention_heads: 96\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity: 'core_attn'\n  no_recompute_layers:\n  virtual_pp_degree: 1\n  sequence_parallel: True\n  fused_linear: True\n  \n\nDistributed:\n  dp_degree:\n  mp_degree: 8\n  pp_degree: 16\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nEngine:\n  logging_freq: 10\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 4096\n  num_layers: 32\n  num_attention_heads: 32\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  fused_linear: True\n\n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 16\n    sharding_stage: 2\n    sharding_offload: False\n    reduce_overlap: True\n    broadcast_overlap: True\n\n\nOptimizer:\n  tensor_fusion: True\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 32\n  num_attention_heads: 32\n  ffn_hidden_size: 16384\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nEngine:\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps:\n  logging_freq: 1\n  eval_freq: 500\n  eval_iters: 10\n  test_iters:\n  mix_precision:\n    enable: True\n    dtype: \"float16\"\n    level: \"O2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 1000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\n\nModel:\n  module: \"GPTModule\"\n  name: \"GPT\"\n  vocab_size_divisible_unit: 128\n  fused_linear: False\n  fuse_attn_qkv: True\n  scale_qk_by_layer_num: True\n  sequence_parallel: False\n  use_flash_attn: False\n  fused_softmax_with_triangular: True\n\n\nData:\n  Train:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [969, 30, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n  \n  Eval:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [969, 30, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n\n\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 360000\n    warmup_rate: 0.01\n    max_lr: 5.0e-5\n    min_lr: 1.0e-5\n    use_increments: True\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n  tensor_fusion: False\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: 
False\n\n\nDistributed:\n  fuse_sequence_parallel_allreduce: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/pretrain_gpt_cn_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  name: \"GPT-cn\"\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/prune_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\nEngine:\n  save_load:\n    save_steps: 1000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: 4096\n  hidden_dropout_prob: 0.0\n  attention_probs_dropout_prob: 0.0\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    comm_overlap: False\n\n\nOptimizer:\n  weight_decay: 0.0\n  lr:\n    decay_steps: 90000\n    warmup_rate: 0.00\n    max_lr: 2.5e-5\n    min_lr: 5.0e-6\n    \n\nCompress:\n  pretrained:\n  Prune:\n    enable: True\n    criterion: l1_norm\n    ratio: 0.125\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/qat_gpt_345M_mp8.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 1\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  fused_linear: True\n  \n\nDistributed:\n  dp_degree:\n  mp_degree: 8\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n    freeze_embedding: True\n    skip_tensor_map: \n      block_3: ['linear2']\n      block_5: ['linear1']\n      block_6: ['linear2']\n      block_7: ['linear2']\n      block_10: ['linear2']\n      block_20: ['linear2']\n      block_21: ['linear2']\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/qat_gpt_345M_single_card.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: 8\n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 1024\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: False\n  recompute_granularity:\n  fused_linear: True\n  \n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n    freeze_embedding: True\n    skip_tensor_map: \n      block_3: ['linear2']\n      block_5: ['linear1']\n      block_6: ['linear2']\n      block_7: ['linear2']\n      block_10: ['linear2']\n      block_20: ['linear2']\n      block_21: ['linear2']\n"
  },
  {
    "path": "ppfleetx/configs/nlp/gpt/qat_gpt_6.7B_sharding16.yaml",
    "content": "_base_: ./pretrain_gpt_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nEngine:\n  logging_freq: 10\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 4096\n  num_layers: 32\n  num_attention_heads: 32\n  ffn_hidden_size:\n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  fused_linear: True\n\n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 16\n    sharding_stage: 2\n    sharding_offload: False\n    reduce_overlap: True\n    broadcast_overlap: True\n\n\nOptimizer:\n  tensor_fusion: True\n\n\nCompress:\n  pretrained:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']\n    onnx_format: True\n"
  },
  {
    "path": "ppfleetx/configs/nlp/moe/pretrain_moe_1.3B_dp8.yaml",
    "content": "_base_: ./pretrain_moe_base.yaml\n\nGlobal:\n  global_batch_size: \n  local_batch_size: 8\n  micro_batch_size: 8\n\n\nModel:\n  vocab_size: 50304\n  hidden_size: 2048\n  num_layers: 24\n  num_attention_heads: 16\n  ffn_hidden_size: \n  hidden_dropout_prob: 0.1\n  attention_probs_dropout_prob: 0.1\n  max_position_embeddings: 1024\n  type_vocab_size: 16\n  initializer_range: 0.02\n  use_recompute: True\n  recompute_granularity:\n  no_recompute_layers:\n  \n\nDistributed:\n  dp_degree: 8\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n"
  },
  {
    "path": "ppfleetx/configs/nlp/moe/pretrain_moe_base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 1024\n\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\n\nEngine:\n  max_steps: 500000\n  num_train_epochs: 1\n  accumulate_steps:\n  logging_freq: 1\n  eval_freq: 500\n  eval_iters: 10\n  test_iters:\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n    custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n  save_load:\n    save_steps: 1000\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n  balance_loss_weight: 1.0\n\n\nModel:\n  module: \"MoEModule\"\n  name: \"MoE\"\n  fused_linear: False\n  fuse_attn_qkv: True\n  sequence_parallel: False\n  moe_configs:\n    expert_mode: True\n    gate: gshard\n    top_k: 2\n    num_experts: 2\n\n\nData:\n  Train:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n  \n  Eval:\n    dataset:\n      name: GPTDataset\n      input_dir: ./data/\n      split: [949, 50, 1]\n      max_seq_len: 1024\n    sampler:\n      name: GPTBatchSampler\n      shuffle: False\n      drop_last: True\n    loader:\n      num_workers: 1\n      return_list: False\n      collate_fn: gpt_collate_fn\n\n\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.01\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: CosineAnnealingWithWarmupDecay\n    decay_steps: 360000\n    warmup_rate: 0.01\n    max_lr: 5.0e-5\n    min_lr: 1.0e-5\n  grad_clip:\n    name: \"ClipGradForMOEByGlobalNorm\"\n    clip_norm: 1.0\n  tensor_fusion: False\n\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\nDistributed:\n  dp_degree: 1\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    
sharding_degree: 1\n    sharding_stage: 1\n  hcg: HybridCommGroupForMoE\n"
  },
  {
    "path": "ppfleetx/configs/vis/base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 2021\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n  flags:\n      FLAGS_enable_cublas_tensor_op_math: True\n      FLAGS_gemm_use_half_precision_compute_type: False\n\nEngine:\n  run_mode: epoch\n  max_steps: -1\n  eval_freq: 1\n  eval_iters: -1\n  test_iters: -1\n  save_load:\n    save_steps: -1\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n    \nModel:\n  use_recompute: False\n\nFused:\n  tensor_fusion: False\n\nProfiler:\n  enable: False\n  scheduler: [1, 5]\n  profiler_log: profiler_log\n  detailed: False\n\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n"
  },
  {
    "path": "ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2022\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 100\n  eval_freq: 1\n  eval_iters: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: False\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"MOCOClsModule\"\n  model:\n    base_encoder:\n      name: \"resnet50\"\n      with_pool: False\n      num_classes: 0 # remove last classifier   \n      #pretrained: ./pretrained/mocov1/model\n      pretrained: ./pretrained/mocov2/model\n    base_classifier:\n      name: \"MoCoClassifier\"\n      with_pool: True\n      num_features: 2048\n      num_classes: 1000\n\n  loss:\n    train:\n      name: 'CELoss'\n    eval:\n      name: 'CELoss'\n  metric:\n    train:\n      name: 'TopkAcc'\n      topk: [1, 5]\n    eval:\n      name: 'TopkAcc'\n      topk: [1, 5]\n\n\nOptimizer:\n  name: Momentum\n  momentum: 0.9\n  weight_decay: 0.0\n  lr:\n    name: MultiStepDecay\n    run_mode: epoch\n    learning_rate: 30.0\n    gamma: 0.1\n    milestones: [60, 80]\n\n\nData:\n  Train:\n    dataset:\n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      class_num: 1000\n      cls_label_path: ./dataset/ILSVRC2012/train_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - RandCropImage:\n            size: 224\n            interpolation: bilinear\n            backend: pil\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32 # total bachsize 256\n      drop_last: True\n      
shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n  Eval:\n    dataset: \n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      cls_label_path: ./dataset/ILSVRC2012/val_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - ResizeImage:\n            resize_short: 256\n            interpolation: bilinear\n            backend: pil\n        - CenterCropImage:\n            size: 224\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n        \n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 64\n      drop_last: False\n      shuffle: False\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2022\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 200\n  eval_freq: -1\n  eval_iters: 0\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: False\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"MOCOModule\"\n  model:\n    base_encoder:\n      name: \"resnet50\"\n      with_pool: False\n      num_classes: 0 # remove last classifier\n    base_classifier:\n      name: \"MoCoClassifier\"\n      with_pool: True\n      num_features: 2048\n      num_classes: 128\n    momentum_encoder:\n      name: \"resnet50\"\n      with_pool: False\n      num_classes: 0 # remove last classifier\n    momentum_classifier:\n      name: \"MoCoClassifier\"\n      with_pool: True\n      num_features: 2048\n      num_classes: 128\n  loss:\n    train:\n      name: 'CELoss'\n\n\nOptimizer:\n  name: Momentum\n  momentum: 0.9\n  weight_decay: 0.0001\n  lr:\n    name: MultiStepDecay\n    run_mode: epoch\n    learning_rate: 0.03\n    gamma: 0.1\n    milestones: [120, 160]\n\n\nData:\n  Train:\n    dataset:\n      name: ContrativeLearningDataset\n      root: ./dataset/ILSVRC2012/train\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - RandCropImage:\n            size: 224\n            scale: [0.2, 1.0]\n            interpolation: bicubic\n            backend: pil\n        - RandomGrayscale:\n            p: 0.2\n        - ColorJitter:\n            brightness: 0.4\n            contrast: 0.4\n            saturation: 0.4\n            hue: 0.4\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - 
ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32 # total batchsize 256\n      drop_last: True\n      shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2022\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 200\n  eval_freq: -1\n  eval_iters: 0\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: False\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"MOCOModule\"\n  model:\n    T: 0.2\n    base_encoder:\n      name: \"resnet50\"\n      with_pool: False\n      num_classes: 0 # remove last classifier\n    base_projector:\n      name: \"MoCoV2Projector\"\n      in_dim: 2048\n      out_dim: 2048\n      with_pool: True\n    base_classifier:\n      name: \"MoCoClassifier\"\n      with_pool: False\n      num_features: 2048\n      num_classes: 128\n    momentum_encoder:\n      name: \"resnet50\"\n      with_pool: False\n      num_classes: 0 # remove last classifier\n    momentum_projector:\n      name: \"MoCoV2Projector\"\n      in_dim: 2048\n      out_dim: 2048\n      with_pool: True\n    momentum_classifier:\n      name: \"MoCoClassifier\"\n      with_pool: False\n      num_features: 2048\n      num_classes: 128\n  loss:\n    train:\n      name: 'CELoss'\n\nOptimizer:\n  name: Momentum\n  momentum: 0.9\n  weight_decay: 0.0001\n  lr:\n    name: CosineDecay\n    run_mode: epoch\n    update_unit: epoch\n    learning_rate: 0.03\n\nData:\n  Train:\n    dataset:\n      name: ContrativeLearningDataset\n      root: ./dataset/ILSVRC2012/train\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - RandCropImage:\n            size: 224\n            scale: [0.2, 1.0]\n            interpolation: bicubic\n            backend: pil\n        - ColorJitter:\n            brightness: 0.4\n            contrast: 0.4\n            saturation: 0.4\n            hue: 0.1\n            p: 0.8\n        - 
RandomGrayscale:\n            p: 0.2\n        - GaussianBlur:\n            sigma: [.1, 2.]\n            p: 0.5\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32 # total batchsize 256\n      drop_last: True\n      shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml",
    "content": "Global:\n  device: gpu\n  seed: 2021\n  global_batch_size:\n  local_batch_size: 1\n  micro_batch_size: 1\n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n    sharding_offload: False\n    reduce_overlap: False\n    broadcast_overlap: False\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 300\n  eval_freq: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir: ./ckpt\n\nModel:\n  use_recompute: False\n  module: \"GeneralClsModule\"\n  model:\n    name: \"ViT_base_patch16_224\"\n    class_num: 1000\n    drop_rate: 0.1\n  loss:\n    train:\n      name: 'ViTCELoss'\n      epsilon: 0.0001\n    eval:\n      name: 'CELoss'\n  metric:\n    train:\n      name: 'TopkAcc'\n      topk: [1, 5]\n    eval:\n      name: 'TopkAcc'\n      topk: [1, 5]\n\nOptimizer:\n  name: AdamW\n  weight_decay: 0.3\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: ViTLRScheduler\n    learning_rate: 0.003\n    decay_type: cosine\n    warmup_steps: 10000\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n\nInference:\n  model_dir: ./output\n  mp_degree: 1\n\n  TensorRT:\n    max_batch_size: 1\n    workspace_size: 1<<30\n    min_subgraph_size: 3\n    precision: fp16\n    use_static: False\n    use_calib_mode: False\n    collect_shape: False\n    shape_range_info_filename: ./shape.pbtxt\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2021\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 300\n  eval_freq: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir: \n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"GeneralClsModule\"\n  model:\n    name: \"ViT_base_patch16_224\"\n    class_num: 1000\n    drop_rate: 0.1\n  loss:\n    train:\n      name: 'ViTCELoss'\n      epsilon: 0.0001\n    eval:\n      name: 'CELoss'\n  metric:\n    train:\n      name: 'TopkAcc'\n      topk: [1, 5]\n    eval:\n      name: 'TopkAcc'\n      topk: [1, 5]\n\nOptimizer:\n  name: AdamW\n  weight_decay: 0.3\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: ViTLRScheduler\n    learning_rate: 0.003\n    decay_type: cosine\n    warmup_steps: 10000\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n\nData:\n  Train:\n    dataset:\n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      class_num: 1000\n      cls_label_path: ./dataset/ILSVRC2012/train_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - RandCropImage:\n            size: 224\n            scale: [0.05, 1.0]\n            interpolation: bicubic\n            backend: pil\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 256\n      drop_last: True\n      shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n  Eval:\n    dataset: \n      name: GeneralClsDataset\n      image_root: 
./dataset/ILSVRC2012/\n      cls_label_path: ./dataset/ILSVRC2012/val_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - ResizeImage:\n            resize_short: 256\n            interpolation: bicubic\n            backend: pil\n        - CenterCropImage:\n            size: 224\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n        \n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 256\n      drop_last: False\n      shuffle: False\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2021\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 8\n  eval_freq: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"GeneralClsModule\"\n  model:\n    name: \"ViT_base_patch16_384\"\n    class_num: 1000\n    drop_rate: 0.1\n    pretrained:\n      prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-224\n      finetune: True\n  loss:\n    train:\n      name: 'CELoss'\n    eval:\n      name: 'CELoss'\n  metric:\n    train:\n      name: 'TopkAcc'\n      topk: [1, 5]\n    eval:\n      name: 'TopkAcc'\n      topk: [1, 5]\n\nOptimizer:\n  name: Momentum\n  weight_decay: 0.0001\n  momentum: 0.9\n  lr:\n    name: ViTLRScheduler\n    learning_rate: 0.004\n    decay_type: cosine\n    warmup_steps: 500\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 0.35\n\n\nData:\n  Train:\n    dataset:\n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      class_num: 1000\n      cls_label_path: ./dataset/ILSVRC2012/train_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - RandCropImage:\n            size: 384\n            scale: [0.05, 1.0]\n            interpolation: bilinear\n            backend: pil\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32 # total batchsize 512\n      drop_last: True\n      shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n  
Eval:\n    dataset: \n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      cls_label_path: ./dataset/ILSVRC2012/val_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - ResizeImage:\n            size: 384\n            interpolation: bilinear\n            backend: pil\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n        \n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 256\n      drop_last: False\n      shuffle: False\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2021\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 103\n  eval_freq: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: False\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"GeneralClsModule\"\n  model:\n    name: \"ViT_base_patch16_384\"\n    class_num: 10\n    drop_rate: 0.1\n    pretrained:\n      prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-224\n      finetune: True\n  loss:\n    train:\n      name: 'CELoss'\n    eval:\n      name: 'CELoss'\n  metric:\n    train:\n      name: 'TopkAcc'\n      topk: [1, 5]\n    eval:\n      name: 'TopkAcc'\n      topk: [1, 5]\n\nOptimizer:\n  name: Momentum\n  weight_decay: 0.0001\n  momentum: 0.9\n  lr:\n    name: ViTLRScheduler\n    learning_rate: 0.004\n    decay_type: cosine\n    warmup_steps: 500\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 0.35\n\n\nData:\n  Train:\n    dataset:\n      name: CIFAR10\n      root: ./dataset/cifar-10-batches-py/\n      mode: train\n      transform_ops:\n        - RandCropImage:\n            size: 384\n            scale: [0.05, 1.0]\n            interpolation: bilinear\n            backend: pil\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 64 # total batchsize 512\n      drop_last: True\n      shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n  Eval:\n    dataset: \n      name: CIFAR10\n      root: ./dataset/cifar-10-batches-py/\n      mode: test\n      transform_ops:\n        - ResizeImage:\n     
       size: 384\n            interpolation: bilinear\n            backend: pil\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n        \n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 64\n      drop_last: False\n      shuffle: False\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\nCompress:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    onnx_format: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2021\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 8\n  eval_freq: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"GeneralClsModule\"\n  model:\n    name: \"ViT_base_patch16_384\"\n    class_num: 1000\n    drop_rate: 0.1\n    pretrained:\n      prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-384\n      finetune: True\n  loss:\n    train:\n      name: 'CELoss'\n    eval:\n      name: 'CELoss'\n  metric:\n    train:\n      name: 'TopkAcc'\n      topk: [1, 5]\n    eval:\n      name: 'TopkAcc'\n      topk: [1, 5]\n\nOptimizer:\n  name: Momentum\n  weight_decay: 0.0001\n  momentum: 0.9\n  lr:\n    name: ViTLRScheduler\n    learning_rate: 0.004\n    decay_type: cosine\n    warmup_steps: 500\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 0.35\n\n\nData:\n  Train:\n    dataset:\n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      class_num: 1000\n      cls_label_path: ./dataset/ILSVRC2012/train_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - RandCropImage:\n            size: 384\n            scale: [0.05, 1.0]\n            interpolation: bilinear\n            backend: pil\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32 # total batchsize 512\n      drop_last: True\n      shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n  
Eval:\n    dataset: \n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      cls_label_path: ./dataset/ILSVRC2012/val_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - ResizeImage:\n            size: 384\n            interpolation: bilinear\n            backend: pil\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n        \n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 256\n      drop_last: False\n      shuffle: False\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n\nCompress:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'channel_wise_abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    onnx_format: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2021\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 8\n  eval_freq: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"GeneralClsModule\"\n  model:\n    name: \"ViT_large_patch16_384\"\n    class_num: 1000\n    drop_rate: 0.1\n    pretrained:\n      prefix_path: ./pretrained/vit/imagenet21k-ViT-L_16\n      finetune: True\n  loss:\n    train:\n      name: 'CELoss'\n    eval:\n      name: 'CELoss'\n  metric:\n    train:\n      name: 'TopkAcc'\n      topk: [1, 5]\n    eval:\n      name: 'TopkAcc'\n      topk: [1, 5]\n\nOptimizer:\n  name: Momentum\n  weight_decay: 0.0001\n  momentum: 0.9\n  lr:\n    name: ViTLRScheduler\n    learning_rate: 0.03\n    decay_type: cosine\n    warmup_steps: 500\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n\n\nData:\n  Train:\n    dataset:\n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      class_num: 1000\n      cls_label_path: ./dataset/ILSVRC2012/train_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - RandCropImage:\n            size: 384\n            scale: [0.05, 1.0]\n            interpolation: bilinear\n            backend: pil\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32 # total batchsize 512\n      drop_last: True\n      shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n  Eval:\n  
  dataset: \n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      cls_label_path: ./dataset/ILSVRC2012/val_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - ResizeImage:\n            size: 384\n            interpolation: bilinear\n            backend: pil\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n        \n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 64\n      drop_last: False\n      shuffle: False\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2021\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 8\n  eval_freq: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"GeneralClsModule\"\n  model:\n    name: \"ViT_large_patch16_384\"\n    class_num: 1000\n    drop_rate: 0.1\n    pretrained:\n      prefix_path: ./pretrained/vit/imagenet21k-ViT-L_16\n      finetune: True\n  loss:\n    train:\n      name: 'CELoss'\n    eval:\n      name: 'CELoss'\n  metric:\n    train:\n      name: 'TopkAcc'\n      topk: [1, 5]\n    eval:\n      name: 'TopkAcc'\n      topk: [1, 5]\n\nOptimizer:\n  name: Momentum\n  weight_decay: 0.0001\n  momentum: 0.9\n  lr:\n    name: ViTLRScheduler\n    learning_rate: 0.03\n    decay_type: cosine\n    warmup_steps: 500\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n\n\nData:\n  Train:\n    dataset:\n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      class_num: 1000\n      cls_label_path: ./dataset/ILSVRC2012/train_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - RandCropImage:\n            size: 384\n            scale: [0.05, 1.0]\n            interpolation: bilinear\n            backend: pil\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32 # total batchsize 512\n      drop_last: True\n      shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n  Eval:\n  
  dataset: \n      name: GeneralClsDataset\n      image_root: ./dataset/ILSVRC2012/\n      cls_label_path: ./dataset/ILSVRC2012/val_list.txt\n      transform_ops:\n        - DecodeImage:\n            to_rgb: True\n            channel_first: False\n        - ResizeImage:\n            size: 384\n            interpolation: bilinear\n            backend: pil\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n        \n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 64\n      drop_last: False\n      shuffle: False\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n\nCompress:\n  Quantization:\n    enable: True\n    weight_quantize_type: 'channel_wise_abs_max'\n    activation_quantize_type: 'moving_average_abs_max'\n    activation_preprocess_type: 'PACT'\n    weight_bits: 8\n    activation_bits: 8\n    onnx_format: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml",
    "content": "_base_: ../base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2021\n\nEngine:\n  run_mode: 'epoch'\n  num_train_epochs: 1\n  eval_freq: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    enable: True\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"GeneralClsModule\"\n  model:\n    name: \"ViT_tiny_patch16_224\"\n    class_num: 10\n    drop_rate: 0.1\n  loss:\n    train:\n      name: 'ViTCELoss'\n      epsilon: 0.0001\n    eval:\n      name: 'CELoss'\n  metric:\n    train:\n      name: 'TopkAcc'\n      topk: [1, 5]\n    eval:\n      name: 'TopkAcc'\n      topk: [1, 5]\n\nOptimizer:\n  name: AdamW\n  weight_decay: 0.3\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: ViTLRScheduler\n    learning_rate: 0.003\n    decay_type: cosine\n    warmup_steps: 10000\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n\nData:\n  Train:\n    dataset:\n      name: CIFAR10\n      root: ./dataset/cifar-10-batches-py/\n      mode: train\n      transform_ops:\n        - RandCropImage:\n            size: 224\n            scale: [0.05, 1.0]\n            interpolation: bicubic\n            backend: pil\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 256\n      drop_last: True\n      shuffle: True\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n\n  Eval:\n    dataset: \n      name: CIFAR10\n      root: ./dataset/cifar-10-batches-py/\n      mode: test\n      transform_ops:\n        - ResizeImage:\n            resize_short: 256\n            interpolation: bicubic\n            
backend: pil\n        - CenterCropImage:\n            size: 224\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n        \n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 256\n      drop_last: False\n      shuffle: False\n    loader:\n      num_workers: 8\n      use_shared_memory: True\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/auto/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml",
    "content": "_base_: ./base.yaml\n\nGlobal:\n  device: gpu\n  seed: 2021\n  local_batch_size: 256\n  micro_batch_size: 256\n\nEngine:\n  num_train_epochs: 1\n  eval_freq: 1\n  accumulate_steps: 1\n  logging_freq: 10\n  mix_precision:\n    level: \"o2\"\n    scale_loss: 32768.0\n    custom_black_list: [\"reduce_sum\", \"elementwise_div\"]\n    custom_white_list: []\n  save_load:\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n\nModel:\n  module: \"GeneralClsModuleAuto\"\n  model:\n    name: \"ViT_tiny_patch16_224\"\n    class_num: 10\n    drop_rate: 0.1\n  loss:\n    name: 'ViTCELoss'\n  metric:\n    name: 'TopkAcc'\n    topk: [1, 5]\n\nOptimizer:\n  name: AdamW\n  weight_decay: 0.3\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1.0e-8\n  lr:\n    name: ViTLRScheduler\n    learning_rate: 0.003\n    decay_type: cosine\n    warmup_steps: 10000\n  grad_clip:\n    name: \"ClipGradByGlobalNorm\"\n    clip_norm: 1.0\n\nData:\n  Train:\n    sample_split: 1\n    dataset:\n      name: CIFAR10\n      root: ./dataset/cifar-10-batches-py/\n      mode: train\n      transform_ops:\n        - RandCropImage:\n            size: 224\n            scale: [0.05, 1.0]\n            interpolation: bicubic\n            backend: pil\n        - RandFlipImage:\n            flip_code: 1\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n\n  Eval:\n    sample_split: 1\n    dataset: \n      name: CIFAR10\n      root: ./dataset/cifar-10-batches-py/\n      mode: test\n      transform_ops:\n        - ResizeImage:\n            resize_short: 256\n            interpolation: bicubic\n            backend: pil\n        - CenterCropImage:\n            size: 224\n        - NormalizeImage:\n            scale: 1.0/255.0\n            mean: [0.5, 0.5, 0.5]\n            std: [0.5, 0.5, 0.5]\n            order: ''\n        - ToCHWImage:\n"
  },
  {
    "path": "ppfleetx/configs/vis/vit/auto/base.yaml",
    "content": "Global:\n  device: gpu\n  seed: 2021\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n\nEngine:\n  run_mode: epoch\n  max_steps: -1\n  eval_freq: 1\n  eval_iters: -1\n  test_iters: -1\n  save_load:\n    save_steps: -1\n    save_epoch: 1\n    output_dir: ./output\n    ckpt_dir:\n\nDistributed:\n  dp_degree:\n  mp_degree: 1\n  pp_degree: 1\n  sharding:\n    sharding_degree: 1\n    sharding_stage: 1\n\n\nModel:\n  use_recompute: False\n"
  },
  {
    "path": "ppfleetx/core/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .engine import *\nfrom .module import *\n"
  },
  {
    "path": "ppfleetx/core/engine/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .basic_engine import BasicEngine\nfrom .inference_engine import InferenceEngine, TensorRTConfig\nfrom .eager_engine import EagerEngine\nfrom .auto_engine import AutoEngine\n"
  },
  {
    "path": "ppfleetx/core/engine/auto_engine.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport time\nimport sys\nimport logging\nimport numpy as np\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.distributed as dist\nimport paddle.fluid.core as core\nfrom paddle.distributed.fleet import auto\nfrom paddle.optimizer.lr import LRScheduler\n\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.core.engine import BasicEngine\nfrom ppfleetx.core.module import BasicModule\nfrom ppfleetx.utils.version import version_check\nfrom ppfleetx.data import utils\nfrom ppfleetx.optims import build_lr_scheduler, build_optimizer\n\nlogging.basicConfig(level=logging.INFO)\nlogger = logging.getLogger(__name__)\n\n\nclass AutoEngine(BasicEngine):\n    def __init__(self, configs, module=None, mode='train'):\n        super().__init__()\n        version_check()\n\n        model = None\n        loss_fn = None\n\n        if module and not isinstance(module, BasicModule):\n            raise TypeError(\n                \"'module' must be sub classes of `BasicModule`, but got: {model.__class__.__name__}.\"\n            )\n\n        if module:\n            if module.model and not isinstance(\n                    module.model, nn.Layer) and not callable(module.model):\n                raise TypeError(\n                    \"'model' must be sub classes of `paddle.nn.Layer` or any callable function, but got: 
{module.model.__class__.__name__}.\"\n                )\n            model = module.model\n\n            if mode == 'train':\n                if module.loss_fn and not isinstance(\n                        module.loss_fn,\n                        nn.Layer) and not callable(module.loss_fn):\n                    raise TypeError(\n                        \"'loss_fn' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.loss_fn.__class__.__name__}.\"\n                    )\n            else:\n                module.loss_fn = None\n                module.model.eval()\n            loss_fn = module.loss_fn\n\n        self._module = module\n\n        # lr_scheduler and optimizer\n        lr = build_lr_scheduler(\n            configs.Optimizer.lr) if mode == \"train\" else None\n        optimizer = build_optimizer(configs.Optimizer, model,\n                                    lr) if mode == \"train\" else None\n\n        # engine configs\n        self._configs = configs['Engine']\n        self._max_steps = self._configs['max_steps']\n        self._verbose = self._configs[\"verbose\"]\n        self._eval_freq = self._configs['eval_freq']\n        self._eval_iters = self._configs['eval_iters']\n        self._test_iters = self._configs['test_iters']\n        self._logging_freq = self._configs['logging_freq']\n        self._num_train_epochs = self._configs['num_train_epochs']\n        self._strategy = self._configs['strategy']\n\n        # save & load\n        self._save_steps = self._configs['save_load']['save_steps']\n        self._save_epoch = self._configs['save_load']['save_epoch']\n        self._output_dir = self._configs['save_load']['output_dir']\n        self._ckpt_dir = self._configs['save_load']['ckpt_dir']\n\n        # engine fit inputs\n        self.batch_size = configs['Global']['global_batch_size']\n\n        # init engine\n        self._auto_engine = auto.Engine(\n            model, loss_fn, optimizer, strategy=self._strategy)\n\n    
def fit(self, epoch=1, train_dataset=None, valid_dataset=None):\n\n        train_sample_split = train_dataset.sample_split if train_dataset else None\n        valid_sample_split = valid_dataset.sample_split if valid_dataset else None\n\n        self._auto_engine.fit(train_data=train_dataset,\n                              valid_data=valid_dataset,\n                              train_sample_split=train_sample_split,\n                              valid_sample_split=valid_sample_split,\n                              epochs=self._num_train_epochs,\n                              batch_size=self.batch_size,\n                              steps_per_epoch=self._max_steps,\n                              valid_steps=self._eval_iters,\n                              valid_freq=self._eval_freq,\n                              collate_fn=train_dataset.collate_fn,\n                              log_freq=self._logging_freq,\n                              save_dir=self._output_dir,\n                              save_freq=self._save_steps,\n                              verbose=self._verbose)\n\n    def evaluate(self, valid_dataset=None):\n\n        self._auto_engine.evaluate(\n            valid_data=valid_dataset,\n            valid_sample_split=valid_dataset.sample_split,\n            batch_size=self.batch_size,\n            steps=self._max_steps,\n            collate_fn=valid_dataset.collate_fn)\n\n    def predict(self, test_dataset=None):\n\n        self._auto_engine.predict(\n            test_data=test_dataset,\n            test_sample_split=test_dataset.sample_split,\n            batch_size=self.batch_size,\n            steps=self._max_steps,\n            collate_fn=test_dataset.collate_fn)\n\n    def export(self):\n        self._auto_engine.prepare(self._module.input_spec(), mode=\"predict\")\n        self.save(training=False)\n\n    def tune(self, tune_dataset=None):\n        self._auto_engine._tune(\n            tune_dataset,\n            
tune_sample_split=tune_dataset.sample_split,\n            batch_size=self.batch_size)\n\n    def save(self, training=True):\n        if self._output_dir and isinstance(self._output_dir, str):\n            path = os.path.join(self._output_dir, \"auto\")\n            self._auto_engine.save(path, training=training)\n        else:\n            raise TypeError(\"`save` requires a valid value of `output_dir`.\")\n\n    def load(self):\n        if self._ckpt_dir and isinstance(self._ckpt_dir, str):\n            self._auto_engine.load(self._ckpt_dir)\n        else:\n            logger.warning(\"`load` requires a valid value of `ckpt_dir`.\")\n\n    def export_from_prog(self):\n        paddle.enable_static()\n\n        if not (self._ckpt_dir and isinstance(self._ckpt_dir, str)):\n            raise ValueError(\"invalid ckpt_dir.\")\n\n        exe = paddle.static.Executor()\n\n        [inference_program, feed_target_names,\n         fetch_targets] = paddle.static.load_inference_model(\n             path_prefix=self._ckpt_dir, executor=exe)\n        feed_targets = [\n            inference_program.global_block().var(name)\n            for name in feed_target_names\n        ]\n\n        self._auto_engine.prepare(\n            inputs=feed_targets,\n            main_program=inference_program,\n            startup_program=paddle.static.Program(),\n            mode=\"predict\")\n\n        model_dict = self._auto_engine.main_program.state_dict()\n        for param in list(\n                filter(lambda var: var.persistable,\n                       self._auto_engine.main_program.list_vars())):\n            if param.type in [\n                    core.VarDesc.VarType.FEED_MINIBATCH,\n                    core.VarDesc.VarType.FETCH_LIST\n            ]:\n                continue\n            if param.dtype != model_dict[param.name]._dtype():\n                model_dict[param.name] = model_dict[param.name]._as_type(\n                    param.dtype)\n        
self._auto_engine.main_program.set_state_dict(model_dict)\n\n        path = os.path.join(self._output_dir, \"auto_dist0\")\n        paddle.static.save_inference_model(\n            path,\n            feed_targets,\n            fetch_targets,\n            exe,\n            program=self._auto_engine.main_program, )\n\n        paddle.disable_static()\n"
  },
  {
    "path": "ppfleetx/core/engine/basic_engine.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nclass BasicEngine:\n    \"\"\"\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n\n    def fit(self, *args, **kwargs):\n        raise NotImplementedError\n\n    def evaluate(self, *args, **kwargs):\n        raise NotImplementedError\n\n    def predict(self, *args, **kwargs):\n        raise NotImplementedError\n\n    def save(self, *args, **kwargs):\n        raise NotImplementedError\n\n    def load(self, *args, **kwargs):\n        raise NotImplementedError\n\n    def inference(self, *args, **kwargs):\n        raise NotImplementedError\n"
  },
  {
    "path": "ppfleetx/core/engine/eager_engine.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport time\nimport sys\nimport logging\nfrom tokenize import group\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.distributed as dist\nimport paddle.distributed.fleet as fleet\nfrom paddle.optimizer.lr import LRScheduler\n\nfrom paddle.distributed.parallel import sync_params_buffers\nfrom paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients\nfrom paddle.profiler import SummaryView\nfrom paddle.distributed.fleet.meta_parallel import TensorParallel\nfrom paddle.distributed.sharding import group_sharded_parallel\n\nimport paddleslim\nfrom ppfleetx.distributed.apis import env, amp\nfrom ppfleetx.optims import build_lr_scheduler, build_optimizer\nfrom ppfleetx.utils.log import logger, get_timestamp, convert_timestamp_to_data\nfrom ppfleetx.core.engine import BasicEngine, InferenceEngine, TensorRTConfig\nfrom ppfleetx.core.module import BasicModule\nfrom ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters\nfrom ppfleetx.utils.version import version_check\nfrom ppfleetx.utils.export import export_inference_model\nfrom paddle.incubate.distributed.utils.io import save_for_auto_inference\nfrom ppfleetx.utils.device import synchronize as device_synchronize\nfrom ppfleetx.utils.compression_helper import prune_model, quant_model\n\n\nclass EagerEngine(BasicEngine):\n    \"\"\"\n   
 The common engine for all models that support single-card and distributed\n    training, validation and test. Only used in eager dygraph mode.\n    \"\"\"\n\n    def __init__(self, configs, module, optimizer=None, lr=None, mode='train'):\n        \"\"\"\n        Initialize an engine depending on the user-defined module and configs.\n\n        Args:\n\n            module(BasicModule): user-defined module. After assigning computations\n                and configurations of model/optimizers/lr Schedulers, engine can\n                support the whole loop of training/validation/test.\n\n            configs(dict): the configurations that engine needs for training/validation/test\n                loop. Such as mix precision strategy, save&load and the infos of steps/epoches.\n\n        Return:\n\n            An instance of `EagerEngine`.\n\n        Examples::\n\n            class TestModule(BasicModule):\n\n                def __init__(self):\n                    super().__init__()\n                    self.model = paddle.nn.Linear(28 * 28, 10)\n                    self.loss_fn = paddle.nn.MSELoss()\n\n                def forward(self, x):\n                    return paddle.relu(self.model(x.reshape(-1)))\n\n                def training_step(self, batch):\n                    x, y = batch\n                    loss = self.loss_fn(self(x), y)\n                    return loss\n\n                def configure_optimizers(self):\n                    return paddle.optimizer.Adam(\n                        parameters=self.model.parameters(), learning_rate=0.02)\n\n            module = TestModule()\n            engine = EagerEngine(module, configs)\n\n        \"\"\"\n        super().__init__()\n        version_check()\n\n        self.mode = mode\n\n        if not isinstance(module, BasicModule):\n            raise TypeError(\n                \"'module' must be sub classes of `BasicModule`, but got: {model.__class__.__name__}.\"\n            )\n\n        self._module = module\n\n 
       if module.model and not isinstance(\n                module.model, nn.Layer) and not callable(module.model):\n            raise TypeError(\n                \"'model' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.model.__class__.__name__}.\"\n            )\n\n        # if mode == 'train':\n        #     if module.loss_fn and not isinstance(\n        #             module.loss_fn, nn.Layer) and not callable(module.loss_fn):\n        #         raise TypeError(\n        #             \"'loss_fn' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.loss_fn.__class__.__name__}.\"\n        #         )\n\n        # global configs\n        self._global_batch_size = configs['Global']['global_batch_size']\n\n        # engine configs\n        self._configs = configs['Engine']\n\n        self._run_mode = self._configs.get('run_mode', 'step')\n        assert self._run_mode in ['epoch', 'step'\n                                  ], 'run_mode must be epoch or step'\n        self._max_steps = self._configs['max_steps']\n        self._eval_freq = self._configs['eval_freq']\n        self._eval_iters = self._configs['eval_iters']\n        self._test_iters = self._configs['test_iters']\n        self._logging_freq = self._configs['logging_freq']\n        self._num_train_epochs = self._configs['num_train_epochs']\n        self._accumulate_steps = self._configs['accumulate_steps']\n\n        amp_config = self._configs['mix_precision']\n        self._amp_enable = amp_config['enable']\n        if mode == 'export' and self._amp_enable:\n            logger.info(\"NOTE: disable mix_precision in export mode\")\n            self._amp_enable = False\n\n        self._amp_dtype = amp_config.get('dtype', 'float16')\n        self._amp_level = amp_config.get('level', 'O2')\n        self._use_main_grad = amp_config.get('use_main_grad', False)\n        self._scale_loss = amp_config['scale_loss']\n        self._custom_black_list = 
amp_config['custom_black_list']\n        self._custom_white_list = amp_config['custom_white_list']\n\n        self._save_steps = self._configs['save_load']['save_steps']\n        self._save_epoch = self._configs['save_load']['save_epoch']\n\n        self._output_dir = self._configs['save_load']['output_dir']\n        self._ckpt_dir = self._configs['save_load']['ckpt_dir']\n\n        self._compress_configs = None\n        self.prune_configs = None\n        self.quant_configs = None\n        self._quant_mode = False\n        if 'Compress' in configs:\n            self.mode = 'compress'\n            self._compress_configs = configs['Compress']\n            if \"Prune\" in self._compress_configs:\n                self.prune_configs = self._compress_configs[\"Prune\"]\n            if \"Quantization\" in self._compress_configs:\n                self.quant_configs = self._compress_configs[\"Quantization\"]\n                self._quant_mode = True\n            self.compress_model()\n\n        # TODO(haohongxiang): Remove there extra configs after reconstruct of Fleet API\n        self._dist_configs = configs['Distributed']\n        self._dp_degree = self._dist_configs['dp_degree']\n        self._mp_degree = self._dist_configs['mp_degree']\n        self._pp_degree = self._dist_configs['pp_degree']\n        sharding_config = self._dist_configs['sharding']\n\n        self._sharding_stage = sharding_config['sharding_stage']\n        self._sharding_degree = sharding_config['sharding_degree']\n        self._sharding_offload = sharding_config['sharding_offload']\n        self._reduce_overlap = sharding_config['reduce_overlap']\n        self._broadcast_overlap = sharding_config['broadcast_overlap']\n\n        self._use_recompute = configs['Model']['use_recompute']\n\n        if self._amp_enable:\n            if mode == 'train' and self._amp_dtype == \"float16\":\n                self._scaler = paddle.amp.GradScaler(\n                    init_loss_scaling=self._scale_loss)\n        
    else:  # bfloat16\n                self._scaler = paddle.amp.GradScaler(\n                    init_loss_scaling=1, use_dynamic_loss_scaling=False)\n\n            # Save dtype is the same as model dtype. Also can set save_dtype='float32' when\n            # training with pure fp16 strategy, but will cause the rise of memory.\n            if self._amp_level == \"O2\":\n                self._module.model = paddle.amp.decorate(\n                    models=self._module.model,\n                    dtype=self._amp_dtype,\n                    level=self._amp_level)\n        else:\n            self._scaler = None\n\n        if mode == 'train':\n            self._use_increments = configs.Optimizer.lr.pop('use_increments',\n                                                            False)\n            self._lr_scheduler_mode = configs.Optimizer.lr.pop('run_mode',\n                                                               'step')\n            assert self._lr_scheduler_mode in [\n                'epoch', 'step'\n            ], 'lr.run_mode must be epoch or step'\n        self._lr_scheduler = build_lr_scheduler(\n            configs.Optimizer.lr) if mode == 'train' else None\n\n        self._optimizer = build_optimizer(\n            configs.Optimizer, self._module.model,\n            self._lr_scheduler) if mode == 'train' else None\n\n        if self._amp_enable and self._amp_dtype in [\n                'float16', 'bfloat16'\n        ] and self._amp_level == 'O2' and self._use_main_grad:\n            self._module.model = amp.MixPrecisionLayer(\n                self._module.model, dtype=self._amp_dtype)\n            self._optimizer = amp.MixPrecisionOptimizer(self._optimizer)\n            self._scaler = amp.MixPrecisionScaler(self._scaler)\n\n        # distributed configs\n        self._distributed = (dist.get_world_size() > 1)\n\n        if self._distributed:\n            self._hcg = env.get_hcg()\n            self._dp_group = self._hcg.get_data_parallel_group()\n      
      self._sharding_group = self._hcg.get_sharding_parallel_group()\n\n            self._dp_rank = self._hcg.get_data_parallel_rank()\n            self._mp_rank = self._hcg.get_model_parallel_rank()\n            self._pp_rank = self._hcg.get_stage_id()\n            self._sharding_rank = self._hcg.get_sharding_parallel_rank()\n\n            self._wrap_with_fleet()\n        else:\n            self._dp_rank = 0\n\n        # using for save/load\n        self._load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1}\n\n        if 'Inference' in configs:\n            self._inference_configs = configs['Inference']\n            self._inference_engine = None\n\n        self.profiler = None\n        if 'Profiler' in configs and configs.get('Profiler', {}).get('enable',\n                                                                     False):\n            self.profiler_config = configs['Profiler']\n\n            scheduler = self.profiler_config.get('scheduler', None)\n            profiler_log = self.profiler_config.get('profiler_log',\n                                                    './profiler_log')\n            record_shapes = self.profiler_config.get('record_shapes', True)\n            profile_memory = self.profiler_config.get('profile_memory', True)\n            self.profiler = paddle.profiler.Profiler(\n                targets=[\n                    paddle.profiler.ProfilerTarget.CPU,\n                    paddle.profiler.ProfilerTarget.GPU\n                ],\n                scheduler=scheduler,\n                on_trace_ready=paddle.profiler.export_chrome_tracing(\n                    profiler_log),\n                record_shapes=record_shapes,\n                profile_memory=profile_memory)\n            self.profiler.start()\n            logger.warning(\n                \"Profiler is enabled, do not enable it in production.\")\n\n    def _wrap_with_fleet(self):\n        if self._sharding_stage in [2, 3]:\n            assert self._pp_degree == 1, \"sharding 
stage2/3 will support pipeline parallel later\"\n            self._wrap_sharding_2_3()\n        else:\n            self._wrap_3D_parallel()\n\n    def _wrap_sharding_2_3(self):\n        if self._dp_degree > 1 and self._sharding_stage == 3:\n            sync_params_buffers(\n                self._module.model,\n                comm_group=self._dp_group,\n                src_rank=self._dp_group.ranks[0])\n\n        if self._mp_degree > 1:\n            assert self._sharding_stage == 2, \"only support mp + sharding stage2 hybrid parallel now.\"\n            self._module.model = TensorParallel(\n                self._module.model, self._hcg, strategy=None)\n\n        level = \"p_g_os\" if self._sharding_stage == 3 else \"os_g\"\n        origin_model = self._module.model\n        self._module.model, self._optimizer, self._scaler = group_sharded_parallel(\n            model=self._module.model,\n            optimizer=self._optimizer,\n            level=level,\n            scaler=self._scaler,\n            group=self._sharding_group,\n            offload=self._sharding_offload,\n            dp_group=self._dp_group if self._dp_group.nranks > 1 else None)\n        if self._reduce_overlap:\n            self._module.model._set_reduce_overlap(self._reduce_overlap)\n        if self._broadcast_overlap:\n            self._optimizer._set_broadcast_overlap(\n                self._broadcast_overlap, layers=origin_model, num_groups=2)\n\n    def _wrap_3D_parallel(self):\n        if isinstance(self._module.model, amp.MixPrecisionLayer):\n            if dist.get_world_size() == self._dp_degree:\n                sync_params_buffers(\n                    self._module.model,\n                    comm_group=self._dp_group,\n                    src_rank=self._dp_group.ranks[0])\n            elif self._pp_degree > 1:\n                self._module.model = fleet.distributed_model(\n                    self._module.model._layers)\n        else:\n            self._module.model = 
fleet.distributed_model(self._module.model)\n        self._optimizer = fleet.distributed_optimizer(self._optimizer)\n        self._scaler = fleet.distributed_scaler(\n            self._scaler) if self._scaler is not None else self._scaler\n\n    def _train_one_epoch(self,\n                         epoch_index,\n                         train_data_loader=None,\n                         valid_data_loader=None):\n        self._module.model.train()\n\n        # time count\n        train_losses = []\n        train_step_start = get_timestamp()\n        skip_first = True\n        # Note(GuoxiaWang): Do not use len(train_data_loader()),\n        # it will cause a memory leak.\n        total_train_batch = self._max_steps if self._run_mode == 'step' else len(\n            train_data_loader)\n        total_train_step = self._max_steps if self._run_mode == 'step' else total_train_batch * self._num_train_epochs\n        total_eval_batch = len(\n            valid_data_loader) if valid_data_loader is not None else 0\n        valid_data_loader = valid_data_loader(\n        ) if valid_data_loader is not None else None\n        eval_finished_step = 0\n        for step, batch in enumerate(train_data_loader()):\n\n            if epoch_index == self._load_recovery['epoch']:\n                if step < self._load_recovery['step']:\n                    continue\n\n            loss = self._fit_impl(batch)\n            train_losses.append(loss)\n\n            if self._lr_scheduler is not None and self._lr_scheduler_mode == 'step':\n                if self._scaler is None or self._scaler._found_inf == 0:\n                    self._lr_scheduler.step(epoch=self._global_batch_size\n                                            if self._use_increments else None)\n\n            if (step + 1) % self._logging_freq == 0:\n                train_step_cost = get_timestamp() - train_step_start\n                numpy_losses = [float(loss) for loss in train_losses]\n                log_dict = {\n            
        'epoch': epoch_index,\n                    'total_epoch': self._num_train_epochs,\n                    'batch': step,\n                    'total_batch': total_train_batch,\n                    'total_step': total_train_step,\n                    'train_cost': train_step_cost\n                    if step == 0 else train_step_cost / self._logging_freq,\n                    'loss': sum(numpy_losses) / len(numpy_losses),\n                    'lr': self._optimizer.get_lr(),\n                    'found_inf': self._scaler._found_inf\n                    if self._scaler is not None else 0,\n                }\n                if self._amp_enable:\n                    log_dict['loss_scale'] = self._scaler._scale.numpy()[0]\n                self._module.training_step_end(log_dict)\n\n                train_step_start = get_timestamp()\n                train_losses = []\n\n            self._optimizer.clear_grad()\n\n            if self._run_mode == 'step' and not skip_first:\n                if self._eval_freq > 0 and step % self._eval_freq == 0:\n\n                    eval_losses = []\n                    eval_step_start = get_timestamp()\n\n                    for eval_step, batch in enumerate(valid_data_loader):\n                        eval_finished_step += 1\n                        loss = self._evaluate_impl(batch)\n                        eval_losses.append(loss)\n\n                        if eval_step >= self._eval_iters - 1:\n                            break\n\n                    eval_step_cost = get_timestamp() - eval_step_start\n                    eval_loss = sum(eval_losses) / len(eval_losses)\n\n                    log_dict = {\n                        'loss': float(eval_loss),\n                        'epoch': epoch_index,\n                        'batch': eval_finished_step,\n                        'total_batch': total_eval_batch,\n                        'eval_cost': eval_step_cost / self._logging_freq,\n                    }\n                    
self._module.validation_step_end(log_dict)\n\n                if self._save_steps > 0 and step % self._save_steps == 0:\n                    device_synchronize()\n                    self.save(epoch=epoch_index, step=step)\n            else:\n                skip_first = False\n\n            if self._run_mode == 'step' and step >= self._max_steps:\n                return\n\n            if self.profiler:\n                self.profiler.step()\n\n    def fit(self, epoch=1, train_data_loader=None, valid_data_loader=None):\n        \"\"\"\n        Run the full process of training/validation/save loop.\n\n        Args:\n\n            epoch(int): the epoch index.\n\n            train_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying training samples.\n\n            valid_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying validation samples.\n\n        \"\"\"\n        self._module.model.train()\n\n        train_start = get_timestamp()\n\n        start_epoch = self._load_recovery['epoch']\n        if self._load_recovery['rng_state'] != -1:\n            paddle.set_cuda_rng_state(self._load_recovery['rng_state'])\n\n        for epoch_index in range(start_epoch, epoch):\n            train_epoch_start = get_timestamp()\n            self._train_one_epoch(epoch_index, train_data_loader,\n                                  valid_data_loader)\n\n            train_epoch_cost = get_timestamp() - train_epoch_start\n            log_dict = {\n                'epoch': epoch_index,\n                'train_cost': train_epoch_cost,\n            }\n            self._module.training_epoch_end(log_dict)\n\n            if self._lr_scheduler is not None and self._lr_scheduler_mode == 'epoch':\n                self._lr_scheduler.step()\n\n            if self._run_mode == 'epoch' and self._eval_freq > 0 and \\\n                epoch_index % self._eval_freq == 0:\n                eval_epoch_start = get_timestamp()\n         
       self._evaluate_one_epoch(epoch_index, valid_data_loader)\n                eval_epoch_cost = get_timestamp() - eval_epoch_start\n                log_dict = {\n                    'epoch': epoch_index,\n                    'eval_cost': eval_epoch_cost,\n                }\n                self._module.validation_epoch_end(log_dict)\n\n            if self._save_epoch > 0 and self._run_mode == 'epoch' and epoch_index % self._save_epoch == 0:\n                self.save(epoch=epoch_index, step=len(train_data_loader))\n\n        logger.info(\n            \"The training process is complete and total cost of time for training is : {}\".\n            format(convert_timestamp_to_data(get_timestamp() - train_start)))\n\n        if self.profiler:\n            self._profiler_done()\n\n    def _fit_impl(self, batch):\n        self._module.model.train()\n\n        batch = self._module.pretreating_batch(batch)\n        if self._pp_degree == 1:\n            if self._use_recompute and isinstance(self._module.model,\n                                                  paddle.DataParallel):\n                with self._module.model.no_sync():\n                    loss = self._model_forward_backward(batch)\n                if not hasattr(self._optimizer, \"all_fused_tensors\"\n                               ) or self._optimizer.all_fused_tensors is None:\n                    try:\n                        fused_allreduce_gradients(\n                            list(self._module.model.parameters()), None)\n                    except:\n                        m = self._module.model.state_dict()\n                        fused_allreduce_gradients(\n                            list(self._module.model.parameters()), None)\n                else:\n                    all_reduce_parameters(self._optimizer.all_fused_tensors,\n                                          self._dp_group)\n            elif isinstance(self._module.model, amp.MixPrecisionLayer) \\\n                and self._distributed 
and dist.get_world_size() == self._dp_degree:\n                loss = self._model_forward_backward(batch)\n                fused_allreduce_gradients(\n                    list(self._module.model.parameters()), None)\n            else:\n                loss = self._model_forward_backward(batch)\n        else:\n            with paddle.amp.auto_cast(\n                    enable=self._amp_enable,\n                    custom_black_list=self._custom_black_list,\n                    custom_white_list=self._custom_white_list,\n                    dtype=self._amp_dtype,\n                    level=self._amp_level):\n                batch = self._module.model._prepare_training(\n                    batch, self._optimizer, self._lr_scheduler)\n                loss = self._module.model.forward_backward_pipeline(\n                    batch, self._scaler)\n\n        self._optim_update_params()\n        return loss\n\n    def _model_forward_backward(self, batch):\n        if self._accumulate_steps == 1 or self._pp_degree > 1:\n            batches = [batch]\n        else:\n            split_batches = [\n                paddle.split(b, self._accumulate_steps) for b in batch\n            ]\n            batches = []\n            for i in range(len(split_batches[0])):\n                micro_batch = [split_batch[i] for split_batch in split_batches]\n                batches.append(micro_batch)\n        final_loss = None\n        for micro_batch in batches:\n            with paddle.amp.auto_cast(\n                    self._amp_enable,\n                    custom_black_list=self._custom_black_list,\n                    custom_white_list=self._custom_white_list,\n                    dtype=self._amp_dtype,\n                    level=self._amp_level):\n                loss = self._module.training_step(micro_batch)\n\n            if self._amp_enable and self._amp_dtype == \"float16\":\n                loss_bw = self._scaler.scale(loss)\n            else:\n                loss_bw = loss\n       
     if self._accumulate_steps > 1:\n                # div the loss for backward\n                loss_bw = loss_bw / self._accumulate_steps\n\n            self._module.backward(loss_bw)\n\n            detach_loss = loss.detach()\n            if final_loss is None:\n                final_loss = detach_loss\n            else:\n                final_loss = paddle.add(final_loss, detach_loss)\n        if self._accumulate_steps > 1:\n            # div the loss for print\n            final_loss = final_loss / self._accumulate_steps\n        return final_loss\n\n    def _optim_update_params(self):\n        if self._sharding_stage in [3] and self._dp_degree > 1:\n            fused_allreduce_gradients(self._module.model.parameters(),\n                                      self._hcg)\n\n            for p in self._module.model.parameters():\n                if hasattr(p, \"bw_storage\"):\n                    assert p.grad is None, \"This case shouldn't happen.\"\n                    p.bw_storage.scale_(1.0 / self._dp_group.nranks)\n                    dist.all_reduce(p.bw_storage, group=self._dp_group)\n\n        if self._amp_enable and self._amp_dtype == \"float16\":\n            self._scaler.step(self._optimizer)\n            self._scaler.update()\n        else:\n            self._optimizer.step()\n\n    @paddle.no_grad()\n    def evaluate(self, epoch=1, valid_data_loader=None):\n        \"\"\"\n        run one evaluation epoch over the validation set.\n\n        Args:\n\n            epoch(int): the epoch index.\n\n            valid_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying validation samples.\n\n        \"\"\"\n        self._module.model.eval()\n\n        for epoch_index in range(epoch):\n            eval_epoch_start = get_timestamp()\n            self._evaluate_one_epoch(epoch_index, valid_data_loader)\n\n            eval_epoch_cost = get_timestamp() - eval_epoch_start\n            log_dict = {\n                'epoch': 
epoch_index,\n                'eval_cost': eval_epoch_cost,\n            }\n            self._module.validation_epoch_end(log_dict)\n\n        logger.info(\"The evaluting process is complete.\")\n        del valid_data_loader\n        return\n\n    @paddle.no_grad()\n    def _evaluate_one_epoch(self, epoch=1, valid_data_loader=None):\n        self._module.model.eval()\n\n        eval_step_start = get_timestamp()\n        eval_losses = []\n        total_eval_batch = len(valid_data_loader)\n        valid_data_loader = valid_data_loader(\n        ) if valid_data_loader is not None else None\n        for eval_step, batch in enumerate(valid_data_loader):\n            loss = self._evaluate_impl(batch)\n            eval_losses.append(float(loss))\n\n            if eval_step % self._logging_freq == 0:\n                eval_step_cost = get_timestamp() - eval_step_start\n                log_dict = {\n                    'loss': sum(eval_losses) / len(eval_losses),\n                    'epoch': epoch,\n                    'batch': eval_step,\n                    'total_batch': total_eval_batch,\n                    'eval_cost': eval_step_cost\n                    if eval_step == 0 else eval_step_cost / self._logging_freq,\n                }\n                self._module.validation_step_end(log_dict)\n                eval_step_start = get_timestamp()\n                eval_losses = []\n\n            if self._run_mode == 'step' and eval_step >= self._max_steps:\n                logger.info(\"[eval] epoch {} : evaluting process is complete.\".\n                            format(epoch))\n                return\n\n    @paddle.no_grad()\n    def _evaluate_impl(self, batch):\n        self._module.model.eval()\n\n        batch = self._module.pretreating_batch(batch)\n        with paddle.amp.auto_cast(\n                self._amp_enable,\n                custom_black_list=self._custom_black_list,\n                custom_white_list=self._custom_white_list,\n                
dtype=self._amp_dtype,\n                level=self._amp_level):\n            if self._pp_degree == 1:\n                loss = self._module.validation_step(batch)\n            else:\n                loss = self._module.model.eval_batch(batch, compute_loss=True)\n\n        return loss\n\n    @paddle.no_grad()\n    def predict(self, epoch=1, test_data_loader=None):\n        \"\"\"\n        run one evaluation epoch over the test set.\n\n        Args:\n\n            epoch(int): the epoch index.\n\n            test_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying test samples.\n\n        \"\"\"\n        self._module.model.eval()\n\n        test_start = get_timestamp()\n        test_losses = []\n        test_data_loader = test_data_loader()\n        for test_step, batch in enumerate(test_data_loader):\n            loss = self._predict_impl(batch)\n\n            test_losses.append(float(loss))\n\n            if test_step % self._logging_freq == 0:\n                test_cost = get_timestamp() - test_start\n                log_dict = {\n                    'loss': sum(test_losses) / len(test_losses),\n                    'epoch': epoch,\n                    'batch': test_step,\n                    'test_cost': test_cost\n                    if test_step == 0 else test_cost / self._logging_freq,\n                }\n                self._module.test_step_end(log_dict)\n                test_start = get_timestamp()\n                test_losses = []\n\n            if test_step >= self._max_steps:\n                logger.info(\"The predicting process is complete.\")\n                del test_data_loader\n                return\n\n    @paddle.no_grad()\n    def _predict_impl(self, batch):\n        self._module.model.eval()\n        batch = self._module.pretreating_batch(batch)\n\n        with paddle.amp.auto_cast(\n                self._amp_enable,\n                custom_black_list=self._custom_black_list,\n                
custom_white_list=self._custom_white_list,\n                dtype=self._amp_dtype,\n                level=self._amp_level):\n            if self._pp_degree == 1:\n                loss = self._module.test_step(batch)\n            else:\n                loss = self._module.model.eval_batch(batch, compute_loss=True)\n\n        return loss\n\n    def save(self, epoch=0, step=0):\n        \"\"\"\n        save the state dicts of model and optimizer into an checkpoint.\n        \"\"\"\n        if self._dp_rank != 0:\n            logger.info(\"DP_Rank %d doesn't save model\" % self._dp_rank)\n            return\n\n        if self._output_dir and isinstance(self._output_dir, str):\n            output_dir = os.path.join(self._output_dir,\n                                      \"epoch_%d_step_%d\" % (epoch, step))\n            if not os.path.exists(output_dir):\n                os.makedirs(output_dir, exist_ok=True)\n            logger.info(\"Save model to %s\" % output_dir)\n\n            save_dir = \"{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}\".format(\n                output_dir, self._mp_rank, self._sharding_rank,\n                self._pp_rank) if self._distributed else output_dir\n\n            if self._sharding_stage == 3:\n                self._module.model.get_all_parameters(convert2cpu=False)\n            paddle.save(self._module.model.state_dict(),\n                        os.path.join(save_dir, \"model.pdparams\"))\n            paddle.save(self._optimizer.state_dict(),\n                        os.path.join(save_dir, \"model_state.pdopt\"))\n\n            meta_dict = {\n                \"epoch\": epoch,\n                \"step\": step,\n                \"cuda_rng_state\": paddle.get_cuda_rng_state()\n            }\n            paddle.save(meta_dict, os.path.join(save_dir, \"meta_state.pdopt\"))\n\n            save_auto_dir = os.path.join(output_dir, \"auto_infer\")\n            save_for_auto_inference(\n                os.path.join(save_auto_dir, \"auto\"), 
self._module.model)\n\n        else:\n            raise TypeError(\"`save` requires a valid value of `output_dir`.\")\n\n    def compress_model(self):\n        if self._compress_configs is None: return\n        self._distributed = (dist.get_world_size() > 1)\n        # Load pretrained model before compression\n        if 'pretrained' in self._compress_configs and self._compress_configs[\n                'pretrained'] is not None:\n            self._ckpt_dir = self._compress_configs['pretrained']\n            self.load()\n            # Avoid loading again\n            self._configs['save_load']['ckpt_dir'] = None\n\n        if self.prune_configs is not None and self.prune_configs.enable:\n            prune_model(self._module.model, self.prune_configs,\n                        self._module.input_spec())\n        #NOTE(minghaoBD): We haven't fully tested Prune+Quantization, so an \"else if\" is put here for separation.\n        elif self.quant_configs is not None and self.quant_configs.enable:\n            self._module.model, self.quanter = quant_model(self._module.model,\n                                                           self.quant_configs)\n\n    def load(self):\n        \"\"\"\n        load the saved checkpoint file and update the state dicts of model and optimizer.\n        \"\"\"\n        if self._ckpt_dir and isinstance(self._ckpt_dir, str):\n            logger.info(\"Try to load checkpoint from %s \" % self._ckpt_dir)\n\n            if self._quant_mode:\n                load_dir = self._ckpt_dir\n            else:\n                load_dir = \"{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}\".format(\n                    self._ckpt_dir, self._mp_rank, self._sharding_rank,\n                    self._pp_rank) if self._distributed else self._ckpt_dir\n            model_path = os.path.join(load_dir, \"model.pdparams\")\n            opt_path = os.path.join(load_dir, \"model_state.pdopt\")\n            meta_path = os.path.join(load_dir, \"meta_state.pdopt\")\n\n   
         if os.path.exists(model_path):\n                model_dict = paddle.load(model_path)\n                for name, param in self._module.model.state_dict().items():\n                    assert name in model_dict.keys(\n                    ), \"No param named `{}` was found in checkpoint file.\".format(\n                        name)\n\n                    if param.dtype != model_dict[name].dtype:\n                        model_dict[name] = model_dict[name].cast(param.dtype)\n\n                self._module.model.set_state_dict(model_dict)\n            else:\n                raise ValueError(\"No optimizer checkpoint file found in %s.\" %\n                                 model_path)\n\n            if self.mode == 'train':\n                if os.path.exists(opt_path):\n                    opt_dict = paddle.load(opt_path)\n                    self._optimizer.set_state_dict(opt_dict)\n                else:\n                    raise ValueError(\n                        \"No optimizer checkpoint file found in %s.\" % opt_path)\n\n                if os.path.exists(meta_path):\n                    meta_dict = paddle.load(meta_path)\n                    self._load_recovery = {\n                        'step': meta_dict['step'],\n                        'epoch': meta_dict['epoch'],\n                        'rng_state': meta_dict['cuda_rng_state']\n                    }\n                else:\n                    raise ValueError(\"No meta checkpoint file found in %s.\" %\n                                     meta_path)\n\n            logger.info(\"successfully load checkpoints\")\n        else:\n            logger.warning(\"`load` requires a valid value of `ckpt_dir`.\")\n            raise TypeError(\"`load` requires a valid value of `ckpt_dir`.\")\n\n    def export(self):\n        self._module.model.eval()\n        input_spec = self._module.input_spec()\n\n        save_dir = os.path.join(self._output_dir,\n                                
\"rank_{}\".format(self._dp_rank))\n\n        if not self._quant_mode:\n            export_inference_model(self._module.model, input_spec, save_dir,\n                                   'model')\n        else:\n            logger.info(\"export quantized model.\")\n            export_inference_model(\n                self._module.model,\n                input_spec,\n                save_dir,\n                'model',\n                export_quant_model=True,\n                quanter=self.quanter)\n\n    def inference(self, data):\n        if self._inference_engine is None:\n            # parse TensorRT config\n            tensorrt_config = None\n            if 'TensorRT' in self._inference_configs:\n                tensorrt_config = TensorRTConfig(\n                    **self._inference_configs['TensorRT'])\n\n            self._inference_engine = InferenceEngine(\n                self._inference_configs['model_dir'],\n                self._inference_configs['mp_degree'], tensorrt_config)\n\n        return self._inference_engine.predict(data)\n\n    def _print_summary(self):\n        views_dict = {\n            SummaryView.DeviceView: 'device',\n            SummaryView.OverView: 'overview',\n            SummaryView.ModelView: 'model',\n            SummaryView.DistributedView: 'dist',\n            SummaryView.KernelView: 'kernel',\n            SummaryView.OperatorView: 'op',\n            SummaryView.MemoryView: 'mem',\n            SummaryView.MemoryManipulationView: 'memcpy',\n            SummaryView.UDFView: 'udf',\n        }\n\n        default_views = [\n            SummaryView.OverView,\n            SummaryView.ModelView,\n            SummaryView.KernelView,\n            SummaryView.OperatorView,\n        ]\n\n        def gen_views(cfg):\n            # print all summary view if detailed=True\n            if self.profiler_config.get('detailed', False):\n                return None\n\n            views = []\n            # override default view with user defined value 
if detailed=False\n            for view in SummaryView:\n                v = self.profiler_config.get('summary', {}).get(\n                    views_dict[view], None)\n                if v is True or (v is None and view in default_views):\n                    views.append(view)\n\n            return views or None\n\n        self.profiler.summary(\n            sorted_by=paddle.profiler.SortedKeys.GPUTotal,\n            views=gen_views(self.profiler_config))\n\n    def _profiler_done(self):\n        if not self.profiler:\n            return\n\n        logger.info(\"Profiler finished, prepare to print summary...\")\n\n        self.profiler.stop()\n\n        self._print_summary()\n        profiler_log = self.profiler_config.get('profiler_log',\n                                                './profiler_log')\n        logger.info(\n            \"For more information please install visualdl and run it with following command:\"\n        )\n        logger.info(\n            \"-------------------------------------------------------------------------------\"\n        )\n        logger.info(f\"visualdl --host 0.0.0.0 --logdir {profiler_log}\")\n        logger.info(\n            \"-------------------------------------------------------------------------------\"\n        )\n"
  },
  {
    "path": "ppfleetx/core/engine/inference_engine.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport numpy as np\nfrom collections.abc import Sequence, Mapping\n\nimport paddle\nimport paddle.distributed.fleet as fleet\n\n# TensorRT precisions\nTRT_PRECISIONS = {\n    'fp32': paddle.inference.PrecisionType.Float32,\n    'fp16': paddle.inference.PrecisionType.Half,\n    'int8': paddle.inference.PrecisionType.Int8,\n}\n\n\nclass _StaticGuard(object):\n    def __init__(self):\n        pass\n\n    def __enter__(self):\n        paddle.enable_static()\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        paddle.disable_static()\n\n\nclass TensorRTConfig(object):\n    \"\"\"\n    TensorRT Inference Configuration\n\n    Args:\n        max_batch_size (int): The maxmum batch size of input data. Default 1\n        workspace_size (int): The size of TensorRT workspace in bytes. Default 1<<30\n        min_subgraph_size (int): The minimum subgraph node size to convert subgraph to TensorRT engine. Default 3\n        precision (str): The inference precision, can be 'fp32', 'fp16' and 'int8'. Default 'fp16'\n        use_static (bool): Whether to serialize and save TensorRT engine. Default False\n        use_calib_mode (bool): Whether to use TensorRT calibration. Default False\n        collect_shape (bool): Whether to collect dynamic shape. 
Default False\n        shape_range_info_filename (str): Path to dynamic shape range file. Default None\n    \"\"\"\n\n    def __init__(self,\n                 max_batch_size=1,\n                 workspace_size=1 << 30,\n                 min_subgraph_size=3,\n                 precision='fp16',\n                 use_static=False,\n                 use_calib_mode=False,\n                 collect_shape=False,\n                 shape_range_info_filename=None):\n        self.max_batch_size = max_batch_size\n        self.workspace_size = eval(workspace_size)\n        self.min_subgraph_size = min_subgraph_size\n        self.precision = precision\n        self.use_static = use_static\n        self.use_calib_mode = use_calib_mode\n        self.shape_range_info_filename = shape_range_info_filename\n        self.collect_shape = collect_shape\n\n    @property\n    def precision(self):\n        return TRT_PRECISIONS[self._precision]\n\n    @precision.setter\n    def precision(self, value):\n        print(\"value\", value)\n        assert value.lower() in ['fp32', 'fp16', 'int8'], \\\n            \"TensorRT precision can only be 'fp32', 'fp16' or 'int8', \" \\\n            \"but got {}\".format(value.lower())\n        self._precision = value.lower()\n\n    @property\n    def collect_shape(self):\n        return self._collect_shape\n\n    @collect_shape.setter\n    def collect_shape(self, value):\n        if value:\n            assert self.shape_range_info_filename is not None, \\\n                    \"shape_range_info_filename should be set in \" \\\n                    \"collect_shape mode\"\n        else:\n            assert self.shape_range_info_filename and \\\n                    os.path.isfile(self.shape_range_info_filename), \\\n                    \"shape_range_info_filename {} is not a \" \\\n                    \"file\".format(self.shape_range_info_filename)\n        self._collect_shape = value\n\n\nclass InferenceEngine(object):\n    \"\"\"\n    Model Parallel 
Inference Engine\n\n    Args:\n        model_dir (string): root directory of inference model\n        mp_degree (int): model parallel size\n        tensorrt_config (TensorRTConfig): configurations for TensorRT inference\n    \"\"\"\n\n    def __init__(self,\n                 model_dir,\n                 mp_degree=1,\n                 tensorrt_config=None,\n                 device=None):\n        self.model_dir = model_dir\n        self.mp_degree = mp_degree\n        self.tensorrt_config = tensorrt_config\n        self.auto = False\n        self.device = device\n\n        for fname in os.listdir(model_dir):\n            if \"auto\" in fname:\n                self.auto = True\n                break\n\n        if mp_degree == 1:\n            self.nranks = 1\n            self.rank = 0\n        else:\n            self.nranks = fleet.worker_num()\n            self.rank = fleet.worker_index()\n\n        if not self.auto:\n            self._check_model()\n\n        self._static_guard = _StaticGuard()\n        with self._static_guard:\n            self._init_predictor()\n\n    def _check_model(self):\n        if not os.path.isdir(self.model_dir):\n            raise ValueError('model_dir is not a directory')\n\n        rank_path = os.path.join(self.model_dir, \"rank_{}\".format(self.rank))\n        if not os.path.isdir(rank_path):\n            raise ValueError('rank_{} directory not found'.format(self.rank))\n        model_files = []\n        param_files = []\n        for fname in os.listdir(rank_path):\n            if os.path.splitext(fname)[1] == '.pdmodel':\n                model_files.append(fname)\n            if os.path.splitext(fname)[1] == '.pdiparams':\n                param_files.append(fname)\n\n        def _check_and_get_file(files, tag):\n            if len(files) == 0:\n                raise ValueError(\"no {} file found under {}\".format(tag,\n                                                                    rank_path))\n            elif len(files) > 1:\n    
            raise ValueError(\"multiple {} file found under {}\".format(\n                    tag, rank_path))\n            else:\n                return os.path.join(self.model_dir,\n                                    'rank_{}'.format(self.rank), files[0])\n\n        self.model_file = _check_and_get_file(model_files, 'pdmodel')\n        self.param_file = _check_and_get_file(param_files, 'pdiparams')\n\n    def _generate_comm_init_config(self, rank, nranks):\n        ring_id_to_ranks = ','.join(['0'] + [str(i) for i in range(nranks)])\n        rank_to_ring_ids = ''.join(['{},0\\n'.format(i) for i in range(nranks)])\n        comm_str = '[ring_id -> ranks]\\n' + ring_id_to_ranks + \\\n                    '\\n[rank -> ring_ids]\\n' + rank_to_ring_ids\n\n        config_fname = \"./.comm_config{}.csv\".format(rank)\n        if os.path.exists(config_fname):\n            os.remove(config_fname)\n        with open(config_fname, 'w') as f:\n            f.write(comm_str)\n\n        return config_fname\n\n    def _init_predictor(self):\n        if self.auto:\n            self.model_file = os.path.join(\n                self.model_dir, 'auto_dist{}.pdmodel'.format(self.rank))\n            self.param_file = os.path.join(\n                self.model_dir, 'auto_dist{}.pdiparams'.format(self.rank))\n        config = paddle.inference.Config(self.model_file, self.param_file)\n\n        config.enable_memory_optim()\n        config.switch_ir_optim(True)\n        if self.device:\n            device_id = int(\n                os.environ.get(f'FLAGS_selected_{self.device}s', 0))\n            config.enable_custom_device(self.device, device_id)\n        elif paddle.fluid.core.is_compiled_with_cuda():\n            device_id = int(os.environ.get('FLAGS_selected_gpus', 0))\n            config.enable_use_gpu(100, device_id)\n        elif paddle.fluid.core.is_compiled_with_xpu():\n            device_id = int(os.environ.get('FLAGS_selected_xpus', 0))\n            config.enable_xpu()\n           
 config.set_xpu_device_id(device_id)\n\n        # distributed config\n        if self.mp_degree > 1:\n            trainer_endpoints = fleet.worker_endpoints()\n            current_endpoint = trainer_endpoints[self.rank]\n\n            dist_config = config.dist_config()\n            dist_config.set_ranks(self.nranks, self.rank)\n            dist_config.set_endpoints(trainer_endpoints, current_endpoint)\n            dist_config.enable_dist_model(True)\n\n            if self.auto:\n                config_fname = os.path.join(self.model_dir, \"rank_mapping.csv\")\n            else:\n                config_fname = self._generate_comm_init_config(self.rank,\n                                                               self.nranks)\n            dist_config.set_comm_init_config(config_fname)\n            config.set_dist_config(dist_config)\n\n        # TensorRT config\n        if self.tensorrt_config:\n            config.enable_tensorrt_engine(\n                max_batch_size=self.tensorrt_config.max_batch_size,\n                workspace_size=self.tensorrt_config.workspace_size,\n                min_subgraph_size=self.tensorrt_config.min_subgraph_size,\n                precision_mode=self.tensorrt_config.precision,\n                use_static=self.tensorrt_config.use_static,\n                use_calib_mode=self.tensorrt_config.use_calib_mode)\n\n            if self.tensorrt_config.collect_shape:\n                config.collect_shape_range_info(\n                    self.tensorrt_config.shape_range_info_filename)\n            else:\n                config.enable_tuned_tensorrt_dynamic_shape(\n                    self.tensorrt_config.shape_range_info_filename, True)\n\n        self.predictor = paddle.inference.create_predictor(config)\n\n    def input_names(self):\n        return self.predictor.get_input_names()\n\n    def output_names(self):\n        return self.predictor.get_output_names()\n\n    def predict(self, data):\n        # data in dict/list format\n        with 
self._static_guard:\n            if isinstance(data, Sequence):\n                if len(data) != len(self.input_names()):\n                    raise ValueError()\n                for d, name in zip(data, self.input_names()):\n                    handle = self.predictor.get_input_handle(name)\n                    handle.copy_from_cpu(np.array(d.copy()))\n            elif isinstance(data, Mapping):\n                # key check\n                for k, v in data.items():\n                    handle = self.predictor.get_input_handle(k)\n                    handle.copy_from_cpu(np.array(v))\n            else:\n                raise ValueError()\n\n            self.predictor.run()\n            return {name: self.predictor.get_output_handle(name).copy_to_cpu() \\\n                    for name in self.output_names()}\n"
  },
  {
    "path": "ppfleetx/core/module/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .basic_module import BasicModule\n"
  },
  {
    "path": "ppfleetx/core/module/basic_module.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# The file has been adapted from lightning file:\n# https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/core/module.py\n# Git commit hash: 2d9e00fab64c8b19a8646f755a95bcb092aa710f\n# We retain the following license from the original files:\n\n# Copyright 2018-2021 William Falcon. All rights reserved.\n#\n# This source code is licensed under the BSD license found in the\n# LICENSE file in the root directory of this source tree.\n\nimport paddle\nimport paddle.nn as nn\n\n\nclass BasicModule(nn.Layer):\n    \"\"\"\n\n    \"\"\"\n\n    def __init__(self, configs, *args, **kwargs):\n        self.configs = self.process_configs(configs)\n        super().__init__(*args, **kwargs)\n        self.model = self.get_model()\n\n    def process_configs(self, configs):\n        return configs\n\n    def get_model(self):\n        raise NotImplementedError\n\n    def get_loss_fn(self):\n        pass\n\n    def pretreating_batch(self, batch):\n        return batch\n\n    def forward(self, *args, **kwargs):\n        return super().forward(*args, **kwargs)\n\n    def training_step(self, *args, **kwargs):\n        raise NotImplementedError\n\n    def training_step_end(self, *args, **kwargs):\n        pass\n\n    def validation_step(self, *args, **kwargs):\n        pass\n\n    def validation_step_end(self, *args, **kwargs):\n    
    pass\n\n    def test_step(self, *args, **kwargs):\n        pass\n\n    def test_step_end(self, *args, **kwargs):\n        pass\n\n    def backward(self, loss):\n        loss.backward()\n\n    def input_spec(self):\n        raise NotImplementedError(\n            \"Please redefine Module.input_spec for model export\")\n\n    def inference_end(self, outputs):\n        pass\n\n    def training_epoch_end(self, *args, **kwargs):\n        pass\n\n    def validation_epoch_end(self, *args, **kwargs):\n        pass\n"
  },
  {
    "path": "ppfleetx/data/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\nimport random\nimport numpy as np\n\nimport paddle\n\nfrom ppfleetx.data import dataset, sampler, utils\nfrom ppfleetx.distributed.apis import env\nfrom ppfleetx.utils.log import logger\n\n\ndef build_auto_dataset(config, mode):\n    \"\"\"\n    build dataset for auto parallel\n    \"\"\"\n    assert mode in ['Train', 'Eval', 'Test'\n                    ], \"Dataset mode should be Train, Eval, Test\"\n\n    if mode not in config:\n        return None\n\n    dataset = build_dataset(config, mode)\n\n    collate_fn = None\n    if 'collate_fn' in config[mode].keys():\n        collate_fn_cfg = config[mode].pop('collate_fn', None)\n        if isinstance(collate_fn_cfg, str):\n            collate_fn = getattr(\n                utils, collate_fn_cfg) if collate_fn_cfg is not None else None\n        elif isinstance(collate_fn_cfg, dict):\n            collate_fn_class_name = collate_fn_cfg.pop(\"name\")\n            collate_fn = eval(\"utils.{}\".format(collate_fn_class_name))(\n                **collate_fn_cfg)\n            logger.debug(\"build collate_fn({}) success...\".format(collate_fn))\n\n    dataset.collate_fn = collate_fn\n    dataset.sample_split = config[mode].pop('sample_split', None)\n    return dataset\n\n\ndef build_dataset(config, mode):\n    # build dataset\n    config_dataset = 
config[mode].dataset\n    config_dataset = copy.deepcopy(config_dataset)\n    dataset_name = config_dataset.pop('name')\n    dataset = eval(\"dataset.{}\".format(dataset_name))(**config_dataset)\n\n    logger.debug(\"build dataset({}) success...\".format(dataset))\n\n    return dataset\n\n\ndef build_dataloader(config, mode):\n    assert mode in ['Train', 'Eval', 'Test'\n                    ], \"Dataset mode should be Train, Eval, Test\"\n\n    if mode not in config:\n        return None\n\n    dataset = build_dataset(config, mode)\n\n    batch_sampler = None\n    # build sampler\n    if 'sampler' in config[mode].keys():\n        config_sampler = config[mode].sampler\n        config_sampler = copy.deepcopy(config_sampler)\n        sampler_name = config_sampler.pop(\"name\")\n        batch_sampler = eval(\"sampler.{}\".format(sampler_name))(\n            dataset, **config_sampler)\n        logger.debug(\"build batch_sampler({}) success...\".format(\n            batch_sampler))\n\n    collate_fn = None\n    config_loader = {}\n    # build dataloader\n    if 'loader' in config[mode].keys():\n        config_loader = config[mode].loader\n        config_loader = copy.deepcopy(config_loader)\n\n        collate_fn_cfg = config_loader.pop('collate_fn', None)\n        if isinstance(collate_fn_cfg, str):\n            collate_fn = getattr(\n                utils, collate_fn_cfg) if collate_fn_cfg is not None else None\n        elif isinstance(collate_fn_cfg, dict):\n            collate_fn_class_name = collate_fn_cfg.pop(\"name\")\n            collate_fn = eval(\"utils.{}\".format(collate_fn_class_name))(\n                **collate_fn_cfg)\n            logger.debug(\"build collate_fn({}) success...\".format(collate_fn))\n\n    def worker_init_fn(worker_id):\n        \"\"\" set seed in subproces for dataloader when num_workers > 0\"\"\"\n        np.random.seed(env.get_dp_seed() + worker_id)\n        random.seed(env.get_dp_seed() + worker_id)\n\n    data_loader = 
paddle.io.DataLoader(\n        dataset=dataset,\n        batch_sampler=batch_sampler,\n        collate_fn=collate_fn,\n        worker_init_fn=worker_init_fn,\n        **config_loader)\n\n    logger.debug(\"build data_loader({}) success...\".format(data_loader))\n    return data_loader\n"
  },
  {
    "path": "ppfleetx/data/data_tools/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/data/data_tools/cpp/Makefile",
    "content": "CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color\nCPPFLAGS += $(shell $(PYTHON_BIN) -m pybind11 --includes)\nCPPFLAGS += $(shell python3-config --includes)\n\nLIBNAME = fast_index_map_helpers\nLIBEXT = .so\n\ndefault: $(LIBNAME)$(LIBEXT)\n\n%$(LIBEXT): %.cpp\n\t$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@\n"
  },
  {
    "path": "ppfleetx/data/data_tools/cpp/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/data/data_tools/cpp/compile.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport subprocess\npath = os.path.abspath(os.path.dirname(__file__))\n\n\ndef compile_helper():\n    \"\"\"Compile helper function ar runtime. Make sure this\n    is invoked on a single process.\"\"\"\n    import sys\n    excutable = sys.executable\n    ret = subprocess.run(['make', '-C', path, f'PYTHON_BIN={excutable}'])\n    if ret.returncode != 0:\n        print(\"Making C++ dataset helpers module failed, exiting.\")\n        sys.exit(1)\n"
  },
  {
    "path": "ppfleetx/data/data_tools/cpp/fast_index_map_helpers.cpp",
    "content": "/*\n coding=utf-8\n Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n Licensed under the Apache License, Version 2.0 (the \"License\");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n     http://www.apache.org/licenses/LICENSE-2.0\n Unless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.\n */\n\n/* Helper methods for fast index mapping builds */\n\n#include <algorithm>\n#include <iostream>\n#include <limits>\n#include <random>\n#include <stdexcept>\n\n#include <math.h>\n#include <pybind11/numpy.h>\n#include <pybind11/pybind11.h>\n\nnamespace py = pybind11;\nusing namespace std;\n\nconst int32_t LONG_SENTENCE_LEN = 512;\n\nvoid build_blending_indices(\n    py::array_t<uint8_t> &dataset_index,        // NOLINT\n    py::array_t<int64_t> &dataset_sample_index, // NOLINT\n    const py::array_t<double> &weights, const int32_t num_datasets,\n    const int64_t size, const bool verbose) {\n  /* Given multiple datasets and a weighting array, build samples\n   such that it follows those wieghts.*/\n\n  if (verbose) {\n    std::cout << \"> building indices for blendable datasets ...\" << std::endl;\n  }\n\n  // Get the pointer access without the checks.\n  auto dataset_index_ptr = dataset_index.mutable_unchecked<1>();\n  auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>();\n  auto weights_ptr = weights.unchecked<1>();\n\n  // Initialize buffer for number of samples used for each dataset.\n  int64_t current_samples[num_datasets];\n  for (int64_t i = 0; i < num_datasets; ++i) {\n    current_samples[i] = 0;\n  }\n\n  // For each sample:\n  for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) {\n    // 
Determine where the max error in sampling is happening.\n    auto sample_idx_double = std::max(static_cast<double>(sample_idx), 1.0);\n    int64_t max_error_index = 0;\n    double max_error = weights_ptr[0] * sample_idx_double -\n                       static_cast<double>(current_samples[0]);\n    for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) {\n      double error = weights_ptr[dataset_idx] * sample_idx_double -\n                     static_cast<double>(current_samples[dataset_idx]);\n      if (error > max_error) {\n        max_error = error;\n        max_error_index = dataset_idx;\n      }\n    }\n\n    // Populate the indices.\n    dataset_index_ptr[sample_idx] = static_cast<uint8_t>(max_error_index);\n    dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index];\n\n    // Update the total samples.\n    current_samples[max_error_index] += 1;\n  }\n\n  // print info\n  if (verbose) {\n    std::cout << \" > sample ratios:\" << std::endl;\n    for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) {\n      auto ratio = static_cast<double>(current_samples[dataset_idx]) /\n                   static_cast<double>(size);\n      std::cout << \"   dataset \" << dataset_idx\n                << \", input: \" << weights_ptr[dataset_idx]\n                << \", achieved: \" << ratio << std::endl;\n    }\n  }\n}\n\npy::array build_sample_idx(const py::array_t<int64_t> &sizes_,\n                           const py::array_t<int64_t> &doc_idx_,\n                           const int32_t seq_length, const int32_t num_epochs,\n                           const int64_t tokens_per_epoch) {\n  /* Sample index (sample_idx) is used for gpt2 like dataset for which\n     the documents are flattened and the samples are built based on this\n     1-D flatten array. 
It is a 2D array with sizes [number-of-samples + 1, 2]\n     where [..., 0] contains the index into `doc_idx` and [..., 1] is the\n     starting offset in that document.*/\n\n  // Consistency checks.\n  assert(seq_length > 1);\n  assert(num_epochs > 0);\n  assert(tokens_per_epoch > 1);\n\n  // Remove bound checks.\n  auto sizes = sizes_.unchecked<1>();\n  auto doc_idx = doc_idx_.unchecked<1>();\n\n  // Mapping and it's length (1D).\n  int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length;\n  int64_t *sample_idx = new int64_t[2 * (num_samples + 1)];\n\n  cout << \"    using:\" << endl << std::flush;\n  cout << \"     number of documents:       \" << doc_idx_.shape(0) / num_epochs\n       << endl\n       << std::flush;\n  cout << \"     number of epochs:          \" << num_epochs << endl\n       << std::flush;\n  cout << \"     sequence length:           \" << seq_length << endl\n       << std::flush;\n  cout << \"     total number of samples:   \" << num_samples << endl\n       << std::flush;\n\n  // Index into sample_idx.\n  int64_t sample_index = 0;\n  // Index into doc_idx.\n  int64_t doc_idx_index = 0;\n  // Begining offset for each document.\n  int64_t doc_offset = 0;\n  // Start with first document and no offset.\n  sample_idx[2 * sample_index] = doc_idx_index;\n  sample_idx[2 * sample_index + 1] = doc_offset;\n  ++sample_index;\n\n  while (sample_index <= num_samples) {\n    // Start with a fresh sequence.\n    int64_t remaining_seq_length = seq_length + 1;\n    while (remaining_seq_length != 0) {\n      // Get the document length.\n      auto doc_id = doc_idx[doc_idx_index];\n      auto doc_length = sizes[doc_id] - doc_offset;\n      // And add it to the current sequence.\n      remaining_seq_length -= doc_length;\n      // If we have more than a full sequence, adjust offset and set\n      // remaining length to zero so we return from the while loop.\n      // Note that -1 here is for the same reason we have -1 in\n      // `_num_epochs` 
calculations.\n      if (remaining_seq_length <= 0) {\n        doc_offset += (remaining_seq_length + doc_length - 1);\n        remaining_seq_length = 0;\n      } else {\n        // Otherwise, start from the begining of the next document.\n        ++doc_idx_index;\n        doc_offset = 0;\n      }\n    }\n    // Record the sequence.\n    sample_idx[2 * sample_index] = doc_idx_index;\n    sample_idx[2 * sample_index + 1] = doc_offset;\n    ++sample_index;\n  }\n\n  // Method to deallocate memory.\n  py::capsule free_when_done(sample_idx, [](void *mem_) {\n    int64_t *mem = reinterpret_cast<int64_t *>(mem_);\n    delete[] mem;\n  });\n\n  // Return the numpy array.\n  const auto byte_size = sizeof(int64_t);\n  return py::array(std::vector<int64_t>{num_samples + 1, 2}, // shape\n                   {2 * byte_size, byte_size}, // C-style contiguous strides\n                   sample_idx,                 // the data pointer\n                   free_when_done);            // numpy array references\n}\n\ninline int32_t get_target_sample_len(const int32_t short_seq_ratio,\n                                     const int32_t max_length,\n                                     std::mt19937 &rand32_gen) {\n  /* Training sample length. 
*/\n  if (short_seq_ratio == 0) {\n    return max_length;\n  }\n  const auto random_number = rand32_gen();\n  if ((random_number % short_seq_ratio) == 0) {\n    return 2 + random_number % (max_length - 1);\n  }\n  return max_length;\n}\n\ntemplate <typename DocIdx>\npy::array\nbuild_mapping_impl(const py::array_t<int64_t> &docs_,\n                   const py::array_t<int32_t> &sizes_, const int32_t num_epochs,\n                   const uint64_t max_num_samples, const int32_t max_seq_length,\n                   const double short_seq_prob, const int32_t seed,\n                   const bool verbose, const int32_t min_num_sent) {\n  /* Build a mapping of (start-index, end-index, sequence-length) where\n     start and end index are the indices of the sentences in the sample\n     and sequence-length is the target sequence length.\n  */\n\n  // Consistency checks.\n  assert(num_epochs > 0);\n  assert(max_seq_length > 1);\n  assert(short_seq_prob >= 0.0);\n  assert(short_seq_prob <= 1.0);\n  assert(seed > 0);\n\n  // Remove bound checks.\n  auto docs = docs_.unchecked<1>();\n  auto sizes = sizes_.unchecked<1>();\n\n  // For efficiency, convert probability to ratio. 
Note: rand() generates int.\n  int32_t short_seq_ratio = 0;\n  if (short_seq_prob > 0) {\n    short_seq_ratio = static_cast<int32_t>(round(1.0 / short_seq_prob));\n  }\n\n  if (verbose) {\n    const auto sent_start_index = docs[0];\n    const auto sent_end_index = docs[docs_.shape(0) - 1];\n    const auto num_sentences = sent_end_index - sent_start_index;\n    cout << \"    using:\" << endl << std::flush;\n    cout << \"     number of documents:            \" << docs_.shape(0) - 1\n         << endl\n         << std::flush;\n    cout << \"     sentences range:                [\" << sent_start_index << \", \"\n         << sent_end_index << \")\" << endl\n         << std::flush;\n    cout << \"     total number of sentences:      \" << num_sentences << endl\n         << std::flush;\n    cout << \"     number of epochs:               \" << num_epochs << endl\n         << std::flush;\n    cout << \"     maximum number of samples:      \" << max_num_samples << endl\n         << std::flush;\n    cout << \"     maximum sequence length:        \" << max_seq_length << endl\n         << std::flush;\n    cout << \"     minimum sentences num:          \" << min_num_sent << endl\n         << std::flush;\n    cout << \"     short sequence probability:     \" << short_seq_prob << endl\n         << std::flush;\n    cout << \"     short sequence ration (1/prob): \" << short_seq_ratio << endl\n         << std::flush;\n    cout << \"     seed:                           \" << seed << endl\n         << std::flush;\n  }\n\n  // Mapping and it's length (1D).\n  int64_t num_samples = -1;\n  DocIdx *maps = NULL;\n\n  // Perform two iterations, in the first iteration get the size\n  // and allocate memory and in the second iteration populate the map.\n  bool second = false;\n  for (int32_t iteration = 0; iteration < 2; ++iteration) {\n    // Set the seed so both iterations produce the same results.\n    std::mt19937 rand32_gen(seed);\n\n    // Set the flag on second iteration.\n    second = 
(iteration == 1);\n\n    // Counters:\n    uint64_t empty_docs = 0;\n    uint64_t one_sent_docs = 0;\n    uint64_t long_sent_docs = 0;\n\n    // Current map index.\n    uint64_t map_index = 0;\n\n    // For each epoch:\n    for (int32_t epoch = 0; epoch < num_epochs; ++epoch) {\n      if (map_index >= max_num_samples) {\n        if (verbose && (!second)) {\n          cout << \"    reached \" << max_num_samples << \" samples after \"\n               << epoch << \" epochs ...\" << endl\n               << std::flush;\n        }\n        break;\n      }\n      if (epoch > 0 && map_index == 0) {\n        cout << endl\n             << \"     No available documtment find this dataset.\" << endl\n             << std::flush;\n        throw std::invalid_argument(\n            \"Invalid dataset! the document should be with more than \" +\n            std::to_string(min_num_sent) + \" scentences.\");\n      }\n      // For each document:\n      for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) {\n        // Document sentences are in [sent_index_first, sent_index_last)\n        const auto sent_index_first = docs[doc];\n        const auto sent_index_last = docs[doc + 1];\n\n        // At the begining of the document previous index is the\n        // start index.\n        auto prev_start_index = sent_index_first;\n\n        // Remaining documents.\n        auto num_remain_sent = sent_index_last - sent_index_first;\n\n        // Some bookkeeping\n        if ((epoch == 0) && (!second)) {\n          if (num_remain_sent == 0) {\n            ++empty_docs;\n          }\n          if (num_remain_sent == 1) {\n            ++one_sent_docs;\n          }\n        }\n\n        // Detect documents with long sentences.\n        bool contains_long_sentence = false;\n        if (num_remain_sent > 1) {\n          for (auto sent_index = sent_index_first; sent_index < sent_index_last;\n               ++sent_index) {\n            if (sizes[sent_index] > LONG_SENTENCE_LEN) {\n              if 
((epoch == 0) && (!second)) {\n                ++long_sent_docs;\n              }\n              contains_long_sentence = true;\n              break;\n            }\n          }\n        }\n\n        // If we have more than two sentences.\n        if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) {\n          // Set values.\n          auto seq_len = int32_t{0};\n          auto num_sent = int32_t{0};\n          auto target_seq_len = get_target_sample_len(\n              short_seq_ratio, max_seq_length, rand32_gen);\n\n          // Loop through sentences.\n          for (auto sent_index = sent_index_first; sent_index < sent_index_last;\n               ++sent_index) {\n            // Add the size and number of sentences.\n            seq_len += sizes[sent_index];\n            ++num_sent;\n            --num_remain_sent;\n\n            // If we have reached the target length.\n            // and if not only one sentence is left in the document.\n            // and if we have at least two sentneces.\n            // and if we have reached end of the document.\n            if (((seq_len >= target_seq_len) && (num_remain_sent > 1) &&\n                 (num_sent >= min_num_sent)) ||\n                (num_remain_sent == 0)) {\n              // Check for overflow.\n              if ((3 * map_index + 2) > std::numeric_limits<int64_t>::max()) {\n                cout << \"number of samples exceeded maximum \"\n                     << \"allowed by type int64: \"\n                     << std::numeric_limits<int64_t>::max() << endl;\n                throw std::overflow_error(\"Number of samples\");\n              }\n\n              // Populate the map.\n              if (second) {\n                const auto map_index_0 = 3 * map_index;\n                maps[map_index_0] = static_cast<DocIdx>(prev_start_index);\n                maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);\n                maps[map_index_0 + 2] = 
static_cast<DocIdx>(target_seq_len);\n              }\n\n              // Update indices / counters.\n              ++map_index;\n              prev_start_index = sent_index + 1;\n              target_seq_len = get_target_sample_len(\n                  short_seq_ratio, max_seq_length, rand32_gen);\n              seq_len = 0;\n              num_sent = 0;\n            }\n\n          } // for (auto sent_index=sent_index_first; ...\n        }   // if (num_remain_sent > 1) {\n      }     // for (int doc=0; doc < num_docs; ++doc) {\n    }       // for (int epoch=0; epoch < num_epochs; ++epoch) {\n\n    if (!second) {\n      if (verbose) {\n        cout << \"   number of empty documents: \" << empty_docs << endl\n             << std::flush;\n        cout << \"   number of documents with one sentence: \" << one_sent_docs\n             << endl\n             << std::flush;\n        cout << \"   number of documents with long sentences: \" << long_sent_docs\n             << endl\n             << std::flush;\n        cout << \"   will create mapping for \" << map_index << \" samples\" << endl\n             << std::flush;\n      }\n      assert(maps == NULL);\n      assert(num_samples < 0);\n      maps = new DocIdx[3 * map_index];\n      num_samples = static_cast<int64_t>(map_index);\n    }\n\n  } // for (int iteration=0; iteration < 2; ++iteration) {\n\n  // Shuffle.\n  // We need a 64 bit random number generator as we might have more\n  // than 2 billion samples.\n  std::mt19937_64 rand64_gen(seed + 1);\n  for (auto i = (num_samples - 1); i > 0; --i) {\n    const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));\n    const auto i0 = 3 * i;\n    const auto j0 = 3 * j;\n    // Swap values.\n    swap(maps[i0], maps[j0]);\n    swap(maps[i0 + 1], maps[j0 + 1]);\n    swap(maps[i0 + 2], maps[j0 + 2]);\n  }\n\n  // Method to deallocate memory.\n  py::capsule free_when_done(maps, [](void *mem_) {\n    DocIdx *mem = reinterpret_cast<DocIdx *>(mem_);\n    delete[] mem;\n  });\n\n  
// Return the numpy array.\n  const auto byte_size = sizeof(DocIdx);\n  return py::array(std::vector<int64_t>{num_samples, 3}, // shape\n                   {3 * byte_size, byte_size}, // C-style contiguous strides\n                   maps,                       // the data pointer\n                   free_when_done);            // numpy array references\n}\n\npy::array build_mapping(const py::array_t<int64_t> &docs_,\n                        const py::array_t<int> &sizes_, const int num_epochs,\n                        const uint64_t max_num_samples,\n                        const int max_seq_length, const double short_seq_prob,\n                        const int seed, const bool verbose,\n                        const int32_t min_num_sent) {\n  if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {\n    if (verbose) {\n      cout << \"    using uint64 for data mapping...\" << endl << std::flush;\n    }\n    return build_mapping_impl<uint64_t>(\n        docs_, sizes_, num_epochs, max_num_samples, max_seq_length,\n        short_seq_prob, seed, verbose, min_num_sent);\n  } else {\n    if (verbose) {\n      cout << \"    using uint32 for data mapping...\" << endl << std::flush;\n    }\n    return build_mapping_impl<uint32_t>(\n        docs_, sizes_, num_epochs, max_num_samples, max_seq_length,\n        short_seq_prob, seed, verbose, min_num_sent);\n  }\n}\n\ntemplate <typename DocIdx>\npy::array build_blocks_mapping_impl(\n    const py::array_t<int64_t> &docs_, const py::array_t<int32_t> &sizes_,\n    const py::array_t<int32_t> &titles_sizes_, const int32_t num_epochs,\n    const uint64_t max_num_samples, const int32_t max_seq_length,\n    const int32_t seed, const bool verbose, const bool use_one_sent_blocks) {\n  /* Build a mapping of (start-index, end-index, sequence-length) where\n     start and end index are the indices of the sentences in the sample\n     and sequence-length is the target sequence length.\n  */\n\n  // Consistency checks.\n  
assert(num_epochs > 0);\n  assert(max_seq_length > 1);\n  assert(seed > 0);\n\n  // Remove bound checks.\n  auto docs = docs_.unchecked<1>();\n  auto sizes = sizes_.unchecked<1>();\n  auto titles_sizes = titles_sizes_.unchecked<1>();\n\n  if (verbose) {\n    const auto sent_start_index = docs[0];\n    const auto sent_end_index = docs[docs_.shape(0) - 1];\n    const auto num_sentences = sent_end_index - sent_start_index;\n    cout << \"    using:\" << endl << std::flush;\n    cout << \"     number of documents:            \" << docs_.shape(0) - 1\n         << endl\n         << std::flush;\n    cout << \"     sentences range:                [\" << sent_start_index << \", \"\n         << sent_end_index << \")\" << endl\n         << std::flush;\n    cout << \"     total number of sentences:      \" << num_sentences << endl\n         << std::flush;\n    cout << \"     number of epochs:               \" << num_epochs << endl\n         << std::flush;\n    cout << \"     maximum number of samples:      \" << max_num_samples << endl\n         << std::flush;\n    cout << \"     maximum sequence length:        \" << max_seq_length << endl\n         << std::flush;\n    cout << \"     seed:                           \" << seed << endl\n         << std::flush;\n  }\n\n  // Mapping and its length (1D).\n  int64_t num_samples = -1;\n  DocIdx *maps = NULL;\n\n  // Acceptable number of sentences per block.\n  int min_num_sent = 2;\n  if (use_one_sent_blocks) {\n    min_num_sent = 1;\n  }\n\n  // Perform two iterations, in the first iteration get the size\n  // and allocate memory and in the second iteration populate the map.\n  bool second = false;\n  for (int32_t iteration = 0; iteration < 2; ++iteration) {\n    // Set the flag on second iteration.\n    second = (iteration == 1);\n\n    // Current map index.\n    uint64_t map_index = 0;\n\n    uint64_t empty_docs = 0;\n    uint64_t one_sent_docs = 0;\n    uint64_t long_sent_docs = 0;\n    // For each epoch:\n    for (int32_t epoch 
= 0; epoch < num_epochs; ++epoch) {\n      // assign every block a unique id\n      int32_t block_id = 0;\n\n      if (map_index >= max_num_samples) {\n        if (verbose && (!second)) {\n          cout << \"    reached \" << max_num_samples << \" samples after \"\n               << epoch << \" epochs ...\" << endl\n               << std::flush;\n        }\n        break;\n      }\n      // For each document:\n      for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) {\n        // Document sentences are in [sent_index_first, sent_index_last)\n        const auto sent_index_first = docs[doc];\n        const auto sent_index_last = docs[doc + 1];\n        const auto target_seq_len = max_seq_length - titles_sizes[doc];\n\n        // At the begining of the document previous index is the\n        // start index.\n        auto prev_start_index = sent_index_first;\n\n        // Remaining documents.\n        auto num_remain_sent = sent_index_last - sent_index_first;\n\n        // Some bookkeeping\n        if ((epoch == 0) && (!second)) {\n          if (num_remain_sent == 0) {\n            ++empty_docs;\n          }\n          if (num_remain_sent == 1) {\n            ++one_sent_docs;\n          }\n        }\n        // Detect documents with long sentences.\n        bool contains_long_sentence = false;\n        if (num_remain_sent >= min_num_sent) {\n          for (auto sent_index = sent_index_first; sent_index < sent_index_last;\n               ++sent_index) {\n            if (sizes[sent_index] > LONG_SENTENCE_LEN) {\n              if ((epoch == 0) && (!second)) {\n                ++long_sent_docs;\n              }\n              contains_long_sentence = true;\n              break;\n            }\n          }\n        }\n        // If we have enough sentences and no long sentences.\n        if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) {\n          // Set values.\n          auto seq_len = int32_t{0};\n          auto num_sent = int32_t{0};\n\n        
  // Loop through sentences.\n          for (auto sent_index = sent_index_first; sent_index < sent_index_last;\n               ++sent_index) {\n            // Add the size and number of sentences.\n            seq_len += sizes[sent_index];\n            ++num_sent;\n            --num_remain_sent;\n\n            // If we have reached the target length.\n            // and there are an acceptable number of sentences left\n            // and if we have at least the minimum number of sentences.\n            // or if we have reached end of the document.\n            if (((seq_len >= target_seq_len) &&\n                 (num_remain_sent >= min_num_sent) &&\n                 (num_sent >= min_num_sent)) ||\n                (num_remain_sent == 0)) {\n              // Populate the map.\n              if (second) {\n                const auto map_index_0 = 4 * map_index;\n                // Each sample has 4 items: the starting sentence index, ending\n                // sentence index,\n                // the index of the document from which the block comes (used\n                // for fetching titles)\n                // and the unique id of the block (used for creating block\n                // indexes)\n\n                maps[map_index_0] = static_cast<DocIdx>(prev_start_index);\n                maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);\n                maps[map_index_0 + 2] = static_cast<DocIdx>(doc);\n                maps[map_index_0 + 3] = static_cast<DocIdx>(block_id);\n              }\n\n              // Update indices / counters.\n              ++map_index;\n              ++block_id;\n              prev_start_index = sent_index + 1;\n              seq_len = 0;\n              num_sent = 0;\n            }\n          } // for (auto sent_index=sent_index_first; ...\n        }   // if (num_remain_sent > 1) {\n      }     // for (int doc=0; doc < num_docs; ++doc) {\n    }       // for (int epoch=0; epoch < num_epochs; ++epoch) {\n\n    if (!second) {\n   
   if (verbose) {\n        cout << \"   number of empty documents: \" << empty_docs << endl\n             << std::flush;\n        cout << \"   number of documents with one sentence: \" << one_sent_docs\n             << endl\n             << std::flush;\n        cout << \"   number of documents with long sentences: \" << long_sent_docs\n             << endl\n             << std::flush;\n        cout << \"   will create mapping for \" << map_index << \" samples\" << endl\n             << std::flush;\n      }\n      assert(maps == NULL);\n      assert(num_samples < 0);\n      maps = new DocIdx[4 * map_index];\n      num_samples = static_cast<int64_t>(map_index);\n    }\n\n  } // for (int iteration=0; iteration < 2; ++iteration) {\n\n  // Shuffle.\n  // We need a 64 bit random number generator as we might have more\n  // than 2 billion samples.\n  std::mt19937_64 rand64_gen(seed + 1);\n  for (auto i = (num_samples - 1); i > 0; --i) {\n    const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));\n    const auto i0 = 4 * i;\n    const auto j0 = 4 * j;\n    // Swap values.\n    swap(maps[i0], maps[j0]);\n    swap(maps[i0 + 1], maps[j0 + 1]);\n    swap(maps[i0 + 2], maps[j0 + 2]);\n    swap(maps[i0 + 3], maps[j0 + 3]);\n  }\n\n  // Method to deallocate memory.\n  py::capsule free_when_done(maps, [](void *mem_) {\n    DocIdx *mem = reinterpret_cast<DocIdx *>(mem_);\n    delete[] mem;\n  });\n\n  // Return the numpy array.\n  const auto byte_size = sizeof(DocIdx);\n  return py::array(std::vector<int64_t>{num_samples, 4}, // shape\n                   {4 * byte_size, byte_size}, // C-style contiguous strides\n                   maps,                       // the data pointer\n                   free_when_done);            // numpy array references\n}\n\npy::array build_blocks_mapping(\n    const py::array_t<int64_t> &docs_, const py::array_t<int> &sizes_,\n    const py::array_t<int> &titles_sizes_, const int num_epochs,\n    const uint64_t max_num_samples, const int 
max_seq_length, const int seed,\n    const bool verbose, const bool use_one_sent_blocks) {\n  if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {\n    if (verbose) {\n      cout << \"    using uint64 for data mapping...\" << endl << std::flush;\n    }\n    return build_blocks_mapping_impl<uint64_t>(\n        docs_, sizes_, titles_sizes_, num_epochs, max_num_samples,\n        max_seq_length, seed, verbose, use_one_sent_blocks);\n  } else {\n    if (verbose) {\n      cout << \"    using uint32 for data mapping...\" << endl << std::flush;\n    }\n    return build_blocks_mapping_impl<uint32_t>(\n        docs_, sizes_, titles_sizes_, num_epochs, max_num_samples,\n        max_seq_length, seed, verbose, use_one_sent_blocks);\n  }\n}\n\nPYBIND11_MODULE(fast_index_map_helpers, m) {\n  m.def(\"build_mapping\", &build_mapping);\n  m.def(\"build_blocks_mapping\", &build_blocks_mapping);\n  m.def(\"build_sample_idx\", &build_sample_idx);\n  m.def(\"build_blending_indices\", &build_blending_indices);\n}\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/preprocess/README.md",
    "content": "# PaddleFleetX 预训练数据准备流程\n\n本示例致力于打造基于PaddleFleetX预训练模型的最佳实践。\n\n\n我们将预训练数据过程划分为以下部分\n\n- 原始数据转换，原始文本转换为jsonl的json字符串格式。\n- 数据ID化，断句、分词、tokenize转化为token id格式。\n- 训练index文件生成，生成train、valid、test的每个样本索引。\n- token动态mask(可选)，python 层实时mask文本。\n\n本目录下主要包含一下文件：\n```\n├── create_pretraining_data.py\n├── dataset_utils.py\n├── ernie_dataset.py\n├── helpers.cpp\n├── Makefile\n├── README.md\n└── trans_to_json.py\n\n```\n其中，`trans_to_json.py`是原始数据转化的脚本，将数据转化为json串格式。\n`create_pretraining_data.py`将jsonl文本，断句、分词后，tokenizer转化为token id。\n`dataset_utils.py`中包含了index生成、动态mask的实现。\n`ernie_dataset.py`通过调用`dataset_utils.py`的一些函数，产生ernie的输入dataset。\n\n\n### 环境依赖\n\n - tqdm\n - numpy\n - pybind11\n - tool_helpers\n - lac (可选)\n - zstandard (可选)\n\n安装命令`pip install tqdm numpy pybind11 tool_helpers lac zstandard`。另，部分功能需要`g++>=4.8`编译支持\n\n\n## 训练全流程数据Pipeline\n\n飞桨是自主研发、功能完备、开源开放的产业级深度学习平台，集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体\n\n|步骤|阶段&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|数据格式| 样例|\n|-|-|-|-|\n| 0️⃣初始状态 | -|原始数据： <br/> **每个doc之间用空行间隔开** <br/> - 中文，默认每句换行符，作为句子结束。<br/> - 英文，默认使用nltk判断句子结束  | ```飞桨是功能完备、开源开放的产业级深度学习平台。``` <br/> ```飞桨拥有核心训练和推理框架、基础模型库。``` <br/><br/> ```PaddleNLP是自然语言处理领域的优秀工具。```  |\n|1️⃣原始数据转换<br/>`trans_to_json.py`|预处理 <br>输入：0️⃣初始状态 <br>输出：jsonl|jsonl格式：每个doc对应一行json字符串| ```{\"text\": \"飞桨是功能完备、开源开放的产业级深度学习平台。飞桨拥有...\"}```<br/>```{\"text\": \"PaddleNLP是自然语言...\"}```\n|❇️(**可选**)数据中文分词<br/>`words_segmentation.py`|语料分词：中文WWM <br>输入：jsonl  <br> 输出：0️⃣初始状态| 将jsonl格式的数据，恢复成分词后的原始格式数据 <br> | ```飞桨 是 功能 完备、开源 开放的 产业级 深度学习 平台。``` <br/> ```飞桨 拥有 核心 训练和推理 框架、基础 模型库。``` <br/><br/> ```PaddleNLP 是 自然语言处理领域 的 优秀工具。```\n|2️⃣数据ID化<br/>`create_pretrain_data.py`|预处理| npy格式：数据id化后的token id <br/>npz格式：数据句子、文章位置索引 | -\n|3️⃣训练index文件生成|训练启动|npy格式：<br/> 根据训练步数max_steps生成<br/>train、valid、test的每个样本索引文件| -\n|4️⃣token动态mask（可选）| Dataset取数据 | 无 |-\n\n\n注意：\n- **❇️(**可选**)数据中文分词** 是中文预训练做 WWM 
的可选步骤\n  - 当你的数据比较少时，分词耗时较少，不需要词步骤。直接在`create_pretrain_data.py`步骤中分词即可。\n  - 目的是为了提前分词，加快后续数据ID转化步骤。\n  - 如果这里输入的是 jsonl格式文件，最好为多文件，`trans_to_json.py` 时候开启`no-merge`选项。\n  - 当你的数据集比较大，或者需要尝试多次转换数据的时候，提前分词可以避免`create_pretrain_data.py`时每次都运行一次分词程序。\n- 转换后，需要重新 进行步骤 1️⃣`原始数据转换 trans_to_json.py`，最后2️⃣`数据ID化`步骤设置`--cn_splited=True`参数。\n- 2️⃣`数据ID化`也可以在转化ID的同时，一起实现分词。不需要❇️`数据中文分词`步骤。\n\n\n## 数据教程汇总\n\n针对目前开源的数据集，PaddleFleetX提供了详细的数据教程，点击对应数据集的链接，即可开始进行数据制作：\n\n| 名称 | 文本类型 | 纯文本大小 | 适配模型\n|-|-|-|-|\n| [CLUECorpusSmall](./docs/CLUECorpusSmall.md)| 中文 | 14GB | ERNIE\n| [OpenWebText2](./docs/OpenWebText2.md) | 英文 | 70GB | GPT\n| [WuDaoCorpus2.0 Base](./docs/WuDaoCorpusBase.md)| 中文 |  200GB | ERNIE\n| [CLUECorpus2020](./docs/CLUECorpus2020.md)| 中文 | 200GB | ERNIE\n\n## ERNIE预训练详细准备\n\n下面以ERNIE预训练为例，简要介绍一下预训练的全流程。\n\n### 原始数据\n首先下载样例数据：\n```\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nmkdir preprocess && cd preprocess\nwget https://bj.bcebos.com/paddlenlp/models/transformers/data_tools/baike.txt\ncd ..\n```\n\n### 原始数据转换 jsonl 格式\n使用`trans_to_json.py`转化为json串格式，下面是脚本的使用说明\n```\noptional arguments:\n  -h, --help            show this help message and exit\n  --input_path INPUT_PATH\n                        Path to you raw files. Folder or file path.\n                        必须设置，可以是文件夹或者单个文件。文件夹中的目录默认最多搜索两层子目录。\n  --output_path OUTPUT_PATH\n                        Path to save the output json files.\n                        必须设置，输出文件的名字。\n  --json_key JSON_KEY   The content key of json file.\n                        建议不修改，默认的key是text\n  --doc_spliter DOC_SPLITER\n                        Spliter between documents. 
We will strip the line, if you use blank line to split doc, leave it blank.\n                        根据实际情况修改，默认空行作为文章换行符。\n  --min_doc_length MIN_DOC_LENGTH\n                        Minimal char of a documment.\n                        可选。过滤掉长度多短的文章，默认值10\n  --workers WORKERS     Number of worker processes to launch\n                        可选。多进程转化文件，适用于 input_path 中包含的文件数据较多的情况。每个文件，分配给不同worker处理\n  --log_interval LOG_INTERVAL\n                        Interval between progress updates.\n                        可选。此处的interval是值处理完文件个数的间隔。\n  --no-merge            Don't merge the file.\n                        可选。默认不开启这个选项，默认每个文件转换的jsonl文本，会拼接成到同一个文件。\n  --no-shuffle          Don't shuffle the file.\n                        可选。默认不开启这个选项，默认对处理完进行shuffle。\n```\n根据说明，我们使用下面简单命令，可以得到`baike_sample.jsonl`文件。此处，我们对文章所有doc进行了shuffle。\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\npython ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py  --input_path ./preprocess --output_path preprocess/baike_sample\n\n#查看数据\nhead -1 baike_sample.jsonl\n{\"text\": \"中国效仿西方发展工业的过程，于中华民国国民政府成立后至中日战争开战前夕已顺畅发展，尽管其间受到内外因素的多重干扰。尔后直至中日战争和国共战争的结束，\n中国始有较为长期的和平发展时期。\\n1980年代以来，邓小平政府宣布改革开放，开始实行社会主义市场经济并推行经济体制改革。中国大陆近年至2010年，GDP超过72000亿美元，\n已经成为美国之后的世界第二经济大国，普遍认为中国是世界上发展速度最快的经济体，但是人均国民生产总值仍位于世界中等水平（第89位），并逐渐受到资源限制和贫富差距加\n大的制约。中华人民共和国省份中，广东为GDP最高的第一强省，浙江为人均收入最高的第一富省。中国大陆、香港、澳门、台湾之间的经济联系在全球化的过程中日益紧密。\\n\"}\n```\n\n### 数据ID化\n本部分，我们使用 `create_pretraining_data.py` 脚本将前面得到的 `baike_sample.jsonl` 进行tokenize id化处理。\n```\noptional arguments:\n  -h, --help            show this help message and exit\n  --model_name MODEL_NAME\n                        What model to use.\n                        必须设置，如：ernie-1.0-base-zh, 可以参考已有的模型名称 https://paddlenlp.readthedocs.io/zh/latest/model_zoo/index.html#transformer\n  --tokenizer_name {ErnieTokenizer,BertTokenizer,GPTTokenizer,GPTChineseTokenizer}\n                        What type of tokenizer to use.\n                        模型对应的tokenizer, 
目前暂时只支持 ERNIE，BERT，GPT\ndata input/output:\n  --input_path INPUT_PATH\n                        Path to input JSON files.\n                        必须设置，输入文件jsonl的目录\n  --output_prefix OUTPUT_PREFIX\n                        Output prefix to store output file.\n                        必须设置，输出文件的名称。\n                        假设名称为XXX，则会输出 XXX_ids.npy, XXX_idx.npz 两个文件。\n                        npy文件，数据id化后的token ids; npz文件，数据句子、文章位置索引。\n  --data_format {JSON}  Only support json format for now. One document per line.\n                        不需要设置。目前默认处理jsonl数据格式\n  --json_key JSON_KEY   For JSON format. Space separate listed of keys to extract from json\n                        文本串json的key值。同前面trans_to_json.py的json_key，默认text为key\n  --split_sentences     Split documents into sentences.\n                        是否需要将文章划分成句子。一般而言，GPT不需要，BERT/ERNIE模型需要\n\nchinese words:\n  --chinese             Is corpus need words segmentation step for chinese words.\n                        中文情形必须设置。处理的文本类型是否是中文。\n  --cn_whole_word_segment\n                        Is corpus need words segmentation step for chinese words WWM.\n                        可选。是否需要WWM策略。一般而言，BERT/ERNIE模型需要，GPT不需要。\n  --cn_seg_func {lac,seg,jieba}\n                        Words segment function for chinese words.\n                        默认jieba，jieba速度较快，lac模型更准确，计算量高。\n  --cn_splited          Is chinese corpus is splited in to words.\n                        分词后的文本，可选。设置此选项则，cn_seg_func不起作用。\n                        例如分词后文本串 \"中国 效仿 西方 发展 工业 的过 程\"\n  --cn_split_dimer CN_SPLIT_DIMER\n                        Split dimer between chinese words.\n                        配合cn_splited使用，默认空格表示分词间隔。\n\ncommon config:\n  --append_eos          Append an <eos> token to the end of a document.\n                        gpt模型专用，gpt设置此选项，表示doc结束。\n  --log_interval LOG_INTERVAL\n                        Interval between progress updates\n                        打印日志间隔，interval表示处理 文本行数/doc数的 间隔。\n  --workers WORKERS     Number 
of worker processes to launch\n                        处理文本id化的进程个数。\n```\n通过下面脚本转化，我们可以得到处理好的预训练数据，token ids:`baike_sample_ids.npy`, 文章索引信息`baike_sample_idx.npz`.\n```\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\npython -u  ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \\\n    --model_name ernie-1.0-base-zh \\\n    --tokenizer_name ErnieTokenizer \\\n    --input_path preprocess/baike_sample.jsonl \\\n    --split_sentences\\\n    --chinese \\\n    --cn_whole_word_segment \\\n    --output_prefix preprocess/baike_sample  \\\n    --workers 1 \\\n    --log_interval 5\n```\n1. 如果您使用已经分好词的语料，可以设置 --cn_splited 为 True，同时指定--cn_split_dimer如空格。\n2. 使用自定义词表的话，请指定model_name为词表所在的文件夹地址。\n\n\n### ERNIE 预训练开始\n得到了处理好的训练数据，拷贝到data目录，即可开始ERNIE模型预训练。\n```\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nmkdir data\nmv ./preprocess/baike_sample* ./data\n\nsh ./projects/ernie/pretrain_ernie_base.sh\n# 建议修改 pretrain_ernie_base.sh 中的配置，将max_steps设置小一些。\n```\n代码说明：\n\n- ernie预训练使用的 dataset 代码文件在 `ernie_dataset.py`\n- 数据集index生成，动态mask相关代码实现在`dataset_utils.py`\n\n用户可以根据自己的需求，灵活修改mask方式。具体可以参考`dataset_utils.py`中`create_masked_lm_predictions`函数。\n可以自定义的选项有do_whole_word_mask, favor_longer_ngram, do_permutation, geometric_dist等，\n可以参考[Megatron](https://github.com/NVIDIA/Megatron-LM)使用这些lm_mask策略。\n\n### FAQ\n\n#### C++代码编译失败怎么办？\n- 请先检查pybind11包是否安装，g++、make工具是否正常。\n- 编译失败可能是本文件夹下的Makefile命令出现了一些问题。可以将Makefile中的python3、python3-config设置成完全的路径，如/usr/bin/python3.7。\n\n## 参考内容\n\n注: 大部分数据流程，参考自[Megatron](https://github.com/NVIDIA/Megatron-LM)，特此表达感谢。\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/preprocess/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport io\nimport re\nimport argparse\nimport json\nimport multiprocessing\nimport sys\nimport time\n\nimport numpy as np\nfrom tqdm import tqdm\n\nimport paddlenlp.transformers as tfs\n\ntry:\n    import nltk\n    nltk_available = True\nexcept ImportError:\n    nltk_available = False\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        '--model_name', type=str, required=True, help='What model to use.')\n    parser.add_argument(\n        '--tokenizer_name',\n        type=str,\n        required=True,\n        choices=[\n            'ErnieTokenizer', 'BertTokenizer', 'GPTTokenizer',\n            'GPTChineseTokenizer', 'ElectraTokenizer'\n        ],\n        help='What type of tokenizer to use.')\n    group = parser.add_argument_group(title='data input/output')\n    group.add_argument(\n        '--input_path',\n        type=str,\n        required=True,\n        help='Path to input JSON files.')\n    group.add_argument(\n        '--output_prefix',\n        type=str,\n        required=True,\n        help='Output prefix to store output file.')\n    group.add_argument(\n        '--data_format',\n        type=str,\n        default='text',\n        choices=['JSON'],\n        help='Only support json format for now. 
One document per line.')\n    group.add_argument(\n        '--json_key',\n        type=str,\n        default='text',\n        help='For JSON format. Space separate listed of keys to extract from json'\n    )\n    group.add_argument(\n        '--split_sentences',\n        action='store_true',\n        help='Split documents into sentences.')\n\n    group = parser.add_argument_group(title='chinese words')\n    group.add_argument(\n        '--chinese',\n        action='store_true',\n        help=\"Is corpus need words segmentation step for chinese words.\")\n    group.add_argument(\n        '--cn_whole_word_segment',\n        action='store_true',\n        help=\"Is corpus need words segmentation step for chinese words WWM.\")\n    group.add_argument(\n        '--cn_seg_func',\n        type=str,\n        default='jieba',\n        choices=['lac', 'seg', 'jieba'],\n        help='Words segment function for chinese words.')\n    group.add_argument(\n        '--cn_splited',\n        action='store_true',\n        help=\"Is chinese corpus is splited in to words.\")\n    group.add_argument(\n        '--cn_split_dimer',\n        type=str,\n        default=' ',\n        help=\"Split dimer between chinese words.\")\n\n    group = parser.add_argument_group(title='common config')\n    group.add_argument(\n        '--append_eos',\n        action='store_true',\n        help='Append an <eos> token to the end of a document.')\n    group.add_argument(\n        '--log_interval',\n        type=int,\n        default=100,\n        help='Interval between progress updates')\n    group.add_argument(\n        '--workers',\n        type=int,\n        default=1,\n        help='Number of worker processes to launch')\n\n    args = parser.parse_args()\n    return args\n\n\ndef lexical_analysis_fn():\n    from LAC import LAC\n    lac = LAC(mode=\"lac\")\n\n    def process(line):\n        words, _ = lac.run(line)\n        return words\n\n    return process\n\n\ndef chinese_segmentation_fn():\n    from 
LAC import LAC\n    lac_cws = LAC(mode='seg')\n\n    def process(line):\n        words = lac_cws.run(line)\n        return words\n\n    return process\n\n\ndef jieba_segmentation_fn():\n    import jieba\n\n    def process(line):\n        words = jieba.cut(line)\n        return list(words)\n\n    return process\n\n\nCHINESE_SEG_FUNC = {\n    'lac': lexical_analysis_fn(),\n    'seg': chinese_segmentation_fn(),\n    'jieba': jieba_segmentation_fn(),\n}\n\n\ndef get_whole_word_mask_tokens(tokens, words, max_word_length=6):\n    \"\"\"\n    Do whole word mask on Chinese word.\n    First, we do Chinese word segmentation on the sequence of tokens, which are from the WordPiece tokenization.\n    Then, we add the '##' mark on chinese characters which are in the middle of Chinese words.\n    And if the tokens are not chinese characters, we just exploit the results of WordPiece tokenization as words.\n    Such as, \n         - text line : 通过利用mercer核，将样本从输入空间映射到高维特征空间，使原来没有显现的特征突现出来，取得了很好的图像分割效果。\n         - the input tokens (after WordPiece): \n            ['通', '过', '利', '用', 'me', '##rc', '##er', '核', '，', '将', '样', '本', '从', '输', '入', '空', '间', '映', \n            '射', '到', '高', '维', '特', '征', '空', '间', '，', '使', '原', '来', '没', '有', '显', '现', '的', '特', '征', \n            '突', '现', '出', '来', '，', '取', '得', '了', '很', '好', '的', '图', '像', '分', '割', '效', '果', '。']\n        - the Chinese words (after Chinese word segmentation like jieba)\n            ['通过', '利用', 'mercer', '核', '，', '将', '样本', '从', '输入', '空间', '映射', '到', '高维', '特征', \n            '空间', '，', '使', '原来', '没有', '显现', '的', '特征', '突现', '出来', '，', '取得', '了', '很', '好', \n            '的', '图像', '分割', '效果', '。']\n        - the output whole word mask tokens:\n            ['通', '##过', '利', '##用', 'me', '##rc', '##er', '核', '，', '将', '样', '##本', '从', '输', '##入', \n            '空', '##间', '映', '##射', '到', '高', '##维', '特', '##征', '空', '##间', '，', '使', '原', '##来', \n            '没', '##有', '显', '##现', '的', '特', '##征', '突', 
'##现', '出', '##来', '，', '取', '##得', '了', \n            '很', '好', '的', '图', '##像', '分', '##割', '效', '##果', '。']\n\n    Args:\n        tokens(list(str)): The sequence of tokens, which are from the WordPiece tokenization.\n        words(list(str)): The sequence of Chinese words.\n        max_word_length(int, optional): \n            The maximum chinese character in Chinese words. It avoids too long Chinese word to be masked.\n            Defaults as 4.\n\n    Returns:\n         new_tokens(list(str)): The new token will be done with whole word masking strategy.\n\n    \"\"\"\n\n    new_tokens = []\n    # opt for long document\n    words_set = set(words)\n    i = 0\n    while i < len(tokens):\n        # non-chinese character, then do word piece\n        if len(re.findall('[\\u4E00-\\u9FA5]', tokens[i])) == 0:\n            new_tokens.append(tokens[i])\n            i += 1\n            continue\n\n        # add \"##\" mark on the middel tokens of Chinese words\n        # such as [\"通过\", \"利用\"] -> [\"通\", \"##过\"， \"利\", \"##用\"]\n        has_add = False\n        for length in range(max_word_length, 0, -1):\n            if i + length > len(tokens):\n                continue\n            if ''.join(tokens[i:i + length]) in words_set:\n                new_tokens.append(tokens[i])\n                for l in range(1, length):\n                    new_tokens.append('##' + tokens[i + l])\n                i += length\n                has_add = True\n                break\n\n        if not has_add:\n            new_tokens.append(tokens[i])\n            i += 1\n    return new_tokens\n\n\nclass IdentitySplitter(object):\n    def tokenize(self, *text):\n        return text\n\n\nclass NewlineSplitter():\n    def tokenize(self, text):\n        return text.split(\"\\n\")\n\n\nclass Converter(object):\n    def __init__(self, args):\n        self.args = args\n\n    def initializer(self):\n        Converter.tokenizer = getattr(\n            tfs,\n            
self.args.tokenizer_name).from_pretrained(self.args.model_name)\n        if self.args.cn_whole_word_segment:\n            # Extend chinese char vocab for ErnieTokinzer\n            Converter.tokenizer.extend_chinese_char()\n\n        # Split document to sentence.\n        if self.args.split_sentences:\n            if self.args.chinese:\n                Converter.splitter = NewlineSplitter()\n            else:\n                if not nltk_available:\n                    print(\"NLTK is not available to split sentences.\")\n                    exit()\n                splitter = nltk.load(\"tokenizers/punkt/english.pickle\")\n                Converter.splitter = splitter\n        else:\n            Converter.splitter = IdentitySplitter()\n\n        # Split sentence whole words mask for chinese\n        if self.args.cn_whole_word_segment:\n            if self.args.cn_splited:\n                Converter.segment_func = lambda text: text.split(self.args.cn_split_dimer)\n            else:\n                Converter.segment_func = CHINESE_SEG_FUNC[\n                    self.args.cn_seg_func]\n            Converter.whole_word_mask = get_whole_word_mask_tokens\n        else:\n            Converter.segment_func = lambda x: x\n            Converter.whole_word_mask = lambda x, y: x\n\n        def process(text):\n            words = Converter.segment_func(text)\n            # if there are two empty word, the should a split dimer in the pos\n            if self.args.cn_splited:\n                pre_dimer = False\n                for index, w in enumerate(words):\n                    if pre_dimer and len(w) == 0:\n                        words[index] = self.args.cn_split_dimer\n                        pre_dimer = False\n                    elif len(w) == 0:\n                        pre_dimer = True\n                    else:\n                        pre_dimer = False\n\n            tokens = Converter.tokenizer.tokenize(\"\".join(words))\n            tokens = 
Converter.whole_word_mask(tokens, words)\n            tokens = Converter.tokenizer.convert_tokens_to_ids(tokens)\n            return tokens\n\n        Converter.process = process\n\n    def encode(self, json_line):\n        text = json.loads(json_line)[self.args.json_key]\n        doc_ids = []\n        for sentence in Converter.splitter.tokenize(text):\n            sentence_ids = Converter.process(sentence.strip())\n            if len(sentence_ids) > 0:\n                doc_ids.append(sentence_ids)\n\n        if len(doc_ids) > 0 and self.args.append_eos:\n            doc_ids[-1].append(Converter.tokenizer.eos_token_id)\n\n        return doc_ids, len(text.encode(\"utf-8\"))\n\n\ndef main():\n    args = get_args()\n\n    file_paths = []\n    if os.path.isfile(args.input_path):\n        file_paths.append(args.input_path)\n    else:\n        for root, _, fs in os.walk(args.input_path):\n            for f in fs:\n                file_paths.append(os.path.join(root, f))\n    convert = Converter(args)\n\n    # Try tokenizer is availiable\n    sample_tokenizer = getattr(\n        tfs, args.tokenizer_name).from_pretrained(args.model_name)\n    if sample_tokenizer.vocab_size < 2**16 - 1:\n        save_dtype = np.uint16\n    else:\n        save_dtype = np.int32\n\n    pool = multiprocessing.Pool(args.workers, initializer=convert.initializer)\n\n    # We use BytesIO to store the ids.\n    token_ids_stream = io.BytesIO()\n    sentlens_stream = io.BytesIO()\n    # # Cumsum on tokens num\n    # sent_cumsum_stream = io.BytesIO()\n    # sent_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True))\n    # Cunsum on document on every sentence num, type=np.int64\n    doc_cumsum_stream = io.BytesIO()\n    doc_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True))\n\n    sent_count = 0\n    # token_count = 0\n\n    file_paths.sort()\n\n    step = 0\n    total_bytes_processed = 0\n    startup_start = time.time()\n    for file_path in tqdm(file_paths):\n        
if file_path.endswith(\".zst\"):\n            import zstandard\n            cctx = zstandard.ZstdDecompressor()\n            fh = open(file_path, 'rb')\n            text = io.BufferedReader(cctx.stream_reader(fh))\n        elif file_path.endswith(\".jsonl\"):\n            text = open(file_path, 'r', encoding='utf-8')\n        else:\n            print(\"Unexpected data format, skiped %s\" % file_path)\n            continue\n\n        encoded_docs = pool.imap(convert.encode, text, 256)\n        print(\"Processing %s\" % file_path)\n        for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):\n            step += 1\n            total_bytes_processed += bytes_processed\n            if len(doc) == 0:\n                continue\n\n            for sentence in doc:\n                sentence_len = len(sentence)\n                if sentence_len == 0:\n                    continue\n                sentlens_stream.write(\n                    sentence_len.to_bytes(\n                        4, byteorder='little', signed=True))\n                # token_count += sentence_len\n                # sent_cumsum_stream.write(\n                #     token_count.to_bytes(\n                #         8, byteorder='little', signed=True))\n                sent_count += 1\n                token_ids_stream.write(\n                    np.array(\n                        sentence, dtype=save_dtype).tobytes(order='C'))\n\n            doc_cumsum_stream.write(\n                sent_count.to_bytes(\n                    8, byteorder='little', signed=True))\n\n            if step % args.log_interval == 0:\n                current = time.time()\n                elapsed = current - startup_start\n                mbs = total_bytes_processed / elapsed / 1024 / 1024\n                print(\n                    f\"Processed {step} documents\",\n                    f\"({step/elapsed:.2f} docs/s, {mbs:.4f} MB/s).\",\n                    file=sys.stderr)\n\n    pool.close()\n    print(\"Saving tokens 
to files...\")\n    all_doc_ids = np.frombuffer(token_ids_stream.getbuffer(), dtype=save_dtype)\n    lens = np.frombuffer(sentlens_stream.getbuffer(), dtype=np.int32)\n    # sents = np.frombuffer(sent_cumsum_stream.getbuffer(), dtype=np.int64)\n    docs = np.frombuffer(doc_cumsum_stream.getbuffer(), dtype=np.int64)\n    np.save(args.output_prefix + \"_ids.npy\", all_doc_ids)\n    # np.savez(args.output_prefix + \"_idx.npz\", lens=lens, sents=sents, docs=docs)\n    np.savez(args.output_prefix + \"_idx.npz\", lens=lens, docs=docs)\n\n    print(\"Total sentences num: %d\" % len(lens))\n    print(\"Total documents num: %d\" % (len(docs) - 1))\n    print(\"Total tokens num: %d\" % len(all_doc_ids))\n    print(\"Average tokens per sentence: %.2f\" % (len(all_doc_ids) / len(lens)))\n    print(\"Average tokens per document: %.2f\" % (len(all_doc_ids) /\n                                                 (len(docs) - 1)))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/preprocess/docs/CLUECorpus2020.md",
    "content": "## CLUECorpus2020 语料\n\n| 名称 | 文本类型 | 纯文本大小 |\n|-|-|-|\n| CLUECorpus2020| 中文 | 200GB |\n\nCLUECorpus2020 过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以通过邮件申请下载，方式如下：\n\n> 数据下载\n> 申请方式： 将使用语料研究目的和用途，计划、研究机构和申请者介绍，发送到邮箱，并承诺不向第三方提供。\n>\n> 邮箱: CLUEbenchmark@163.com，标题是：CLUECorpus2020 200G语料库\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/preprocess/docs/CLUECorpusSmall.md",
    "content": "# CLUECorpusSmall\n\n| 名称 | 文本类型 | 纯文本大小 |\n|-|-|-|\n| CLUECorpusSmall| 中文 | 14GB |\n\n**数据集简介**：可用于语言建模、预训练或生成型任务等，数据量超过14G，近4000个定义良好的txt文件、50亿个字。主要部分来自于nlp_chinese_corpus项目\n包含如下子语料库（总共14G语料）：新闻语料[news2016zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/6bac09db4e6d4857b6d680d34447457490cb2dbdd8b8462ea1780a407f38e12b?responseContentDisposition=attachment%3B%20filename%3Dnews2016zh_corpus.zip)， 社区互动语料[webText2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/83da03f7b4974871a52348b41c16c7e3b34a26d5ca644f558df8435be4de51c3?responseContentDisposition=attachment%3B%20filename%3DwebText2019zh_corpus.zip)，维基百科语料[wiki2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/d7a166408d8b4ffdaf4de9cfca09f6ee1e2340260f26440a92f78134d068b28f?responseContentDisposition=attachment%3B%20filename%3Dwiki2019zh_corpus.zip)，评论数据语料[comment2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/b66ddd445735408383c42322850ac4bb82faf9cc611447c2affb925443de7a6d?responseContentDisposition=attachment%3B%20filename%3Dcomment2019zh_corpus.zip)。\n\n## 数据获取\n\n用户可以通过官方github网页下载，https://github.com/CLUEbenchmark/CLUECorpus2020 。同时，为方便用户，我们也提供了aistudio数据集下载地址。[part1](https://aistudio.baidu.com/aistudio/datasetdetail/60598)，[part2](https://aistudio.baidu.com/aistudio/datasetdetail/124357)。使用aistudio版本的数据，下载好后，可以核对md5值：\n```shell\n> md5sum ./*\n 8a8be341ebce39cfe9524fb0b46b08c5  ./comment2019zh_corpus.zip\n 4bdc2c941a7adb4a061caf273fea42b8  ./news2016zh_corpus.zip\n fc582409f078b10d717caf233cc58ddd  ./webText2019zh_corpus.zip\n 157dacde91dcbd2e52a60af49f710fa5  ./wiki2019zh_corpus.zip\n```\n解压文件\n```shell\nunzip comment2019zh_corpus.zip -d  clue_corpus_small_14g/comment2019zh_corpus\nunzip news2016zh_corpus.zip    -d  clue_corpus_small_14g/news2016zh_corpus\nunzip webText2019zh_corpus.zip -d  clue_corpus_small_14g/webText2019zh_corpus\nunzip wiki2019zh_corpus.zip    -d  clue_corpus_small_14g/wiki2019zh_corpus\n```\n将txt文件转换为jsonl格式\n```\ncd PaddleFleetX 
# 如果已在 PaddleFleetX 根目录下，则忽略\n\npython ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py  --input_path ./clue_corpus_small_14g --output_path clue_corpus_small_14g.jsonl\n```\n现在我们得到了jsonl格式的数据集。\n\n## ERNIE 中文预训练数据制作\n\n下面是针对训练任务的数据集应用，此处以ernie为例。\n\n```\npython -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \\\n    --model_name ernie-1.0-base-zh \\\n    --tokenizer_name ErnieTokenizer \\\n    --input_path clue_corpus_small_14g.jsonl \\\n    --split_sentences \\\n    --chinese \\\n    --cn_whole_word_segment \\\n    --cn_seg_func jieba \\\n    --output_prefix clue_corpus_small_14g_20220104 \\\n    --workers 48 \\\n    --log_interval 10000\n```\n\n- model_name 可以更换为其他 ERNIE 系列模型，如: `ernie-3.0-base-zh`\n- workers 表示转化的线程数目\n\n数据共有文档`15702702`条左右，由于分词比较耗时，大概一小时左右可以完成。在当前目录下产出训练所需数据。\n```\nclue_corpus_small_14g_20220104_ids.npy\nclue_corpus_small_14g_20220104_idx.npz\n```\n用户可以使用此数据进行预训练任务。\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/preprocess/docs/OpenWebText2.md",
    "content": "# OpenWebText2\n\n| 名称 | 文本类型 | 纯文本大小 |\n|-|-|-|\n| OpenWebText2 | 英文 | 70GB |\n\n## 数据获取\n\n[OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/)是一个开源的英文网页文本数据集，数据来源于Reddit，经过去重、清洗、提取，最终包含800多万个文档。\n本示例采用EleutherAI清洗好的[OpenWebText2数据](https://openwebtext2.readthedocs.io/en/latest/index.html#download-plug-and-play-version)\n\n下载以后通过以下命令解压：\n\n```shell\nwget https://mystic.the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar\ntar -xvf openwebtext2.json.zst.tar -C  /path/to/openwebtext\n```\n\n## GPT训练数据制作\n\n然后使用[proprecess]](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/ernie/preprocess) 工具下的`create_pretraining_data.py`脚本进行数据集制作：\n```\npython -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \\\n    --model_name gpt2-en \\\n    --tokenizer_name GPTTokenizer \\\n    --data_format JSON \\\n    --input_path /path/to/openwebtext/ \\\n    --append_eos \\\n    --output_prefix gpt_openwebtext  \\\n    --workers 40 \\\n    --log_interval 10000\n```\n处理时间约一个小时左右，就可以得到我们需要的`gpt_openwebtext_ids.npy`, `gpt_openwebtext_idx.npz`数据集文件。\n\n为了方便用户运行测试本模型，本项目提供了处理好的300M的训练样本：\n```shell\nwget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n```\n\n将所有预处理得到的文件统一放入一个文件夹中，以备训练使用：\n\n```\nmkdir data\nmv gpt_en_dataset_300m_ids.npy ./data\nmv gpt_en_dataset_300m_idx.npz ./data\n```\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/preprocess/docs/WuDaoCorpusBase.md",
    "content": "# WuDaoCorpus2.0 Base 语料\n\n\n| 名称 | 文本类型 | 纯文本大小 |\n|-|-|-|\n| WuDaoCorpus2.0 Base| 中文 | 200GB |\n\nWuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 bases数据集，大小为200GB。\n\n## 数据获取\n\n**1. 下载解压**\n\n用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB。解压\n```\nunrar x WuDaoCorpus2.0_base_200G.rar\n```\n**2. 语料分词**\n\n由于WuDao数据集比较大，分词比较耗时，这里先进行了语料分词：\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\npython ./ppfleetx/data/data_tools/ernie/preprocess/words_segmentation.py \\\n    --input_path ./WuDaoCorpus2.0_base_200G \\\n    --workers 40  \\\n    --data_format wudao \\\n    --cn_seg_func seg \\\n    --output_path ./wudao_lac_cut \\\n```\n\n注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。如果语料只有一句话，建议去除SOP loss，训练时设置 `binary_head=False`。\n\n**3. 转换为jsonl格式**\n\n文本转化完成后。我们使用 `ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py`重新转换为jsonl格式（分词完毕）。\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\npython ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py  \\\n    --input_path ./wudao_lac_cut \\\n    --output_path wudao_corpus_200g_0623.jsonl \\\n    --workers 40\n```\n在当前目录下产出数据`wudao_corpus_200g_0623.jsonl`。格式如下：\n```\n{\"text\": \"主持人 : 作为 一个 曲线救国 的 路线 我们 没 办法 。\\n金鑫 : 考试 和 分数 只是 一个 阶段性 的 评价 手段 , 不是 目的 , 就 像 人 活着 的 目的 不是 为了 吃饭 , 吃饭 是 为了 让 我们 活下去 , 我们 学习 的 目的 不是 为了 考试 , 不是 为了 那个 分数 , 而是 我 掌握 了 知识 , 成为 我 内在 的 能力 , 将来 我 去 创作 创造 工作 , 我能 把 它 做 得 更好 。\\n主持人 : 特别感谢 金总 今天 接受 我 的 访谈 , 也 让 我 从 别的 层面 看到 了 一对一 到底 存在 的 道理 是 什么 , 并且 能 发展 那么 好 的 原因 在 哪里 。\\n在 节目 后 您 谈谈 您 对 一对一 未来 的 希望 , 包括 您 对 它 未来 的 设想 是 什么 ？\\n金鑫 : 一对一 个性化 教育 现在 还是 在 初级阶段 , 如果 是 四个 阶段 的话 , 现在 还是 在 第一阶段 到 第二阶段 迈进 的 , 学大 在 这方面 我们 希望 能 做 得 更 快 更 远 一些 。\\n将来 个性化 教育 一定 是 能够 帮助 学生 在 成绩 上 的 提升 , 能够 更好 的 成长 , 进而 成为 对 社会 对 国家 更 有用 的 人才 , 就是 我们 的 成绩 、 成长 、 成才 。\\n学大 1 对 1 教育 的 教师 团队 由 各科 优秀教师 、 考试 指导 专家 、 心理 辅导 专家 及 学习 方法 指导 专家 组成 , 同时 配备 专职 班主任 及 学习 监管 师 , 全方位 辅导   顺利 而 有序 的 运作 。\\n其中 部分 教师 担任 多年 毕业班 教学 工作 , 多次 参与 中 考试 命题 研究 及 阅卷 
工作 , 深谙 中 考试 精髓 , 能够 在 短 的 时间 内 引领 学生 掌握 中 考试 知识   重点 , 快速 提分 。\\n■   对于 成绩 差 的 学生 : 注重 学生 基础知识 , 力求 让 学生 在 基础 中 找 自信 , 在 自信 中 提升 ；\\n注重 主观题 的 解题 方法 及 思路 , 以此 来 加强 对 基础知识 的 运用 。\\n■   对于 成绩 需要 拔高 的 学生 : 找出 学生 弱点 , 加强 基础 , 重点 提高 弱势 项目 。\\n\"}\n{\"text\": \"武田信玄 是 天生 的 武将 , 一生 开拓 了 八十五万 石至 九十余万 石之多 的 领地 。\\n武田信玄  他 21 岁 时 流放 自己 的 父亲 武田信虎  至骏河 , 避免 父亲 传位 给 弟弟 , 从而 登上 了 第 19 代家督 之位 。\\n他 将 信 浓国 ( 现 长野县 ) 纳入 控制 范围 后 , 又 与 当时 的 豪强 今井氏 、 北条 氏 结成 三国 军事同盟 , 与 上 杉谦信 在 川 中岛 前后 展开 了 五次 大战 。\\n武田信玄  勇于 进攻 。\\n他 连续 攻打 邻国 , 扩大 自己 势力范围 , 可称 遇神 杀神 , 遇佛 杀佛 。\\n他 不仅 流放 了 自己 的 父亲 , 连 自己 的 嫡子 武田义信 因 与 他 在 战略 方向 上 相左 , 也 被 他 幽禁 于 佛寺 , 随即 被迫 自杀 。\\n武田信玄  虽然 是 战国 武将 中 的 最强者 , 但 他 的 弱点 是 年龄 。\\n信玄比 织田信长 年长 13 岁 , 比上 杉谦信 年长 9 岁 。\\n当信 玄年 届 五十 之 时 , 信长 和 谦信 犹 在 壮年 。\\n上杉谦信 而且 , 武田信玄  虽 驰骋 天下 , 却 未率 军 进过 京都 , 而 织田信长 在 永禄 十一年 ( 1568 年 ) 就 以 拥立 第 15 代 将军 足利义 昭 为名 率兵 上洛 了 。\\n所谓 \\\" 制 京都 者 得 天下 \\\" , 所以 , 想要 一统天下 , 武田信玄  的 时间 很 紧迫 。\\n元龟 三年 ( 1572 年 ) , 武田信玄  与 室 町 幕府 第 15 代 将军 足利义 昭 、 本愿 寺 显如 , 以及 浅井 氏 、 朝仓氏 等 反 织田信长 实力 组成 联盟 , 编织 \\\" 反信长 包围圈 \\\" 。\\n同年 10 月 3 日 , 武田信玄  率领 大军 , 开始 了 第一次 上洛之行 。\\n是 年 , 信玄 52 岁 , 这 也许 是 他 统一天下 的 最后 一次 机会 。\\n武田信玄 所 率领 的 是 当时 战国 最强 的 3 万甲州 精兵 。\\n打着 \\\" 风林火山 \\\" 的 旗帜 , 武田军 第一站 就 到达 了 织田信长 的 同盟 德川家康  所在 的 三河 远江 。\\n织田信长 德川家康  的 军队 在 甲州 精兵 之前 显得 不堪一击 , 到 了 10 月 13 日 , 只来 成 、 天 方城 、 一 宫城 、 饭田 城 、 各和城 、 向 笠 城 等 城池 纷纷 被 攻陷 。\\n德川家康  见势不妙 , 决定 在 浜松 城中 闭门不出 。\\n但是 武田信玄  毫不 松懈 , 又 将 家康 在 远江 地区 的 重要 据点 二俣城 攻破 。\\n德川家康  集合 所有 军队 共 1 万 1 千人 , 出城 与 信玄 决一死战 , 但 大败 而 还 , 险些 失 了 性命 。\\n这次 战争 被 称为 \\\" 三方 原战 \\\" , 德川家康  曾经 承认 这次 战争 是 他 生平 最大 的 失败 。\\n\"}\n```\n\n## ERNIE 中文预训练数据制作\n\n下面是针对训练任务的数据集应用，此处以ernie为例。\n\n```\npython -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \\\n    --model_name ernie-1.0-base-zh \\\n    --tokenizer_name ErnieTokenizer \\\n    --input_path wudao_corpus_200g_0623.jsonl \\\n    --split_sentences \\\n    --chinese \\\n    --cn_whole_word_segment \\\n    --cn_seg_func jieba \\\n    --cn_splited \\\n    --output_prefix 
wudao_corpus_200g_0623 \\\n    --workers 48 \\\n    --log_interval 10000\n```\n\n- 我们提前分词好了，所以加上了 `cn_splited`，否则不需要使用此选项。\n- model_name 可以更换为其他 ERNIE 系列模型，如: `ernie-3.0-base-zh`\n- workers 表示转化的线程数目\n\n在当前目录下产出训练所需数据。\n```\nwudao_corpus_200g_0623_ids.npy\nwudao_corpus_200g_0623_idx.npz\n```\n用户可以使用此数据进行预训练任务。\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport re\nimport argparse\nimport json\nimport multiprocessing\nimport sys\nimport time\nimport shutil\nfrom functools import partial\n\nimport numpy as np\nfrom tqdm import tqdm\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        '--input_path',\n        type=str,\n        required=True,\n        help='Path to you raw files. Folder or file path.')\n    parser.add_argument(\n        '--output_path',\n        type=str,\n        required=True,\n        help='Path to save the output json files.')\n    parser.add_argument(\n        '--json_key',\n        type=str,\n        default='text',\n        help='The content key of json file.')\n    parser.add_argument(\n        '--doc_spliter',\n        type=str,\n        default='',\n        help=\"Spliter between documents. 
We will strip the line, if you use blank line to split doc, leave it blank.\"\n    )\n    parser.add_argument(\n        '--min_doc_length',\n        type=int,\n        default=10,\n        help=\"Minimal char of a documment.\")\n    parser.add_argument(\n        '--workers',\n        type=int,\n        default=1,\n        help='Number of worker processes to launch')\n    parser.add_argument(\n        '--log_interval',\n        type=int,\n        default=1,\n        help='Interval between progress updates.')\n    parser.add_argument(\n        '--no-merge', action='store_true', help='Don\\'t merge the file.')\n    parser.add_argument(\n        '--no-shuffle', action='store_true', help='Don\\'t shuffle the file.')\n    args = parser.parse_args()\n    return args\n\n\ndef raw_text_to_json(path, doc_spliter=\"\", json_key=\"text\", min_doc_length=10):\n    path = os.path.abspath(path)\n    if not os.path.exists(path):\n        print(\"No found file %s\" % path)\n        return 0, None\n\n    out_filepath = path + \".jsonl\"\n    fout = open(out_filepath, \"w\", encoding=\"utf-8\")\n    len_files = 0\n    with open(path, \"r\") as f:\n        doc = \"\"\n        line = f.readline()\n        while line:\n            len_files += len(line)\n            if line.strip() == doc_spliter:\n                if len(doc) > min_doc_length:\n                    fout.write(\n                        json.dumps(\n                            {\n                                json_key: doc\n                            }, ensure_ascii=False) + \"\\n\")\n                doc = \"\"\n            else:\n                doc += line\n            line = f.readline()\n\n        if len(doc) > min_doc_length:\n            fout.write(json.dumps({json_key: doc}, ensure_ascii=False) + \"\\n\")\n        doc = \"\"\n\n    return len_files, out_filepath\n\n\ndef merge_file(file_paths, output_path):\n    if not output_path.endswith(\".jsonl\"):\n        output_path = output_path + \".jsonl\"\n    
print(\"Merging files into %s\" % output_path)\n    with open(output_path, 'wb') as wfd:\n        for f in file_paths:\n            if f is not None and os.path.exists(f):\n                with open(f, 'rb') as fd:\n                    shutil.copyfileobj(fd, wfd)\n                os.remove(f)\n    print(\"File save in %s\" % output_path)\n    return output_path\n\n\ndef shuffle_file(output_path):\n    print(\"Shuffling the jsonl file...\")\n    if os.path.exists(output_path):\n        os.system(\"shuf %s -o %s\" % (output_path, output_path))\n        print(\"File shuffled!!!\")\n    else:\n        raise ValueError(\"File not found: %s\" % output_path)\n\n\ndef main():\n    args = get_args()\n    startup_start = time.time()\n\n    file_paths = []\n    if os.path.isfile(args.input_path):\n        file_paths.append(args.input_path)\n    else:\n        for root, _, fs in os.walk(args.input_path):\n            for f in fs:\n                file_paths.append(os.path.join(root, f))\n\n    pool = multiprocessing.Pool(args.workers)\n\n    startup_end = time.time()\n    proc_start = time.time()\n    total_bytes_processed = 0\n    print(\"Time to startup:\", startup_end - startup_start)\n\n    trans_json = partial(\n        raw_text_to_json,\n        doc_spliter=args.doc_spliter,\n        json_key=args.json_key,\n        min_doc_length=args.min_doc_length)\n    encoded_files = pool.imap(trans_json, file_paths, 1)\n\n    out_paths = []\n    for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1):\n        total_bytes_processed += bytes_processed\n        out_paths.append(out_path)\n        master_start = time.time()\n\n        if i % args.log_interval == 0:\n            current = time.time()\n            elapsed = current - proc_start\n            mbs = total_bytes_processed / elapsed / 1024 / 1024\n            print(\n                f\"Processed {i} files\",\n                f\"({i/elapsed} files/s, {mbs} MB/s).\",\n                file=sys.stderr)\n\n    if 
not args.no_merge:\n        output_path = merge_file(out_paths, args.output_path)\n        if not args.no_shuffle:\n            shuffle_file(output_path)\n\n\nif __name__ == \"__main__\":\n    main()\n    #profile.run(\"main()\", \"testprof\")\n"
  },
  {
    "path": "ppfleetx/data/data_tools/ernie/preprocess/words_segmentation.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nimport re\nimport argparse\nimport multiprocessing\nimport os\nimport time\nimport jieba\nimport sys\nfrom functools import partial\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        '--input_path',\n        type=str,\n        required=True,\n        help='Path to you raw files. Folder or file path.')\n    parser.add_argument(\n        '--workers',\n        type=int,\n        default=1,\n        help='Number of worker processes to launch')\n    parser.add_argument(\n        '--output_path',\n        type=str,\n        default=\"./tmp\",\n        help='Path to save the output json files.')\n    parser.add_argument(\n        '--data_format',\n        type=str,\n        default=\"jsonl\",\n        choices=[\"jsonl\", \"wudao\"],\n        help='Path to you raw files. 
Folder or file path.')\n    parser.add_argument(\n        '--cn_seg_func',\n        type=str,\n        default='jieba',\n        choices=['lac', 'seg', 'jieba'],\n        help='Words segment function for chinese words.')\n    parser.add_argument(\n        '--log_interval',\n        type=int,\n        default=1,\n        help='Interval between progress updates.')\n    args = parser.parse_args()\n    return args\n\n\ndef lexical_analysis_fn():\n    from LAC import LAC\n    lac = LAC(mode=\"lac\")\n\n    def process(line):\n        words, _ = lac.run(line)\n        return words\n\n    return process\n\n\ndef chinese_segmentation_fn():\n    from LAC import LAC\n    lac_cws = LAC(mode='seg')\n\n    def process(line):\n        words = lac_cws.run(line)\n        return words\n\n    return process\n\n\ndef jieba_segmentation_fn():\n    import jieba\n\n    def process(line):\n        words = jieba.cut(line)\n        return list(words)\n\n    return process\n\n\nCHINESE_SEG_FUNC = {\n    'lac': lexical_analysis_fn(),\n    'seg': chinese_segmentation_fn(),\n    'jieba': jieba_segmentation_fn(),\n}\n\n\ndef read_wudao(path):\n    print(\"Loading %s\" % path)\n    with open(path, \"r\") as f:\n        try:\n            contents = json.load(f)\n        except Exception as e:\n            print(\"Failed to load %s\" % path)\n            raise StopIteration\n    for js in contents:\n        yield js[\"content\"]\n\n\ndef read_jsonl(path):\n    print(\"Loading %s\" % path)\n    with open(path, \"r\") as f:\n        line = f.readline()\n        while line:\n            contents = json.load(f)\n            yield contents[\"text\"]\n            line = f.readline()\n\n\nREADFILE_FUNC = {\n    'jsonl': read_jsonl,\n    'wudao': read_wudao,\n}\n\nspecial_chars = ['\\n', '。', '?', '？', ' ', ';', '；', '！', '!']\nsplit_chars = ['。', '?', '？', ';', '；', '!', '！']\n\n\ndef text_to_text(path, output_path, read_func, seg_func):\n    out_name = os.path.join(output_path, path[-20:])\n\n    
print(\"Write into %s\" % out_name)\n    if os.path.exists(out_name):\n        print(\"File exists %s\" % out_name)\n        return 0, None\n\n    seg_func = CHINESE_SEG_FUNC[seg_func]\n    read_func = READFILE_FUNC[read_func]\n\n    import time\n    s = time.time()\n    data_len = 0\n    count = 0\n    with open(out_name, \"w\") as f:\n        for text in read_func(path):\n            # for js in contents:\n            count += 1\n            # text = js[\"content\"]\n            data_len += len(text.encode(\"utf-8\"))\n            # make special char only once,\n            # because of those token will be treat as sentence spliter.\n            # 此处为断句逻辑\n            for char in special_chars:\n                text = re.sub('[' + char + ']+[ ]*', char, text)\n            for char in split_chars:\n                text = text.replace(char, char + \"\\n\")\n\n            # 此处为分词逻辑\n            final = \"\"\n            for line in text.split(\"\\n\"):\n                if len(line) == 0:\n                    continue\n                words = seg_func(line)\n                final += \" \".join(words) + \"\\n\"\n            f.write(final + \"\\n\")\n\n    return data_len, None\n\n\ndef main():\n    args = get_args()\n    startup_start = time.time()\n\n    file_paths = []\n    if os.path.isfile(args.input_path):\n        file_paths.append(args.input_path)\n    else:\n        for root, _, fs in os.walk(args.input_path):\n            for f in fs:\n                file_paths.append(os.path.join(root, f))\n\n    pool = multiprocessing.Pool(args.workers)\n\n    startup_end = time.time()\n    proc_start = time.time()\n    total_bytes_processed = 0\n    print(\"Time to startup:\", startup_end - startup_start)\n\n    if not os.path.exists(args.output_path):\n        os.makedirs(args.output_path)\n\n    trans_func = partial(\n        text_to_text,\n        output_path=args.output_path,\n        seg_func=args.cn_seg_func,\n        read_func=args.data_format)\n\n    encoded_files 
= pool.imap(trans_func, file_paths, 1)\n\n    out_paths = []\n    for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1):\n        total_bytes_processed += bytes_processed\n        out_paths.append(out_path)\n        master_start = time.time()\n\n        if i % args.log_interval == 0:\n            current = time.time()\n            elapsed = current - proc_start\n            mbs = total_bytes_processed / elapsed / 1024 / 1024\n            print(\n                f\"Processed {i} files\",\n                f\"({i/elapsed} files/s, {mbs} MB/s).\",\n                file=sys.stderr)\n    pool.close()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "ppfleetx/data/data_tools/gpt/README.md",
    "content": "## GPT 模型预训练数据准备流程(中文数据处理正在支持中)\n\n我们将预训练数据过程划分为以下2个部分：\n\n1. 原始数据转换，原始文本转换为jsonl的json字符串格式。\n2. 数据ID化，断句、分词、tokenize转化为token id格式。\n\n本目录下主要包含以下文件：\n```\n├── preprocess_data.py # 将jsonl文本，断句、分词后，tokenizer转化为token id。\n├── README.md # 预训练数据准备流程教程\n└── raw_trans_to_json.py # 原始文本数据转化的脚本，将数据转化为json串格式。\n```\n\n## 目录切换\n```\n# 如果您还未下载 PaddleFleetX 套件，请先 clone 套件\n# git clone https://github.com/PaddlePaddle/PaddleFleetX.git\ncd PaddleFleetX\n\n# 以下所有命令都在 PaddleFleetX 根目录中执行\n```\n\n## 环境依赖\n\n - paddlepaddle-gpu>=2.3.0\n - python==3.7\n - tqdm==4.54.1\n - numpy==1.20.1\n - pybind11==2.10.0\n\n安装命令`pip install -r requirements.txt`。\n\n\n## 训练全流程数据 Pipeline\n\n|步骤|阶段|数据格式| 样例|\n|-|-|-|-|\n| 原始数据清洗 | 原始数据准备|原始数据： <br/> 每个doc之间用空行间隔开 <br/> - 中文，默认每句换行符，作为句子结束。<br/> - 英文，默认使用nltk判断句子结束。doc是又一段或多端文字组成，每段文字由一句或多句话文字组成。  | ```飞桨是功能完备、开源开放的产业级深度学习平台。``` <br/> ```飞桨拥有核心训练和推理框架、基础模型库。``` <br/><br/> ```PaddleNLP是自然语言处理领域的优秀工具。```  |\n|原始数据转换<br/>`raw_trans_to_json.py`|预处理|jsonl格式：每个doc对应一行json字符串| ```{\"text\": \"飞桨是功能完备、开源开放的产业级深度学习平台。飞桨拥有...\"}```<br/>```{\"text\": \"PaddleNLP是自然语言...\"}```\n|数据ID化<br/>`preprocess_data.py`|预处理| npy格式：数据id化后的token id <br/>npz格式：数据句子、文章位置索引 | -\n\n\n## 全流程示例\n\n下面以 GPT 预训练为例，简要介绍一下预训练数据处理的全流程。\n\n### 原始数据\n首先下载样例数据：\n```\nmkdir -p dataset/wikitext_103_en\nwget -O dataset/wikitext_103_en/wikitext-103-en.txt http://fleet.bj.bcebos.com/datasets/gpt/wikitext-103-en.txt\n```\n### 原始数据转换 jsonl 格式\n使用`raw_trans_to_json.py`转化为json串格式，下面是脚本的使用说明\n```\noptional arguments:\n  -h, --help            show this help message and exit\n  --input_path INPUT_PATH\n                        Path to you raw files. 
Folder or file path.\n                        必须设置，可以是文件夹或者单个文件。文件夹中的目录默认最多搜索两层子目录。\n  --output_path OUTPUT_PATH\n                        Path to save the output json files.\n                        必须设置，输出文件的名字。\n  --json_key JSON_KEY   The content key of json file.\n                        建议不修改，默认的key是text\n  --doc_spliter DOC_SPLITER\n                        Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank.\n                        根据实际情况修改，默认空行作为文章换行符。\n  --min_doc_length MIN_DOC_LENGTH\n                        Minimal char of a documment.\n                        可选。过滤掉长度多短的文章，默认值10\n  --workers WORKERS     Number of worker processes to launch\n                        可选。多进程转化文件，适用于 input_path 中包含的文件数据较多的情况。每个文件，分配给不同worker处理\n  --log_interval LOG_INTERVAL\n                        Interval between progress updates.\n                        可选。此处的interval是值处理完文件个数的间隔。\n  --no-merge            Don't merge the file.\n                        可选。默认不开启这个选项，默认每个文件转换的jsonl文本，会拼接成到同一个文件。\n  --no-shuffle          Don't shuffle the file.\n                        可选。默认不开启这个选项，默认对处理完进行shuffle。\n```\n根据说明，我们使用下面简单命令，可以得到`wikitext_103_en.jsonl`文件。此处，我们对所有doc进行了shuffle。\n```shell\npython ppfleetx/data/data_tools/gpt/raw_trans_to_json.py  --input_path ./dataset/wikitext_103_en --output_path ./dataset/wikitext_103_en/wikitext_103_en\n\n# output of terminal\n# Time to startup: 0.0075109004974365234\n# Processed 1 files (0.12870440603278582 files/s, 64.80481421466284 MB/s).\n# Merging files into wikitext_103_en.jsonl\n# File save in wikitext_103_en.jsonl\n# Shuffling the jsonl file...\n# File shuffled!!!\n\n# 查看数据。因为对数据有 shuffle，下面的内容可能会不一样。\ntail -1 ./dataset/wikitext_103_en/wikitext_103_en.jsonl\n{\"text\": \"The album was released in June 1973 . Although it received good reviews , it did not sell well , except in Austin , where it sold more copies than earlier records by Nelson did nationwide . 
The recording led Nelson to a new style ; he later stated regarding his new musical identity that Shotgun Willie had \\\" cleared his throat . \\\" It became his breakthrough record , and one of the first of the outlaw movement , music created without the influence of the conservative Nashville Sound . The album — the first to feature Nelson with long hair and a beard on the cover — gained him the interest of younger audiences . It peaked at number 41 on Billboard 's album chart and the songs \\\" Shotgun Willie \\\" and \\\" Stay All Night ( Stay A Little Longer ) \\\" peaked at number 60 and 22 on Billboard Hot 100 respectively .\\nRolling Stone wrote : \\\" With this flawless album , Willie Nelson finally demonstrates why he has for so long been regarded as a Country & Western singer @-@ songwriter 's singer @-@ songwriter ... At the age of 39 , Nelson finally seems destined for the stardom he deserves \\\" . Robert Christgau wrote : \\\" This attempt to turn Nelson into a star runs into trouble when it induces him to outshout Memphis horns or Western swing . \\\"\\nBillboard wrote : \\\" This is Willie Nelson at his narrative best . He writes and sings with the love and the hurt and the down @-@ to @-@ earth things he feels , and he has a few peers . \\\" Texas Monthly praised Nelson and Wexler regarding the change in musical style : \\\" They 've switched his arrangements from Ray Price to Ray Charles — the result : a revitalized music . He 's the same old Willie , but veteran producer Jerry Wexler finally captured on wax the energy Nelson projects in person \\\" . School Library Journal wrote : \\\" Willie Nelson differs ( from ) rock artists framing their music with a country & western facade — in that he appears a honky @-@ tonk stardust cowboy to the core . 
This album abounds in unabashed sentimentalism , nasal singing , lyrics preoccupied with booze , religion , and love gone bad , and stereotyped Nashville instrumentation ( twangy steel guitars , fiddles , and a clean rhythm section characterized by the minimal use of bass drum and cymbals , both of which gain heavy mileage with rock performers ) .\\nStephen Thomas Erlewine wrote in his review for Allmusic : \\\" Willie Nelson offered his finest record to date for his debut – possibly his finest album ever . Shotgun Willie encapsulates Willie 's world view and music , finding him at a peak as a composer , interpreter , and performer . This is laid @-@ back , deceptively complex music , equal parts country , rock attitude , jazz musicianship , and troubadour storytelling \\\" .\\n\"}\n```\n\n### 数据ID化\n我们使用 `preprocess_data.py` 脚本将前面得到的 `wikitext_103_en.jsonl` 进行tokenize id化处理。\n```\noptional arguments:\n  -h, --help            show this help message and exit\n  --model_name MODEL_NAME\n                        What model to use.\n                        必须设置，如：gpt2\n  --tokenizer_name {ErnieTokenizer,BertTokenizer,GPTTokenizer,GPTChineseTokenizer}\n                        What type of tokenizer to use.\n                        模型对应的tokenizer, 目前暂时只支持 Ernie，Bert，GPT\ndata input/output:\n  --input_path INPUT_PATH\n                        Path to input JSON files.\n                        必须设置，输入文件jsonl的目录\n  --output_prefix OUTPUT_PREFIX\n                        Output prefix to store output file.\n                        必须设置，输出文件的名称。\n                        假设名称为XXX，则会输出 XXX_ids.npy, XXX_idx.npz 两个文件。\n                        npy文件，数据id化后的token ids; npz文件，数据句子、文章位置索引。\n  --data_format {JSON}  Only support json format for now. One document per line.\n                        不需要设置。目前默认处理jsonl数据格式\n  --json_key JSON_KEY   For JSON format. 
Space separate listed of keys to extract from json\n                        文本串json的key值。同前面trans_to_json.py的json_key，默认text为key\n  --split_sentences     Split documents into sentences.\n                        是否需要将文章划分成句子。一般而言，GPT不需要，Bert/Ernie模型需要\n\nchinese words:\n  --chinese             Is corpus need words segmentation step for chinese words.\n                        中文情形必须设置。处理的文本类型是否是中文。\n  --cn_whole_word_segment\n                        Is corpus need words segmentation step for chinese words WWM.\n                        可选。是否需要WWM策略。一般而言，Bert/Ernie模型需要，GPT不需要。\n  --cn_seg_func {lac,seg,jieba}\n                        Words segment function for chinese words.\n                        默认jieba，jieba速度较快，lac模型更准确，计算量高。\n  --cn_splited          Is chinese corpus is splited in to words.\n                        分词后的文本，可选。设置此选项则，cn_seg_func不起作用。\n                        例如分词后文本串 \"中国 效仿 西方 发展 工业 的过 程\"\n  --cn_split_dimer CN_SPLIT_DIMER\n                        Split dimer between chinese words.\n                        配合cn_splited使用，默认空格表示分词间隔。\n\ncommon config:\n  --append_eos          Append an <eos> token to the end of a document.\n                        gpt模型专用，gpt设置此选项，表示doc结束。\n  --log_interval LOG_INTERVAL\n                        Interval between progress updates\n                        打印日志间隔，interval表示处理 文本行数/doc数的 间隔。\n  --workers WORKERS     Number of worker processes to launch\n                        处理文本id化的进程个数。\n```\n通过下面脚本转化，我们可以得到处理好的预训练数据，token ids:`wikitext_103_en.npy`, 文章索引信息`wikitext_103_en.npz`.\n在使用 `GPTTokenizer` 时需要用到 `gpt2-vocab.json` 与 `gpt2-merges.txt`，如果没有下载缓存过这两个文件，脚本会自动下载并缓存。当遇到网络问题时，可以自行下载并将这两个文件放置在 `~/.cache/ppfleetx/` 目录下。\n``` \npython ppfleetx/data/data_tools/gpt/preprocess_data.py \\\n    --model_name gpt2 \\\n    --tokenizer_name GPTTokenizer \\\n    --data_format JSON \\\n    --input_path ./dataset/wikitext_103_en/wikitext_103_en.jsonl \\\n    --append_eos \\\n    --output_prefix 
./dataset/wikitext_103_en/wikitext_103_en  \\\n    --workers 40 \\\n    --log_interval 1000\n    \n# 处理完后 terminal 输出\n# Processed 267000 documents (9843.34 docs/s, 18.4880 MB/s).\n# Processed 268000 documents (9869.46 docs/s, 18.5351 MB/s).\n# 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:27<00:00, 27.17s/it]\n# Saving tokens to files...\n# Total sentences num: 268492\n# Total documents num: 268492\n# Total tokens num: 114130026\n# Average tokens per sentence: 425.08\n# Average tokens per document: 425.08\n```\n\n## 参考内容\n\n注: 大部分数据流程，参考自[Megatron](https://github.com/NVIDIA/Megatron-LM)和[PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP)，特此表达感谢。\n"
  },
  {
    "path": "ppfleetx/data/data_tools/gpt/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/data/data_tools/gpt/preprocess_data.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport io\nimport re\nimport argparse\nimport json\nimport multiprocessing\nimport sys\nimport time\n\nimport numpy as np\nfrom tqdm import tqdm\n\ntry:\n    from ppfleetx.data import tokenizers as tfs\nexcept ImportError:\n    __dir__ = os.path.dirname(os.path.abspath(__file__))\n    sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../')))\n    from ppfleetx.data import tokenizers as tfs\n    from ppfleetx.utils.log import logger\n\ntry:\n    import nltk\n    nltk_available = True\nexcept ImportError:\n    nltk_available = False\n\nCHINESE_SEG_FUNC = {}\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        '--model_name', type=str, required=True, help='What model to use.')\n    parser.add_argument(\n        '--tokenizer_name',\n        type=str,\n        required=True,\n        choices=[\n            'ErnieTokenizer', 'BertTokenizer', 'GPTTokenizer',\n            'GPTChineseTokenizer', 'ElectraTokenizer'\n        ],\n        help='What type of tokenizer to use.')\n    group = parser.add_argument_group(title='data input/output')\n    group.add_argument(\n        '--input_path',\n        type=str,\n        required=True,\n        help='Path to input JSON files.')\n    group.add_argument(\n        '--output_prefix',\n        type=str,\n        required=True,\n        
help='Output prefix to store output file.')\n    group.add_argument(\n        '--data_format',\n        type=str,\n        default='text',\n        choices=['JSON'],\n        help='Only support json format for now. One document per line.')\n    group.add_argument(\n        '--json_key',\n        type=str,\n        default='text',\n        help='For JSON format. Space separate listed of keys to extract from json'\n    )\n    group.add_argument(\n        '--split_sentences',\n        action='store_true',\n        help='Split documents into sentences.')\n\n    group = parser.add_argument_group(title='chinese words')\n    group.add_argument(\n        '--chinese',\n        action='store_true',\n        help=\"Is corpus need words segmentation step for chinese words.\")\n    group.add_argument(\n        '--cn_whole_word_segment',\n        action='store_true',\n        help=\"Is corpus need words segmentation step for chinese words WWM.\")\n    group.add_argument(\n        '--cn_seg_func',\n        type=str,\n        default='jieba',\n        choices=['lac', 'seg', 'jieba'],\n        help='Words segment function for chinese words.')\n    group.add_argument(\n        '--cn_splited',\n        action='store_true',\n        help=\"Is chinese corpus is splited in to words.\")\n    group.add_argument(\n        '--cn_split_dimer',\n        type=str,\n        default=' ',\n        help=\"Split dimer between chinese words.\")\n\n    group = parser.add_argument_group(title='common config')\n    group.add_argument(\n        '--append_eos',\n        action='store_true',\n        help='Append an <eos> token to the end of a document.')\n    group.add_argument(\n        '--log_interval',\n        type=int,\n        default=100,\n        help='Interval between progress updates')\n    group.add_argument(\n        '--workers',\n        type=int,\n        default=1,\n        help='Number of worker processes to launch')\n\n    args = parser.parse_args()\n    if args.chinese:\n        global 
CHINESE_SEG_FUNC\n        CHINESE_SEG_FUNC['lac'] = lexical_analysis_fn()\n        CHINESE_SEG_FUNC['seg'] = chinese_segmentation_fn()\n        CHINESE_SEG_FUNC['jieba'] = jieba_segmentation_fn()\n\n    return args\n\n\ndef lexical_analysis_fn():\n    from LAC import LAC\n    lac = LAC(mode=\"lac\")\n\n    def process(line):\n        words, _ = lac.run(line)\n        return words\n\n    return process\n\n\ndef chinese_segmentation_fn():\n    from LAC import LAC\n    lac_cws = LAC(mode='seg')\n\n    def process(line):\n        words = lac.run(line)\n        return words\n\n    return process\n\n\ndef jieba_segmentation_fn():\n    import jieba\n\n    def process(line):\n        words = jieba.cut(line)\n        return list(words)\n\n    return process\n\n\ndef get_whole_word_mask_tokens(tokens, words, max_word_length=4):\n    \"\"\"\n    Do whole word mask on Chinese word.\n    First, we do Chinese word segmentation on the sequence of tokens, which are from the WordPiece tokenization.\n    Then, we add the '##' mark on chinese characters which are in the middle of Chinese words.\n    And if the tokens are not chinese characters, we just exploit the results of WordPiece tokenization as words.\n    Such as, \n         - text line : 通过利用mercer核，将样本从输入空间映射到高维特征空间，使原来没有显现的特征突现出来，取得了很好的图像分割效果。\n         - the input tokens (after WordPiece): \n            ['通', '过', '利', '用', 'me', '##rc', '##er', '核', '，', '将', '样', '本', '从', '输', '入', '空', '间', '映', \n            '射', '到', '高', '维', '特', '征', '空', '间', '，', '使', '原', '来', '没', '有', '显', '现', '的', '特', '征', \n            '突', '现', '出', '来', '，', '取', '得', '了', '很', '好', '的', '图', '像', '分', '割', '效', '果', '。']\n        - the Chinese words (after Chinese word segmentation like jieba)\n            ['通过', '利用', 'mercer', '核', '，', '将', '样本', '从', '输入', '空间', '映射', '到', '高维', '特征', \n            '空间', '，', '使', '原来', '没有', '显现', '的', '特征', '突现', '出来', '，', '取得', '了', '很', '好', \n            '的', '图像', '分割', '效果', '。']\n        - 
the output whole word mask tokens:\n            ['通', '##过', '利', '##用', 'me', '##rc', '##er', '核', '，', '将', '样', '##本', '从', '输', '##入', \n            '空', '##间', '映', '##射', '到', '高', '##维', '特', '##征', '空', '##间', '，', '使', '原', '##来', \n            '没', '##有', '显', '##现', '的', '特', '##征', '突', '##现', '出', '##来', '，', '取', '##得', '了', \n            '很', '好', '的', '图', '##像', '分', '##割', '效', '##果', '。']\n    Args:\n        tokens(list(str)): The sequence of tokens, which are from the WordPiece tokenization.\n        words(list(str)): The sequence of Chinese words.\n        max_word_length(int, optional): \n            The maximum chinese character in Chinese words. It avoids too long Chinese word to be masked.\n            Defaults as 4.\n    Returns:\n         new_tokens(list(str)): The new token will be done with whole word masking strategy.\n    \"\"\"\n\n    new_tokens = []\n    # opt for long document\n    words_set = set(words)\n    i = 0\n    while i < len(tokens):\n        # non-chinese character, then do word piece\n        if len(re.findall('[\\u4E00-\\u9FA5]', tokens[i])) == 0:\n            new_tokens.append(tokens[i])\n            i += 1\n            continue\n\n        # add \"##\" mark on the middel tokens of Chinese words\n        # such as [\"通过\", \"利用\"] -> [\"通\", \"##过\"， \"利\", \"##用\"]\n        has_add = False\n        for length in range(max_word_length, 0, -1):\n            if i + length > len(tokens):\n                continue\n            if ''.join(tokens[i:i + length]) in words_set:\n                new_tokens.append(tokens[i])\n                for l in range(1, length):\n                    new_tokens.append('##' + tokens[i + l])\n                i += length\n                has_add = True\n                break\n\n        if not has_add:\n            new_tokens.append(tokens[i])\n            i += 1\n    return new_tokens\n\n\nclass IdentitySplitter(object):\n    def tokenize(self, *text):\n        return text\n\n\nclass 
NewlineSplitter():\n    def tokenize(self, text):\n        return text.split(\"\\n\")\n\n\nclass Converter(object):\n    def __init__(self, args):\n        self.args = args\n\n    def initializer(self):\n        Converter.tokenizer = getattr(\n            tfs,\n            self.args.tokenizer_name).from_pretrained(self.args.model_name)\n\n        # Split document to sentence.\n        if self.args.split_sentences:\n            if self.args.chinese:\n                Converter.splitter = NewlineSplitter()\n            else:\n                if not nltk_available:\n                    print(\"NLTK is not available to split sentences.\")\n                    exit()\n                splitter = nltk.load(\"tokenizers/punkt/english.pickle\")\n                Converter.splitter = splitter\n        else:\n            Converter.splitter = IdentitySplitter()\n\n        # Split sentence whole words mask for chinese\n        if self.args.cn_whole_word_segment:\n            if self.args.cn_splited:\n                Converter.segment_func = lambda text: text.split(self.args.cn_split_dimer)\n            else:\n                Converter.segment_func = CHINESE_SEG_FUNC[\n                    self.args.cn_seg_func]\n            Converter.whole_word_mask = get_whole_word_mask_tokens\n        else:\n            Converter.segment_func = lambda x: x\n            Converter.whole_word_mask = lambda x, y: x\n\n        def process(text):\n            words = Converter.segment_func(text)\n            tokens = Converter.tokenizer.tokenize(\"\".join(words))\n            tokens = Converter.whole_word_mask(tokens, words)\n            tokens = Converter.tokenizer.convert_tokens_to_ids(tokens)\n            return tokens\n\n        Converter.process = process\n\n    def encode(self, json_line):\n        text = json.loads(json_line)[self.args.json_key]\n        doc_ids = []\n        for sentence in Converter.splitter.tokenize(text):\n            sentence_ids = Converter.process(sentence.strip())\n     
       if len(sentence_ids) > 0:\n                doc_ids.append(sentence_ids)\n\n        if len(doc_ids) > 0 and self.args.append_eos:\n            doc_ids[-1].append(Converter.tokenizer.eos_token_id)\n\n        return doc_ids, len(text.encode(\"utf-8\"))\n\n\ndef main():\n    args = get_args()\n\n    file_paths = []\n    if os.path.isfile(args.input_path):\n        file_paths.append(args.input_path)\n    else:\n        for root, _, fs in os.walk(args.input_path):\n            for f in fs:\n                file_paths.append(os.path.join(root, f))\n    if len(file_paths) == 0:\n        print(\"No input file found!\")\n        exit(-1)\n\n    convert = Converter(args)\n\n    # Try tokenizer is availiable\n    sample_tokenizer = getattr(\n        tfs, args.tokenizer_name).from_pretrained(args.model_name)\n    if sample_tokenizer.vocab_size < 2**16 - 1:\n        save_dtype = np.uint16\n    else:\n        save_dtype = np.int32\n\n    pool = multiprocessing.Pool(args.workers, initializer=convert.initializer)\n\n    # We use BytesIO to store the ids.\n    token_ids_stream = io.BytesIO()\n    sentlens_stream = io.BytesIO()\n    # # Cumsum on tokens num\n    # sent_cumsum_stream = io.BytesIO()\n    # sent_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True))\n    # Cunsum on document on every sentence num, type=np.int64\n    doc_cumsum_stream = io.BytesIO()\n    doc_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True))\n\n    sent_count = 0\n    # token_count = 0\n\n    file_paths.sort()\n\n    step = 0\n    total_bytes_processed = 0\n    startup_start = time.time()\n    for file_path in tqdm(file_paths):\n        if file_path.endswith(\".zst\"):\n            import zstandard\n            cctx = zstandard.ZstdDecompressor()\n            fh = open(file_path, 'rb')\n            text = io.BufferedReader(cctx.stream_reader(fh))\n        elif file_path.endswith(\".jsonl\"):\n            text = open(file_path, 'r', encoding='utf-8')\n        
else:\n            print(\"Unexpected data format, skiped %s\" % file_path)\n            continue\n\n        encoded_docs = pool.imap(convert.encode, text, 256)\n        print(\"Processing %s\" % file_path)\n        for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):\n            step += 1\n            total_bytes_processed += bytes_processed\n            if len(doc) == 0:\n                continue\n\n            for sentence in doc:\n                sentence_len = len(sentence)\n                if sentence_len == 0:\n                    continue\n                sentlens_stream.write(\n                    sentence_len.to_bytes(\n                        4, byteorder='little', signed=True))\n                # token_count += sentence_len\n                # sent_cumsum_stream.write(\n                #     token_count.to_bytes(\n                #         8, byteorder='little', signed=True))\n                sent_count += 1\n                token_ids_stream.write(\n                    np.array(\n                        sentence, dtype=save_dtype).tobytes(order='C'))\n\n            doc_cumsum_stream.write(\n                sent_count.to_bytes(\n                    8, byteorder='little', signed=True))\n\n            if step % args.log_interval == 0:\n                current = time.time()\n                elapsed = current - startup_start\n                mbs = total_bytes_processed / elapsed / 1024 / 1024\n                print(\n                    f\"Processed {step} documents\",\n                    f\"({step/elapsed:.2f} docs/s, {mbs:.4f} MB/s).\",\n                    file=sys.stderr)\n\n    pool.close()\n    print(\"Saving tokens to files...\")\n    all_doc_ids = np.frombuffer(token_ids_stream.getbuffer(), dtype=save_dtype)\n    lens = np.frombuffer(sentlens_stream.getbuffer(), dtype=np.int32)\n    # sents = np.frombuffer(sent_cumsum_stream.getbuffer(), dtype=np.int64)\n    docs = np.frombuffer(doc_cumsum_stream.getbuffer(), dtype=np.int64)\n    
np.save(args.output_prefix + \"_ids.npy\", all_doc_ids)\n    # np.savez(args.output_prefix + \"_idx.npz\", lens=lens, sents=sents, docs=docs)\n    np.savez(args.output_prefix + \"_idx.npz\", lens=lens, docs=docs)\n\n    print(\"Total sentences num: %d\" % len(lens))\n    print(\"Total documents num: %d\" % (len(docs) - 1))\n    print(\"Total tokens num: %d\" % len(all_doc_ids))\n    print(\"Average tokens per sentence: %.2f\" % (len(all_doc_ids) / len(lens)))\n    print(\"Average tokens per document: %.2f\" % (len(all_doc_ids) /\n                                                 (len(docs) - 1)))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "ppfleetx/data/data_tools/gpt/raw_trans_to_json.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport re\nimport argparse\nimport json\nimport multiprocessing\nimport sys\nimport time\nimport shutil\nfrom functools import partial\n\nimport numpy as np\nfrom tqdm import tqdm\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        '--input_path',\n        type=str,\n        required=True,\n        help='Path to you raw files. Folder or file path.')\n    parser.add_argument(\n        '--output_path',\n        type=str,\n        required=True,\n        help='Path to save the output json files.')\n    parser.add_argument(\n        '--json_key',\n        type=str,\n        default='text',\n        help='The content key of json file.')\n    parser.add_argument(\n        '--doc_spliter',\n        type=str,\n        default='',\n        help=\"Spliter between documents. 
We will strip the line, if you use blank line to split doc, leave it blank.\"\n    )\n    parser.add_argument(\n        '--min_doc_length',\n        type=int,\n        default=10,\n        help=\"Minimal char of a documment.\")\n    parser.add_argument(\n        '--workers',\n        type=int,\n        default=1,\n        help='Number of worker processes to launch')\n    parser.add_argument(\n        '--log_interval',\n        type=int,\n        default=1,\n        help='Interval between progress updates.')\n    parser.add_argument(\n        '--no-merge', action='store_true', help='Don\\'t merge the file.')\n    parser.add_argument(\n        '--no-shuffle', action='store_true', help='Don\\'t shuffle the file.')\n    args = parser.parse_args()\n    return args\n\n\ndef raw_text_to_json(path, doc_spliter=\"\", json_key=\"text\", min_doc_length=10):\n    path = os.path.abspath(path)\n    if not os.path.exists(path):\n        print(\"No found file %s\" % path)\n        return 0, None\n\n    out_filepath = path + \".jsonl\"\n    fout = open(out_filepath, \"w\", encoding=\"utf-8\")\n    len_files = 0\n    with open(path, \"r\") as f:\n        doc = \"\"\n        line = f.readline()\n        while line:\n            len_files += len(line)\n            if line.strip() == doc_spliter:\n                if len(doc) > min_doc_length:\n                    fout.write(\n                        json.dumps(\n                            {\n                                json_key: doc\n                            }, ensure_ascii=False) + \"\\n\")\n                doc = \"\"\n            else:\n                doc += line\n            line = f.readline()\n\n        if len(doc) > min_doc_length:\n            fout.write(json.dumps({json_key: doc}, ensure_ascii=False) + \"\\n\")\n        doc = \"\"\n\n    return len_files, out_filepath\n\n\ndef merge_file(file_paths, output_path):\n    if not output_path.endswith(\".jsonl\"):\n        output_path = output_path + \".jsonl\"\n    
print(\"Merging files into %s\" % output_path)\n    with open(output_path, 'wb') as wfd:\n        for f in file_paths:\n            if f is not None and os.path.exists(f):\n                with open(f, 'rb') as fd:\n                    shutil.copyfileobj(fd, wfd)\n                os.remove(f)\n    print(\"File save in %s\" % output_path)\n    return output_path\n\n\ndef shuffle_file(output_path):\n    print(\"Shuffling the jsonl file...\")\n    if os.path.exists(output_path):\n        os.system(\"shuf %s -o %s\" % (output_path, output_path))\n        print(\"File shuffled!!!\")\n    else:\n        raise ValueError(\"File not found: %s\" % output_path)\n\n\ndef main():\n    args = get_args()\n    startup_start = time.time()\n\n    file_paths = []\n    if os.path.isfile(args.input_path):\n        file_paths.append(args.input_path)\n    else:\n        for root, _, fs in os.walk(args.input_path):\n            for f in fs:\n                file_paths.append(os.path.join(root, f))\n\n    pool = multiprocessing.Pool(args.workers)\n\n    startup_end = time.time()\n    proc_start = time.time()\n    total_bytes_processed = 0\n    print(\"Time to startup:\", startup_end - startup_start)\n\n    trans_json = partial(\n        raw_text_to_json,\n        doc_spliter=args.doc_spliter,\n        json_key=args.json_key,\n        min_doc_length=args.min_doc_length)\n    encoded_files = pool.imap(trans_json, file_paths, 1)\n\n    out_paths = []\n    for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1):\n        total_bytes_processed += bytes_processed\n        out_paths.append(out_path)\n        master_start = time.time()\n\n        if i % args.log_interval == 0:\n            current = time.time()\n            elapsed = current - proc_start\n            mbs = total_bytes_processed / elapsed / 1024 / 1024\n            print(\n                f\"Processed {i} files\",\n                f\"({i/elapsed} files/s, {mbs} MB/s).\",\n                file=sys.stderr)\n\n    if 
not args.no_merge:\n        output_path = merge_file(out_paths, args.output_path)\n        if not args.no_shuffle:\n            shuffle_file(output_path)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "ppfleetx/data/dataset/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .vision_dataset import (\n    GeneralClsDataset,\n    ImageFolder,\n    CIFAR10,\n    ContrativeLearningDataset, )\n\nfrom .multimodal_dataset import ImagenDataset\nfrom .gpt_dataset import GPTDataset, LM_Eval_Dataset, Lambada_Eval_Dataset\nfrom .glue_dataset import *\nfrom .ernie.ernie_dataset import ErnieDataset, ErnieSeqClsDataset\n"
  },
  {
    "path": "ppfleetx/data/dataset/ernie/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/data/dataset/ernie/dataset_utils.py",
    "content": "# coding=utf-8\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Most of the code here has been copied from:\n#   https://github.com/google-research/albert/blob/master/create_pretraining_data.py\n# with some modifications.\n\nimport math\nimport os\nimport re\nimport time\nimport collections\n\nimport numpy as np\nimport paddle\n\n\ndef get_local_rank():\n    return int(os.getenv(\"PADDLE_RANK_IN_NODE\", 0))\n\n\nprint_rank_0 = print\n\n# COMPILED = False\n# DSET_TYPE_BERT = 'standard_bert'\n# DSET_TYPE_T5 = 't5'\n# DSET_TYPE_ERNIE = 'ernie'\n\n# DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_T5, DSET_TYPE_ERNIE]\n\n\ndef get_datasets_weights_and_num_samples(data_prefix,\n                                         train_valid_test_num_samples):\n\n    # The data prefix should be in the format of:\n    #   weight-1, data-prefix-1, weight-2, data-prefix-2, ..\n    assert len(data_prefix) % 2 == 0\n    num_datasets = len(data_prefix) // 2\n    weights = [0] * num_datasets\n    prefixes = [0] * num_datasets\n    for i in range(num_datasets):\n        weights[i] = float(data_prefix[2 * i])\n        prefixes[i] = (data_prefix[2 * i + 1]).strip()\n    # Normalize weights\n    weight_sum = 0.0\n    for weight in weights:\n        weight_sum += weight\n    assert weight_sum > 0.0\n    weights = [weight / weight_sum for weight in 
weights]\n\n    # Add 0.5% (the 1.005 factor) so in case the bleding dataset does\n    # not uniformly distribute the number of samples, we still have\n    # samples left to feed to the network.\n    datasets_train_valid_test_num_samples = []\n    for weight in weights:\n        datasets_train_valid_test_num_samples.append([\n            int(math.ceil(val * weight * 1.005))\n            for val in train_valid_test_num_samples\n        ])\n\n    return prefixes, weights, datasets_train_valid_test_num_samples\n\n\nclass MMapIndexedDataset(paddle.io.Dataset):\n    def __init__(self, path, skip_warmup=False):\n        super().__init__()\n\n        self._path = path\n\n        # All documment ids, extend as 1-D array.\n\n        for suffix in [\"_ids.npy\", \"_idx.npz\"]:\n            # print(path, suffix)\n            if not os.path.isfile(path + suffix):\n                raise ValueError(\"File Not found, %s\" % (path + suffix))\n\n        self._token_ids = np.load(\n            path + \"_ids.npy\", mmap_mode=\"r\", allow_pickle=True)\n        process_data = np.load(path + \"_idx.npz\")\n        self._sizes = process_data[\"lens\"]\n        self._pointers = np.empty(len(self._sizes) + 1, dtype=np.int64)\n        self._pointers[0] = 0\n        np.cumsum(self._sizes, out=self._pointers[1:])\n        self._doc_idx = process_data[\"docs\"]\n\n    def __getstate__(self):\n        return self._path\n\n    def __len__(self):\n        return len(self._sizes)\n\n    # @lru_cache(maxsize=8)\n    def __getitem__(self, idx):\n        if isinstance(idx, int):\n            size = self._sizes[idx]\n            ptr = self._pointers[idx]\n            np_array = self._token_ids[ptr:ptr + size]\n            return np_array\n\n        elif isinstance(idx, slice):\n            start, stop, step = idx.indices(len(self))\n            if step != 1:\n                raise ValueError(\n                    \"Slices into indexed_dataset must be contiguous\")\n            ptr = 
self._pointers[start]\n            sizes = self._sizes[idx]\n            offsets = list(np.cumsum(sizes))\n            total_size = sum(sizes)\n            np_array = self._token_ids[ptr:ptr + total_size]\n            sents = np.split(np_array, offsets[:-1])\n            return sents\n\n    def get(self, idx, offset=0, length=None):\n        \"\"\" Retrieves a single item from the dataset with the option to only\n        return a portion of the item.\n\n        get(idx) is the same as [idx] but get() does not support slicing.\n        \"\"\"\n        size = self._sizes[idx]\n        ptr = self._pointers[idx]\n\n        if length is None:\n            length = size - offset\n        ptr += offset\n        np_array = self._token_ids[ptr:ptr + length]\n        return np_array\n\n    @property\n    def sizes(self):\n        return self._sizes\n\n    @property\n    def doc_idx(self):\n        return self._doc_idx\n\n    def get_doc_idx(self):\n        return self._doc_idx\n\n    def set_doc_idx(self, doc_idx_):\n        self._doc_idx = doc_idx_\n\n\ndef make_indexed_dataset(data_prefix, data_impl=None, skip_warmup=False):\n    return MMapIndexedDataset(data_prefix)\n\n\ndef get_a_and_b_segments(sample, np_rng):\n    \"\"\"Divide sample into a and b segments.\"\"\"\n\n    # Number of sentences in the sample.\n    n_sentences = len(sample)\n    # Make sure we always have two sentences.\n    assert n_sentences > 1, 'make sure each sample has at least two sentences.'\n\n    # First part:\n    # `a_end` is how many sentences go into the `A`.\n    a_end = 1\n    if n_sentences >= 3:\n        # Note that randint in numpy is exclusive.\n        a_end = np_rng.randint(1, n_sentences)\n    tokens_a = []\n    for j in range(a_end):\n        tokens_a.extend(sample[j])\n\n    # Second part:\n    tokens_b = []\n    for j in range(a_end, n_sentences):\n        tokens_b.extend(sample[j])\n\n    # Random next:\n    is_next_random = False\n    if np_rng.random() < 0.5:\n        
is_next_random = True\n        tokens_a, tokens_b = tokens_b, tokens_a\n\n    return tokens_a, tokens_b, is_next_random\n\n\ndef truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens,\n                      np_rng):\n    \"\"\"Truncates a pair of sequences to a maximum sequence length.\"\"\"\n    #print(len_a, len_b, max_num_tokens)\n    assert len_a > 0\n    if len_a + len_b <= max_num_tokens:\n        return False\n    while len_a + len_b > max_num_tokens:\n        if len_a > len_b:\n            len_a -= 1\n            tokens = tokens_a\n        else:\n            len_b -= 1\n            tokens = tokens_b\n        if np_rng.random() < 0.5:\n            del tokens[0]\n        else:\n            tokens.pop()\n    return True\n\n\ndef create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):\n    \"\"\"Merge segments A and B, add [CLS] and [SEP] and build tokentypes.\"\"\"\n\n    tokens = []\n    tokentypes = []\n    # [CLS].\n    tokens.append(cls_id)\n    tokentypes.append(0)\n    # Segment A.\n    for token in tokens_a:\n        tokens.append(token)\n        tokentypes.append(0)\n    # [SEP].\n    tokens.append(sep_id)\n    tokentypes.append(0)\n    # Segment B.\n    for token in tokens_b:\n        tokens.append(token)\n        tokentypes.append(1)\n    if tokens_b:\n        # [SEP].\n        tokens.append(sep_id)\n        tokentypes.append(1)\n\n    return tokens, tokentypes\n\n\nMaskedLmInstance = collections.namedtuple(\"MaskedLmInstance\",\n                                          [\"index\", \"label\"])\n\n\ndef is_start_piece(piece):\n    \"\"\"Check if the current word piece is the starting piece (BERT).\"\"\"\n    # When a word has been split into\n    # WordPieces, the first token does not have any marker and any subsequence\n    # tokens are prefixed with ##. 
So whenever we see the ## token, we\n    # append it to the previous set of word indexes.\n    return not piece.startswith(\"##\")\n\n\ndef create_masked_lm_predictions(tokens,\n                                 vocab_id_list,\n                                 vocab_id_to_token_dict,\n                                 masked_lm_prob,\n                                 cls_id,\n                                 sep_id,\n                                 mask_id,\n                                 max_predictions_per_seq,\n                                 np_rng,\n                                 max_ngrams=3,\n                                 vocab_token_to_id_dict=None,\n                                 do_whole_word_mask=True,\n                                 favor_longer_ngram=False,\n                                 do_permutation=False,\n                                 geometric_dist=False,\n                                 to_chinese_char=False,\n                                 inplace_random_mask=False,\n                                 masking_style=\"bert\"):\n    \"\"\"Creates the predictions for the masked LM objective.\n    Note: Tokens here are vocab ids and not text tokens.\"\"\"\n\n    cand_indexes = []\n    # Note(mingdachen): We create a list for recording if the piece is\n    # the starting piece of current token, where 1 means true, so that\n    # on-the-fly whole word masking is possible.\n    token_boundary = [0] * len(tokens)\n\n    for (i, token) in enumerate(tokens):\n        if token == cls_id or token == sep_id:\n            token_boundary[i] = 1\n            continue\n        # Whole Word Masking means that if we mask all of the wordpieces\n        # corresponding to an original word.\n        #\n        # Note that Whole Word Masking does *not* change the training code\n        # at all -- we still predict each WordPiece independently, softmaxed\n        # over the entire vocabulary.\n        vocab_id = vocab_id_to_token_dict[token]\n        
if (do_whole_word_mask and len(cand_indexes) >= 1 and\n                not is_start_piece(vocab_id)):\n            cand_indexes[-1].append(i)\n        else:\n            cand_indexes.append([i])\n            if is_start_piece(vocab_id_to_token_dict[token]):\n                token_boundary[i] = 1\n\n    if to_chinese_char:\n        # set ## chinse char to original chinese char\n        char_tokens = []\n        assert vocab_token_to_id_dict is not None\n        for i, b in enumerate(token_boundary):\n            if b == 0:\n                vocab_id = vocab_id_to_token_dict[tokens[i]]\n                new_vocab_id = vocab_id[2:] if len(\n                    re.findall('##[\\u4E00-\\u9FA5]',\n                               vocab_id)) > 0 else vocab_id\n                char_tokens.append(vocab_token_to_id_dict[new_vocab_id]\n                                   if new_vocab_id in vocab_token_to_id_dict\n                                   else token)\n            else:\n                char_tokens.append(tokens[i])\n        output_tokens = list(char_tokens)\n    else:\n        output_tokens = list(tokens)\n\n    masked_lm_positions = []\n    masked_lm_labels = []\n\n    if masked_lm_prob == 0:\n        return (output_tokens, masked_lm_positions, masked_lm_labels,\n                token_boundary)\n\n    # NOTE(shenliang03): to avoid num_to_predict < 1\n    num_to_predict = max(1,\n                         min(max_predictions_per_seq,\n                             max(1, int(round(len(tokens) * masked_lm_prob)))))\n\n    ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)\n    if not geometric_dist:\n        # Note(mingdachen):\n        # By default, we set the probilities to favor shorter ngram sequences.\n        pvals = 1. 
/ np.arange(1, max_ngrams + 1)\n        pvals /= pvals.sum(keepdims=True)\n        if favor_longer_ngram:\n            pvals = pvals[::-1]\n\n    ngram_indexes = []\n    for idx in range(len(cand_indexes)):\n        ngram_index = []\n        for n in ngrams:\n            ngram_index.append(cand_indexes[idx:idx + n])\n        ngram_indexes.append(ngram_index)\n\n    np_rng.shuffle(ngram_indexes)\n\n    (masked_lms, masked_spans) = ([], [])\n    covered_indexes = set()\n    backup_output_tokens = list(output_tokens)\n    for cand_index_set in ngram_indexes:\n        if len(masked_lms) >= num_to_predict:\n            break\n        if not cand_index_set:\n            continue\n        # Note(mingdachen):\n        # Skip current piece if they are covered in lm masking or previous ngrams.\n        for index_set in cand_index_set[0]:\n            for index in index_set:\n                if index in covered_indexes:\n                    continue\n\n        if not geometric_dist:\n            n = np_rng.choice(\n                ngrams[:len(cand_index_set)],\n                p=pvals[:len(cand_index_set)] /\n                pvals[:len(cand_index_set)].sum(keepdims=True))\n        else:\n            # Sampling \"n\" from the geometric distribution and clipping it to\n            # the max_ngrams. 
Using p=0.2 default from the SpanBERT paper\n            # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1)\n            n = min(np_rng.geometric(0.2), max_ngrams)\n\n        index_set = sum(cand_index_set[n - 1], [])\n        n -= 1\n        # Note(mingdachen):\n        # Repeatedly looking for a candidate that does not exceed the\n        # maximum number of predictions by trying shorter ngrams.\n        while len(masked_lms) + len(index_set) > num_to_predict:\n            if n == 0:\n                break\n            index_set = sum(cand_index_set[n - 1], [])\n            n -= 1\n        # If adding a whole-word mask would exceed the maximum number of\n        # predictions, then just skip this candidate.\n        if len(masked_lms) + len(index_set) > num_to_predict:\n            continue\n        is_any_index_covered = False\n        for index in index_set:\n            if index in covered_indexes:\n                is_any_index_covered = True\n                break\n        if is_any_index_covered:\n            continue\n        for index in index_set:\n            covered_indexes.add(index)\n            masked_token = None\n            if masking_style == \"bert\":\n                # 80% of the time, replace with [MASK]\n                if np_rng.random() < 0.8:\n                    masked_token = mask_id\n                else:\n                    # 10% of the time, keep original\n                    if np_rng.random() < 0.5:\n                        masked_token = output_tokens[index]\n                    # 10% of the time, replace with random word\n                    else:\n                        if inplace_random_mask:\n                            masked_token = backup_output_tokens[np_rng.randint(\n                                0, len(output_tokens))]\n                        else:\n                            masked_token = vocab_id_list[np_rng.randint(\n                                0, len(vocab_id_list))]\n            elif masking_style == 
\"t5\":\n                masked_token = mask_id\n            else:\n                raise ValueError(\"invalid value of masking style\")\n\n            output_tokens[index] = masked_token\n            masked_lms.append(\n                MaskedLmInstance(\n                    index=index, label=backup_output_tokens[index]))\n\n        masked_spans.append(\n            MaskedLmInstance(\n                index=index_set,\n                label=[backup_output_tokens[index] for index in index_set]))\n\n    assert len(masked_lms) <= num_to_predict\n    np_rng.shuffle(ngram_indexes)\n\n    select_indexes = set()\n    if do_permutation:\n        for cand_index_set in ngram_indexes:\n            if len(select_indexes) >= num_to_predict:\n                break\n            if not cand_index_set:\n                continue\n            # Note(mingdachen):\n            # Skip current piece if they are covered in lm masking or previous ngrams.\n            for index_set in cand_index_set[0]:\n                for index in index_set:\n                    if index in covered_indexes or index in select_indexes:\n                        continue\n\n            n = np.random.choice(\n                ngrams[:len(cand_index_set)],\n                p=pvals[:len(cand_index_set)] /\n                pvals[:len(cand_index_set)].sum(keepdims=True))\n            index_set = sum(cand_index_set[n - 1], [])\n            n -= 1\n\n            while len(select_indexes) + len(index_set) > num_to_predict:\n                if n == 0:\n                    break\n                index_set = sum(cand_index_set[n - 1], [])\n                n -= 1\n            # If adding a whole-word mask would exceed the maximum number of\n            # predictions, then just skip this candidate.\n            if len(select_indexes) + len(index_set) > num_to_predict:\n                continue\n            is_any_index_covered = False\n            for index in index_set:\n                if index in covered_indexes or 
index in select_indexes:\n                    is_any_index_covered = True\n                    break\n            if is_any_index_covered:\n                continue\n            for index in index_set:\n                select_indexes.add(index)\n        assert len(select_indexes) <= num_to_predict\n\n        select_indexes = sorted(select_indexes)\n        permute_indexes = list(select_indexes)\n        np_rng.shuffle(permute_indexes)\n        orig_token = list(output_tokens)\n\n        for src_i, tgt_i in zip(select_indexes, permute_indexes):\n            output_tokens[src_i] = orig_token[tgt_i]\n            masked_lms.append(\n                MaskedLmInstance(\n                    index=src_i, label=orig_token[src_i]))\n\n    masked_lms = sorted(masked_lms, key=lambda x: x.index)\n    # Sort the spans by the index of the first span\n    masked_spans = sorted(masked_spans, key=lambda x: x.index[0])\n\n    for p in masked_lms:\n        masked_lm_positions.append(p.index)\n        masked_lm_labels.append(p.label)\n    return (output_tokens, masked_lm_positions, masked_lm_labels,\n            token_boundary, masked_spans)\n\n\ndef pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,\n                             masked_labels, pad_id, max_seq_length):\n    \"\"\"Pad sequences and convert them to numpy.\"\"\"\n\n    # Some checks.\n    num_tokens = len(tokens)\n    padding_length = max_seq_length - num_tokens\n    assert padding_length >= 0\n    assert len(tokentypes) == num_tokens\n    assert len(masked_positions) == len(masked_labels)\n\n    # Tokens and token types.\n    filler = [pad_id] * padding_length\n    tokens_np = np.array(tokens + filler, dtype=np.int64)\n    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)\n\n    # Padding mask.\n    padding_mask_np = np.array(\n        [1] * num_tokens + [0] * padding_length, dtype=np.int64)\n\n    # Lables and loss mask.\n    labels = [-1] * max_seq_length\n    loss_mask = [0] * max_seq_length\n   
 for i in range(len(masked_positions)):\n        assert masked_positions[i] < num_tokens\n        labels[masked_positions[i]] = masked_labels[i]\n        loss_mask[masked_positions[i]] = 1\n    labels_np = np.array(labels, dtype=np.int64)\n    loss_mask_np = np.array(loss_mask, dtype=np.int64)\n\n    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np\n\n\ndef get_indexed_dataset_(data_prefix, data_impl, skip_warmup):\n\n    print_rank_0(' > building dataset index ...')\n\n    start_time = time.time()\n    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)\n    assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1]\n    print_rank_0(' > finished creating indexed dataset in {:4f} '\n                 'seconds'.format(time.time() - start_time))\n\n    print_rank_0(' > indexed dataset stats:')\n    print_rank_0('    number of documents: {}'.format(\n        indexed_dataset.doc_idx.shape[0] - 1))\n    print_rank_0('    number of sentences: {}'.format(\n        indexed_dataset.sizes.shape[0]))\n\n    return indexed_dataset\n\n\ndef get_train_valid_test_split_(splits_string, size):\n    \"\"\" Get dataset splits from comma or '/' separated string list.\"\"\"\n\n    splits = []\n    if splits_string.find(',') != -1:\n        splits = [float(s) for s in splits_string.split(',')]\n    elif splits_string.find('/') != -1:\n        splits = [float(s) for s in splits_string.split('/')]\n    else:\n        splits = [float(splits_string)]\n    while len(splits) < 3:\n        splits.append(0.)\n    splits = splits[:3]\n    splits_sum = sum(splits)\n    assert splits_sum > 0.0\n    splits = [split / splits_sum for split in splits]\n    splits_index = [0]\n    for index, split in enumerate(splits):\n        splits_index.append(splits_index[index] + int(\n            round(split * float(size))))\n    diff = splits_index[-1] - size\n    for index in range(1, len(splits_index)):\n        splits_index[index] -= diff\n    assert 
len(splits_index) == 4\n    assert splits_index[-1] == size\n    return splits_index\n\n\ndef get_samples_mapping(indexed_dataset, data_prefix, num_epochs,\n                        max_num_samples, max_seq_length, short_seq_prob, seed,\n                        name, binary_head, share_folder):\n    \"\"\"Get a list that maps a sample index to a starting sentence index, end sentence index, and length\"\"\"\n\n    if not num_epochs:\n        if not max_num_samples:\n            raise ValueError(\"Need to specify either max_num_samples \"\n                             \"or num_epochs\")\n        num_epochs = np.iinfo(np.int32).max - 1\n    if not max_num_samples:\n        max_num_samples = np.iinfo(np.int64).max - 1\n\n    # Filename of the index mapping\n    indexmap_filename = data_prefix\n    indexmap_filename += '_{}_indexmap'.format(name)\n    if num_epochs != (np.iinfo(np.int32).max - 1):\n        indexmap_filename += '_{}ep'.format(num_epochs)\n    if max_num_samples != (np.iinfo(np.int64).max - 1):\n        indexmap_filename += '_{}mns'.format(max_num_samples)\n    indexmap_filename += '_{}msl'.format(max_seq_length)\n    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)\n    indexmap_filename += '_{}s'.format(seed)\n    indexmap_filename += '.npy'\n\n    local_rank = get_local_rank()\n    if share_folder:\n        local_rank = paddle.distributed.get_rank()\n    # Build the indexed mapping if not exist.\n\n    if local_rank == 0 and \\\n       not os.path.isfile(indexmap_filename):\n        print(' > WARNING: could not find index map file {}, building '\n              'the indices on rank 0 ...'.format(indexmap_filename))\n\n        # Make sure the types match the helpers input types.\n        assert indexed_dataset.doc_idx.dtype == np.int64\n        print(indexed_dataset.sizes.dtype)\n        assert indexed_dataset.sizes.dtype == np.int32\n\n        try:\n            import ppfleetx.data.data_tools.cpp.fast_index_map_helpers as 
ernie_fast_index_map_helpers\n        except Exception as e:\n            start_time = time.time()\n            print('> compiling dataset index builder ...')\n            from ppfleetx.data.data_tools.cpp.compile import compile_helper\n            compile_helper()\n            print(\n                '>>> done with dataset index builder. Compilation time: {:.3f} '\n                'seconds'.format(time.time() - start_time),\n                flush=True)\n            import ppfleetx.data.data_tools.cpp.fast_index_map_helpers as ernie_fast_index_map_helpers\n\n        samples_mapping = ernie_fast_index_map_helpers.build_mapping(\n            indexed_dataset.doc_idx, indexed_dataset.sizes, num_epochs,\n            max_num_samples, max_seq_length, short_seq_prob, seed, True, 2\n            if binary_head else 1)\n        print_rank_0(' > done building sapmles index maping')\n        start_time = time.time()\n        np.save(indexmap_filename, samples_mapping, allow_pickle=True)\n        print_rank_0(' > saved the index mapping in {}'.format(\n            indexmap_filename))\n        # Make sure all the ranks have built the mapping\n        print_rank_0(' > elasped time to build and save samples mapping '\n                     '(seconds): {:4f}'.format(time.time() - start_time))\n\n    else:\n        while True:\n            if (not os.path.isfile(indexmap_filename)):\n                time.sleep(3)\n            else:\n                try:\n                    np.load(\n                        indexmap_filename, allow_pickle=True, mmap_mode='r')\n                    break\n                except Exception as e:\n                    print(\n                        \"%s file is still writing or damaged, please wait a moment.\"\n                        % indexmap_filename)\n                    time.sleep(3)\n\n    # This should be a barrier but nccl barrier assumes\n    # device_index=rank which is not the case for model\n    # parallel case\n    if 
paddle.distributed.get_world_size() > 1:\n        if paddle.in_dynamic_mode():\n            paddle.distributed.barrier()\n\n    # Load indexed dataset.\n    print_rank_0(' > loading indexed mapping from {}'.format(\n        indexmap_filename))\n    start_time = time.time()\n    samples_mapping = np.load(\n        indexmap_filename, allow_pickle=True, mmap_mode='r')\n    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(time.time(\n    ) - start_time))\n    print_rank_0('    total number of samples: {}'.format(\n        samples_mapping.shape[0]))\n\n    return samples_mapping\n"
  },
  {
    "path": "ppfleetx/data/dataset/ernie/ernie_dataset.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport time\nimport numpy as np\nimport re\nimport copy\nfrom functools import partial\nimport paddle\n\nfrom .dataset_utils import (\n    get_samples_mapping,\n    get_a_and_b_segments,\n    truncate_segments,\n    create_tokens_and_tokentypes,\n    create_masked_lm_predictions,\n    make_indexed_dataset,\n    get_indexed_dataset_, )\nfrom paddlenlp.transformers import ErnieTokenizer\nfrom paddlenlp.datasets.dataset import MapDataset, IterableDataset, SimpleBuilder, load_dataset\n\n\ndef get_local_rank():\n    return int(os.getenv(\"PADDLE_RANK_IN_NODE\", 0))\n\n\nprint_rank_0 = print\n\nmode_to_index = {\"Train\": 0, \"Eval\": 1, \"Test\": 2}\nmode_to_key = {\"Train\": \"train\", \"Eval\": \"dev\", \"Test\": \"test\"}\n\n\nclass ErnieDataset(paddle.io.Dataset):\n    def __init__(self, input_dir, tokenizer_type, split, num_samples, mode,\n                 max_seq_length, masked_lm_prob, short_seq_prob, seed,\n                 binary_head, share_folder, favor_longer_ngram, max_ngrams):\n        tokenizer = ErnieTokenizer.from_pretrained(tokenizer_type)\n        tokenizer.extend_chinese_char()\n\n        files = get_train_data_file(input_dir)[0]\n        skip_warmup = True\n        indexed_dataset = get_indexed_dataset_(files, None, skip_warmup)\n        total_num_of_documents = indexed_dataset.doc_idx.shape[0] 
- 1\n        splits = get_train_valid_test_split_(split, total_num_of_documents)\n        # Print stats about the splits.\n        print_rank_0(' > dataset split:')\n\n        def print_split_stats(name, index):\n            print_rank_0('    {}:'.format(name))\n            print_rank_0('     document indices in [{}, {}) total of {} '\n                         'documents'.format(splits[index], splits[index + 1],\n                                            splits[index + 1] - splits[index]))\n            start_index = indexed_dataset.doc_idx[splits[index]]\n            end_index = indexed_dataset.doc_idx[splits[index + 1]]\n            print_rank_0('     sentence indices in [{}, {}) total of {} '\n                         'sentences'.format(start_index, end_index, end_index -\n                                            start_index))\n\n        index = mode_to_index[mode]\n        print_split_stats(mode, index)\n\n        # dataset = None\n        assert splits[index + 1] > splits[index]\n        # Get the pointer to the original doc-idx so we can set it later.\n        doc_idx_ptr = indexed_dataset.get_doc_idx()\n        # Slice the doc-idx\n        start_index = splits[index]\n        # Add +1 so we can index into the dataset to get the upper bound.\n        end_index = splits[index + 1] + 1\n        # New doc_idx view.\n        indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index])\n        # Build the dataset accordingly.\n        self.seed = seed\n        self.masked_lm_prob = masked_lm_prob\n        self.max_seq_length = max_seq_length\n        self.binary_head = binary_head\n        self.share_folder = share_folder\n        self.indexed_dataset = indexed_dataset\n\n        self.favor_longer_ngram = favor_longer_ngram\n        self.max_ngrams = max_ngrams\n\n        # Build the samples mapping.\n        self.samples_mapping = get_samples_mapping(\n            self.indexed_dataset,\n            files,\n            None,\n            num_samples,\n     
       self.max_seq_length - 3,  # account for added tokens\n            short_seq_prob,\n            self.seed,\n            mode,\n            self.binary_head,\n            self.share_folder)\n\n        self.vocab_id_list = list(tokenizer.vocab.idx_to_token.keys())\n        self.vocab_id_to_token_dict = copy.deepcopy(\n            tokenizer.vocab.idx_to_token)\n        self.vocab_token_to_id_dict = copy.deepcopy(\n            tokenizer.vocab.token_to_idx)\n\n        # ERNIE is chinse char level model, sometime is need\n        # add ## chinse char to encode and decode.\n        # Here we extend the vocab dict.\n        self.vocab_id_to_token_dict.update(tokenizer.added_tokens_decoder)\n        self.vocab_token_to_id_dict.update(tokenizer.added_tokens_encoder)\n\n        self.cls_id = tokenizer.cls_token_id\n        self.sep_id = tokenizer.sep_token_id\n        self.mask_id = tokenizer.mask_token_id\n        self.pad_id = tokenizer.pad_token_id\n\n    def __len__(self):\n        return self.samples_mapping.shape[0]\n\n    def __getitem__(self, idx):\n\n        start_idx, end_idx, seq_length = self.samples_mapping[idx]\n        sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]\n\n        # Note that this rng state should be numpy and not python since\n        # python randint is inclusive whereas the numpy one is exclusive.\n        # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1\n        np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32))\n        return build_training_sample(\n            sample,\n            seq_length,\n            self.max_seq_length,  # needed for padding\n            self.vocab_id_list,\n            self.vocab_id_to_token_dict,\n            self.vocab_token_to_id_dict,\n            self.cls_id,\n            self.sep_id,\n            self.mask_id,\n            self.pad_id,\n            self.masked_lm_prob,\n            np_rng,\n            self.binary_head,\n            
self.favor_longer_ngram,\n            self.max_ngrams)\n\n\ndef build_training_sample(sample,\n                          target_seq_length,\n                          max_seq_length,\n                          vocab_id_list,\n                          vocab_id_to_token_dict,\n                          vocab_token_to_id_dict,\n                          cls_id,\n                          sep_id,\n                          mask_id,\n                          pad_id,\n                          masked_lm_prob,\n                          np_rng,\n                          binary_head,\n                          favor_longer_ngram=False,\n                          max_ngrams=3):\n    \"\"\"Biuld training sample.\n\n    Arguments:\n        sample: A list of sentences in which each sentence is a list token ids.\n        target_seq_length: Desired sequence length.\n        max_seq_length: Maximum length of the sequence. All values are padded to\n            this length.\n        vocab_id_list: List of vocabulary ids. Used to pick a random id.\n        vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.\n        vocab_token_to_id_dict: A dictionary from text tokens to vocab ids.\n        cls_id: Start of example id.\n        sep_id: Separator id.\n        mask_id: Mask token id.\n        pad_id: Padding token id.\n        masked_lm_prob: Probability to mask tokens.\n        np_rng: Random number genenrator. 
Note that this rng state should be\n              numpy and not python since python randint is inclusive for\n              the opper bound whereas the numpy one is exclusive.\n    \"\"\"\n\n    if binary_head:\n        # We assume that we have at least two sentences in the sample\n        assert len(sample) > 1, \"The sentence num should be large than 1.\"\n    assert target_seq_length <= max_seq_length\n\n    # Divide sample into two segments (A and B).\n    if binary_head:\n        tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample,\n                                                                  np_rng)\n    else:\n        tokens_a = []\n        for j in range(len(sample)):\n            tokens_a.extend(sample[j])\n        tokens_b = []\n        is_next_random = False\n\n    # Truncate to `target_sequence_length`.\n    max_num_tokens = target_seq_length\n    truncated = truncate_segments(tokens_a, tokens_b,\n                                  len(tokens_a),\n                                  len(tokens_b), max_num_tokens, np_rng)\n\n    # Build tokens and toketypes.\n    tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b,\n                                                      cls_id, sep_id)\n\n    # Masking.\n    max_predictions_per_seq = masked_lm_prob * max_num_tokens\n    (tokens, masked_positions, masked_labels, _,\n     _) = create_masked_lm_predictions(\n         tokens,\n         vocab_id_list,\n         vocab_id_to_token_dict,\n         masked_lm_prob,\n         cls_id,\n         sep_id,\n         mask_id,\n         max_predictions_per_seq,\n         np_rng,\n         vocab_token_to_id_dict=vocab_token_to_id_dict,\n         to_chinese_char=True,\n         inplace_random_mask=False,\n         favor_longer_ngram=favor_longer_ngram,\n         max_ngrams=max_ngrams, )\n\n    # Padding.\n    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \\\n        = pad_and_convert_to_numpy(tokens, tokentypes, 
masked_positions,\n                                   masked_labels, pad_id, max_seq_length)\n\n    return tokens_np, tokentypes_np, padding_mask_np, masked_positions, masked_labels, int(\n        is_next_random)\n\n\ndef pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,\n                             masked_labels, pad_id, max_seq_length):\n    \"\"\"Pad sequences and convert them to numpy.\"\"\"\n\n    # Some checks.\n    num_tokens = len(tokens)\n    padding_length = max_seq_length - num_tokens\n    assert padding_length >= 0\n    assert len(tokentypes) == num_tokens\n    assert len(masked_positions) == len(masked_labels)\n\n    # Tokens and token types.\n    filler = [pad_id] * padding_length\n    tokens_np = np.array(tokens + filler, dtype=np.int64)\n    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)\n\n    # Padding mask.\n    padding_mask_np = np.array(\n        [1] * num_tokens + [0] * padding_length, dtype=np.float32)\n    padding_mask_np = (1 - padding_mask_np) * -1e4\n\n    padding_mask_np = padding_mask_np.reshape([1, 1, -1])\n    # Lables and loss mask.\n    labels = [-1] * max_seq_length\n    loss_mask = [0] * max_seq_length\n    for i in range(len(masked_positions)):\n        assert masked_positions[i] < num_tokens\n        labels[masked_positions[i]] = masked_labels[i]\n        loss_mask[masked_positions[i]] = 1\n    labels_np = np.array(labels, dtype=np.int64)\n    loss_mask_np = np.array(loss_mask, dtype=np.int64)\n\n    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np\n\n\ndef get_train_data_file(input_dir):\n    if len(input_dir.split()) > 1:\n        # weight-1 data-prefix-1 weight-2 data-prefix-2 ...\n        return input_dir.split()\n    else:\n        files = [\n            os.path.join(input_dir, f) for f in os.listdir(input_dir)\n            if (os.path.isfile(os.path.join(input_dir, f)) and \"_idx.npz\" in\n                str(f))\n        ]\n        files = [x.replace(\"_idx.npz\", 
\"\") for x in files]\n\n        if len(files) > 1:\n            ret = []\n            logger.info(\"You are using multi-dataset:\")\n            for x in files:\n                ret.append(1.0)\n                ret.append(x)\n                logger.info(\"    > set weight of %s dataset to 1.0\" % x)\n            return ret\n    return files\n\n\ndef get_train_valid_test_split_(splits, size):\n    \"\"\"\n    Get dataset splits from comma or '/' separated string list.\n    \"\"\"\n\n    splits = [float(s) for s in splits]\n    while len(splits) < 3:\n        splits.append(0.)\n    splits = splits[:3]\n    splits_sum = sum(splits)\n    assert splits_sum > 0.0\n    splits = [split / splits_sum for split in splits]\n    splits_index = [0]\n    for index, split in enumerate(splits):\n        splits_index.append(splits_index[index] + int(\n            round(split * float(size))))\n    diff = splits_index[-1] - size\n    for index in range(1, len(splits_index)):\n        splits_index[index] -= diff\n    assert len(splits_index) == 4\n    assert splits_index[-1] == size\n    return splits_index\n\n\nclass ErnieSeqClsDataset(paddle.io.Dataset):\n    def __init__(self, dataset_type, tokenizer_type, max_seq_len, mode):\n        self.dataset = dataset_type\n        self.max_seq_len = max_seq_len\n        self.mode = mode_to_key[mode]\n\n        from ppfleetx.data.tokenizers import get_ernie_tokenizer\n        self.tokenizer = get_ernie_tokenizer(tokenizer_type)\n\n        dataset_config = self.dataset.split(\" \")\n        raw_datasets = load_dataset(\n            dataset_config[0],\n            None if len(dataset_config) <= 1 else dataset_config[1], )\n        self.label_list = getattr(raw_datasets['train'], \"label_list\", None)\n\n        # Define dataset pre-process function\n        if \"clue\" in self.dataset:\n            trans_fn = partial(self._clue_trans_fn)\n        else:\n            trans_fn = partial(self._seq_trans_fn)\n\n        self.seqcls_dataset = 
raw_datasets[self.mode].map(trans_fn)\n\n    def __getitem__(self, idx):\n        return self.seqcls_dataset.__getitem__(idx)\n\n    def __len__(self):\n        return self.seqcls_dataset.__len__()\n\n    def _seq_trans_fn(self, example):\n        return self._convert_example(\n            example,\n            tokenizer=self.tokenizer,\n            max_seq_length=self.max_seq_len, )\n\n    def _clue_trans_fn(self, example):\n        return self._convert_clue(\n            example,\n            label_list=self.label_list,\n            tokenizer=self.tokenizer,\n            max_seq_length=self.max_seq_len, )\n\n    def _convert_example(self,\n                         example,\n                         tokenizer,\n                         max_seq_length=512,\n                         is_test=False):\n        is_test = True\n        if 'label' in example.keys():\n            is_test = False\n\n        if \"text_b\" in example.keys():\n            text = example[\"text_a\"]\n            text_pair = example[\"text_b\"]\n        else:\n            text = example[\"text\"]\n            text_pair = None\n\n        encoded_inputs = tokenizer(\n            text=text, text_pair=text_pair, max_seq_len=max_seq_length)\n        input_ids = encoded_inputs[\"input_ids\"]\n        token_type_ids = encoded_inputs[\"token_type_ids\"]\n\n        if is_test:\n            return {\n                \"input_ids\": input_ids,\n                \"token_type_ids\": token_type_ids,\n            }\n        else:\n            # label = np.array([example[\"label\"]], dtype=\"int64\")\n            label = int(example[\"label\"])\n            return {\n                \"input_ids\": input_ids,\n                \"token_type_ids\": token_type_ids,\n                \"labels\": label\n            }\n\n    # Data pre-process function for clue benchmark datatset\n    def _convert_clue(self,\n                      example,\n                      label_list,\n                      tokenizer=None,\n         
             max_seq_length=512,\n                      **kwargs):\n        \"\"\"convert a glue example into necessary features\"\"\"\n        is_test = False\n        if 'label' not in example.keys():\n            is_test = True\n\n        if not is_test:\n            # `label_list == None` is for regression task\n            label_dtype = \"int64\" if label_list else \"float32\"\n            # Get the label\n            example['label'] = int(example[\n                \"label\"]) if label_dtype != \"float32\" else float(example[\n                    \"label\"])\n            label = example['label']\n        # Convert raw text to feature\n        if 'keyword' in example:  # CSL\n            sentence1 = \" \".join(example['keyword'])\n            example = {\n                'sentence1': sentence1,\n                'sentence2': example['abst'],\n                'label': example['label']\n            }\n        elif 'target' in example:  # wsc\n            text, query, pronoun, query_idx, pronoun_idx = example[\n                'text'], example['target']['span1_text'], example['target'][\n                    'span2_text'], example['target']['span1_index'], example[\n                        'target']['span2_index']\n            text_list = list(text)\n            assert text[pronoun_idx:(pronoun_idx + len(\n                pronoun))] == pronoun, \"pronoun: {}\".format(pronoun)\n            assert text[query_idx:(query_idx + len(query)\n                                   )] == query, \"query: {}\".format(query)\n            if pronoun_idx > query_idx:\n                text_list.insert(query_idx, \"_\")\n                text_list.insert(query_idx + len(query) + 1, \"_\")\n                text_list.insert(pronoun_idx + 2, \"[\")\n                text_list.insert(pronoun_idx + len(pronoun) + 2 + 1, \"]\")\n            else:\n                text_list.insert(pronoun_idx, \"[\")\n                text_list.insert(pronoun_idx + len(pronoun) + 1, \"]\")\n                
text_list.insert(query_idx + 2, \"_\")\n                text_list.insert(query_idx + len(query) + 2 + 1, \"_\")\n            text = \"\".join(text_list)\n            example['sentence'] = text\n\n        if tokenizer is None:\n            return example\n        if 'sentence' in example:\n            example = tokenizer(\n                example['sentence'], max_seq_len=max_seq_length)\n        elif 'sentence1' in example:\n            example = tokenizer(\n                example['sentence1'],\n                text_pair=example['sentence2'],\n                max_seq_len=max_seq_length)\n\n        if not is_test:\n            if \"token_type_ids\" in example:\n                return {\n                    \"input_ids\": example['input_ids'],\n                    \"token_type_ids\": example['token_type_ids'],\n                    \"labels\": label\n                }\n            else:\n                return {\"input_ids\": example['input_ids'], \"labels\": label}\n        else:\n            return {\n                \"input_ids\": example['input_ids'],\n                \"token_type_ids\": example['token_type_ids']\n            }\n"
  },
  {
    "path": "ppfleetx/data/dataset/glue_dataset.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport numpy as np\n\nimport paddle\n\nfrom ppfleetx.data.tokenizers import GPTTokenizer\nfrom ppfleetx.utils.download import cached_path\nfrom ppfleetx.utils.file import unzip, parse_csv\n\n__all__ = [\n    'CoLA', 'SST2', 'MNLI', 'QNLI', 'RTE', 'WNLI', 'MRPC', 'QQP', 'STSB'\n]\n\"\"\"\n\nSingle-Sentence Tasks:\n* CoLA\n* SST-2\n\n\nSimilarity and Paraphrase Tasks:\n* MRPC\n* STS-B\n* QQP\n\n\nInference Tasks:\n* MNLI\n* QNLI\n* RTE\n* WNLI\n\"\"\"\n\n\nclass CoLA(paddle.io.Dataset):\n    \"\"\"The Corpus of Linguistic Acceptability consists of English\n    acceptability judgments drawn from books and journal articles on\n    linguistic theory. 
Each example is a sequence of words annotated\n    with whether it is a grammatical English sentence.\"\"\"\n\n    # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/cola.html#CoLA\n\n    URL = \"https://nyu-mll.github.io/CoLA/cola_public_1.1.zip\"\n    MD5 = \"9f6d88c3558ec424cd9d66ea03589aba\"\n\n    NUM_LINES = {\n        \"train\": 8551,\n        \"dev\": 527,\n        \"test\": 516,\n    }\n\n    _PATH = \"cola_public_1.1.zip\"\n\n    DATASET_NAME = \"CoLA\"\n\n    _EXTRACTED_FILES = {\n        \"train\": os.path.join(\"raw\", \"in_domain_train.tsv\"),\n        \"dev\": os.path.join(\"raw\", \"in_domain_dev.tsv\"),\n        \"test\": os.path.join(\"raw\", \"out_of_domain_dev.tsv\"),\n    }\n\n    def __init__(self, root, split, max_length=128):\n\n        self.root = root\n        self.split = split\n        if os.path.exists(self.root):\n            assert os.path.isdir(self.root)\n        else:\n            zip_path = cached_path(\n                self.URL, cache_dir=os.path.abspath(self.root))\n            unzip(\n                zip_path,\n                mode=\"r\",\n                out_dir=os.path.join(self.root, '..'),\n                delete=True)\n\n        self.path = os.path.join(self.root, self._EXTRACTED_FILES[split])\n        assert os.path.exists(self.path), f\"{self.path} is not exists!\"\n        self.max_length = max_length\n\n        self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        assert split in ['train', 'dev', 'test']\n\n        def _filter_res(x):\n            return len(x) == 4\n\n        def _modify_res(x):\n            return (x[3], int(x[1]))\n\n        self.samples = parse_csv(\n            self.path,\n            skip_lines=1,\n            delimiter=\"\\t\",\n            map_funcs=_modify_res,\n            filter_funcs=_filter_res)\n\n    def __getitem__(self, idx):\n        sample = self.samples[idx]\n\n        encoded_inputs = self.tokenizer(\n            sample[0],\n            
padding=\"max_length\",\n            truncation=\"longest_first\",\n            max_length=self.max_length,\n            return_token_type_ids=False)\n        input_ids = encoded_inputs['input_ids']\n        input_ids = paddle.to_tensor(input_ids)\n        if self.split != 'test':\n            return input_ids, sample[1]\n        else:\n            return input_ids\n\n    def __len__(self):\n        return len(self.samples)\n\n    @property\n    def class_num(self):\n        return 2\n\n\nclass SST2(paddle.io.Dataset):\n    \"\"\"The Stanford Sentiment Treebank consists of sentences from movie reviews and\n    human annotations of their sentiment. The task is to predict the sentiment of a\n    given sentence. We use the two-way (positive/negative) class split, and use only\n    sentence-level labels.\"\"\"\n\n    # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/sst2.html#SST2\n\n    URL = \"https://dl.fbaipublicfiles.com/glue/data/SST-2.zip\"\n    MD5 = \"9f81648d4199384278b86e315dac217c\"\n\n    NUM_LINES = {\n        \"train\": 67349,\n        \"dev\": 872,\n        \"test\": 1821,\n    }\n\n    _PATH = \"SST-2.zip\"\n\n    DATASET_NAME = \"SST2\"\n\n    _EXTRACTED_FILES = {\n        \"train\": \"train.tsv\",\n        \"dev\": \"dev.tsv\",\n        \"test\": \"test.tsv\",\n    }\n\n    def __init__(self, root, split, max_length=128):\n\n        self.root = root\n        self.split = split\n        if os.path.exists(self.root):\n            assert os.path.isdir(self.root)\n        else:\n            zip_path = cached_path(\n                self.URL, cache_dir=os.path.abspath(self.root))\n            unzip(\n                zip_path,\n                mode=\"r\",\n                out_dir=os.path.join(self.root, '..'),\n                delete=True)\n\n        self.path = os.path.join(self.root, self._EXTRACTED_FILES[split])\n        assert os.path.exists(self.path), f\"{self.path} is not exists!\"\n        self.max_length = max_length\n\n        
self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        assert split in ['train', 'dev', 'test']\n\n        # test split for SST2 doesn't have labels\n        if split == \"test\":\n\n            def _modify_test_res(t):\n                return (t[1].strip(), )\n\n            self.samples = parse_csv(\n                self.path,\n                skip_lines=1,\n                delimiter=\"\\t\",\n                map_funcs=_modify_test_res)\n        else:\n\n            def _modify_res(t):\n                return (t[0].strip(), int(t[1]))\n\n            self.samples = parse_csv(\n                self.path, skip_lines=1, delimiter=\"\\t\", map_funcs=_modify_res)\n\n    def __getitem__(self, idx):\n        sample = self.samples[idx]\n\n        encoded_inputs = self.tokenizer(\n            sample[0],\n            padding=\"max_length\",\n            truncation=\"longest_first\",\n            max_length=self.max_length,\n            return_token_type_ids=False)\n        input_ids = encoded_inputs['input_ids']\n        input_ids = paddle.to_tensor(input_ids)\n        if self.split != 'test':\n            return input_ids, sample[1]\n        else:\n            return input_ids\n\n    def __len__(self):\n        return len(self.samples)\n\n    @property\n    def class_num(self):\n        return 2\n\n\nclass MNLI(paddle.io.Dataset):\n    \"\"\"The Multi-Genre Natural Language Inference Corpus is a crowdsourced\n    collection of sentence pairs with textual entailment annotations. Given a premise sentence\n    and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis\n    (entailment), contradicts the hypothesis (contradiction), or neither (neutral). 
The premise sentences are\n    gathered from ten different sources, including transcribed speech, fiction, and government reports.\n    We use the standard test set, for which we obtained private labels from the authors, and evaluate\n    on both the matched (in-domain) and mismatched (cross-domain) section. We also use and recommend\n    the SNLI corpus as 550k examples of auxiliary training data.\"\"\"\n\n    # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/mnli.html#MNLI\n\n    URL = \"https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip\"\n    MD5 = \"0f70aaf66293b3c088a864891db51353\"\n\n    NUM_LINES = {\n        \"train\": 392702,\n        \"dev_matched\": 9815,\n        \"dev_mismatched\": 9832,\n    }\n\n    _PATH = \"multinli_1.0.zip\"\n\n    DATASET_NAME = \"MNLI\"\n\n    _EXTRACTED_FILES = {\n        \"train\": \"multinli_1.0_train.txt\",\n        \"dev_matched\": \"multinli_1.0_dev_matched.txt\",\n        \"dev_mismatched\": \"multinli_1.0_dev_mismatched.txt\",\n    }\n\n    LABEL_TO_INT = {\"entailment\": 0, \"neutral\": 1, \"contradiction\": 2}\n\n    def __init__(self, root, split, max_length=128):\n\n        self.root = root\n        self.split = split\n        if os.path.exists(self.root):\n            assert os.path.isdir(self.root)\n        else:\n            zip_path = cached_path(\n                self.URL, cache_dir=os.path.abspath(self.root))\n            unzip(\n                zip_path,\n                mode=\"r\",\n                out_dir=os.path.join(self.root, '..'),\n                delete=True)\n\n        self.path = os.path.join(self.root, self._EXTRACTED_FILES[split])\n        assert os.path.exists(self.path), f\"{self.path} is not exists!\"\n        self.max_length = max_length\n\n        self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        assert split in ['train', 'dev_matched', 'dev_mismatched']\n\n        def _filter_res(x):\n            return x[0] in self.LABEL_TO_INT\n\n        def 
_modify_res(x):\n            return (x[5], x[6], self.LABEL_TO_INT[x[0]])\n\n        self.samples = parse_csv(\n            self.path,\n            skip_lines=1,\n            delimiter=\"\\t\",\n            map_funcs=_modify_res,\n            filter_funcs=_filter_res)\n\n    def __getitem__(self, idx):\n        sample = self.samples[idx]\n\n        encoded_inputs = self.tokenizer(\n            sample[0],\n            text_pair=sample[1],\n            padding=\"max_length\",\n            truncation=\"longest_first\",\n            max_length=self.max_length,\n            return_token_type_ids=False)\n        input_ids = encoded_inputs['input_ids']\n        input_ids = paddle.to_tensor(input_ids)\n        return input_ids, sample[2]\n\n    def __len__(self):\n        return len(self.samples)\n\n    @property\n    def class_num(self):\n        return 3\n\n\nclass QNLI(paddle.io.Dataset):\n    \"\"\"The Stanford Question Answering Dataset is a question-answering\n    dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn\n    from Wikipedia) contains the answer to the corresponding question (written by an annotator). We\n    convert the task into sentence pair classification by forming a pair between each question and each\n    sentence in the corresponding context, and filtering out pairs with low lexical overlap between the\n    question and the context sentence. The task is to determine whether the context sentence contains\n    the answer to the question. 
This modified version of the original task removes the requirement that\n    the model select the exact answer, but also removes the simplifying assumptions that the answer\n    is always present in the input and that lexical overlap is a reliable cue.\"\"\"\n\n    # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/qnli.html#QNLI\n\n    URL = \"https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip\"\n    MD5 = \"b4efd6554440de1712e9b54e14760e82\"\n\n    NUM_LINES = {\n        \"train\": 104743,\n        \"dev\": 5463,\n        \"test\": 5463,\n    }\n\n    _PATH = \"QNLIv2.zip\"\n\n    DATASET_NAME = \"QNLI\"\n\n    _EXTRACTED_FILES = {\n        \"train\": \"train.tsv\",\n        \"dev\": \"dev.tsv\",\n        \"test\": \"test.tsv\",\n    }\n\n    MAP_LABELS = {\"entailment\": 0, \"not_entailment\": 1}\n\n    def __init__(self, root, split, max_length=128):\n\n        self.root = root\n        self.split = split\n        if os.path.exists(self.root):\n            assert os.path.isdir(self.root)\n        else:\n            zip_path = cached_path(\n                self.URL, cache_dir=os.path.abspath(self.root))\n            unzip(\n                zip_path,\n                mode=\"r\",\n                out_dir=os.path.join(self.root, '..'),\n                delete=True)\n\n        self.path = os.path.join(self.root, self._EXTRACTED_FILES[split])\n        assert os.path.exists(self.path), f\"{self.path} is not exists!\"\n        self.max_length = max_length\n\n        self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        assert split in ['train', 'dev', 'test']\n\n        def _modify_res(x):\n            if split == 'test':\n                # test split for QNLI doesn't have labels\n                return (x[1], x[2])\n            else:\n                return (x[1], x[2], self.MAP_LABELS[x[3]])\n\n        self.samples = parse_csv(\n            self.path, skip_lines=1, delimiter=\"\\t\", map_funcs=_modify_res)\n\n    def __getitem__(self, 
idx):\n        sample = self.samples[idx]\n\n        encoded_inputs = self.tokenizer(\n            sample[0],\n            text_pair=sample[1],\n            padding=\"max_length\",\n            truncation=\"longest_first\",\n            max_length=self.max_length,\n            return_token_type_ids=False)\n        input_ids = encoded_inputs['input_ids']\n        input_ids = paddle.to_tensor(input_ids)\n        if self.split != 'test':\n            return input_ids, sample[2]\n        else:\n            return input_ids\n\n    def __len__(self):\n        return len(self.samples)\n\n    @property\n    def class_num(self):\n        return 2\n\n\nclass RTE(paddle.io.Dataset):\n    \"\"\"The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual\n    entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim\n    et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli et al., 2009).4 Examples are\n    constructed based on news and Wikipedia text. 
We convert all datasets to a two-class split, where\n    for three-class datasets we collapse neutral and contradiction into not entailment, for consistency.\"\"\"\n\n    # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/rte.html#RTE\n\n    URL = \"https://dl.fbaipublicfiles.com/glue/data/RTE.zip\"\n    MD5 = \"bef554d0cafd4ab6743488101c638539\"\n\n    NUM_LINES = {\n        \"train\": 67349,\n        \"dev\": 872,\n        \"test\": 1821,\n    }\n\n    _PATH = \"RTE.zip\"\n\n    DATASET_NAME = \"RTE\"\n\n    _EXTRACTED_FILES = {\n        \"train\": \"train.tsv\",\n        \"dev\": \"dev.tsv\",\n        \"test\": \"test.tsv\",\n    }\n\n    MAP_LABELS = {\"entailment\": 0, \"not_entailment\": 1}\n\n    def __init__(self, root, split, max_length=128):\n\n        self.root = root\n        self.split = split\n        if os.path.exists(self.root):\n            assert os.path.isdir(self.root)\n        else:\n            zip_path = cached_path(\n                self.URL, cache_dir=os.path.abspath(self.root))\n            unzip(\n                zip_path,\n                mode=\"r\",\n                out_dir=os.path.join(self.root, '..'),\n                delete=True)\n\n        self.path = os.path.join(self.root, self._EXTRACTED_FILES[split])\n        assert os.path.exists(self.path), f\"{self.path} is not exists!\"\n        self.max_length = max_length\n\n        self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        assert split in ['train', 'dev', 'test']\n\n        def _modify_res(x):\n            if split == 'test':\n                # test split for RTE doesn't have labels\n                return (x[1], x[2])\n            else:\n                return (x[1], x[2], self.MAP_LABELS[x[3]])\n\n        self.samples = parse_csv(\n            self.path, skip_lines=1, delimiter=\"\\t\", map_funcs=_modify_res)\n\n    def __getitem__(self, idx):\n        sample = self.samples[idx]\n\n        encoded_inputs = self.tokenizer(\n            
sample[0],\n            text_pair=sample[1],\n            padding=\"max_length\",\n            truncation=\"longest_first\",\n            max_length=self.max_length,\n            return_token_type_ids=False)\n        input_ids = encoded_inputs['input_ids']\n        input_ids = paddle.to_tensor(input_ids)\n        if self.split != 'test':\n            return input_ids, sample[2]\n        else:\n            return input_ids\n\n    def __len__(self):\n        return len(self.samples)\n\n    @property\n    def class_num(self):\n        return 2\n\n\nclass WNLI(paddle.io.Dataset):\n    \"\"\"The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task\n    in which a system must read a sentence with a pronoun and select the referent of that pronoun from\n    a list of choices. The examples are manually constructed to foil simple statistical methods: Each\n    one is contingent on contextual information provided by a single word or phrase in the sentence.\n    To convert the problem into sentence pair classification, we construct sentence pairs by replacing\n    the ambiguous pronoun with each possible referent. The task is to predict if the sentence with the\n    pronoun substituted is entailed by the original sentence. We use a small evaluation set consisting of\n    new examples derived from fiction books that was shared privately by the authors of the original\n    corpus. While the included training set is balanced between two classes, the test set is imbalanced\n    between them (65% not entailment). Also, due to a data quirk, the development set is adversarial:\n    hypotheses are sometimes shared between training and development examples, so if a model memorizes the\n    training examples, they will predict the wrong label on corresponding development set\n    example. 
As with QNLI, each example is evaluated separately, so there is not a systematic correspondence\n    between a model's score on this task and its score on the unconverted original task. We\n    call converted dataset WNLI (Winograd NLI).\"\"\"\n\n    # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/wnli.html#WNLI\n\n    URL = \"https://dl.fbaipublicfiles.com/glue/data/WNLI.zip\"\n    MD5 = \"a1b4bd2861017d302d29e42139657a42\"\n\n    NUM_LINES = {\n        \"train\": 635,\n        \"dev\": 71,\n        \"test\": 146,\n    }\n\n    _PATH = \"WNLI.zip\"\n\n    DATASET_NAME = \"WNLI\"\n\n    _EXTRACTED_FILES = {\n        \"train\": \"train.tsv\",\n        \"dev\": \"dev.tsv\",\n        \"test\": \"test.tsv\",\n    }\n\n    def __init__(self, root, split, max_length=128):\n\n        self.root = root\n        self.split = split\n        if os.path.exists(self.root):\n            assert os.path.isdir(self.root)\n        else:\n            zip_path = cached_path(\n                self.URL, cache_dir=os.path.abspath(self.root))\n            unzip(\n                zip_path,\n                mode=\"r\",\n                out_dir=os.path.join(self.root, '..'),\n                delete=True)\n\n        self.path = os.path.join(self.root, self._EXTRACTED_FILES[split])\n        assert os.path.exists(self.path), f\"{self.path} is not exists!\"\n        self.max_length = max_length\n\n        self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        assert split in ['train', 'dev', 'test']\n\n        def _modify_res(x):\n            if split == 'test':\n                # test split for WNLI doesn't have labels\n                return (x[1], x[2])\n            else:\n                return (x[1], x[2], int(x[3]))\n\n        self.samples = parse_csv(\n            self.path, skip_lines=1, delimiter=\"\\t\", map_funcs=_modify_res)\n\n    def __getitem__(self, idx):\n        sample = self.samples[idx]\n\n        encoded_inputs = self.tokenizer(\n            
sample[0],\n            text_pair=sample[1],\n            padding=\"max_length\",\n            truncation=\"longest_first\",\n            max_length=self.max_length,\n            return_token_type_ids=False)\n        input_ids = encoded_inputs['input_ids']\n        input_ids = paddle.to_tensor(input_ids)\n        if self.split != 'test':\n            return input_ids, sample[2]\n        else:\n            return input_ids\n\n    def __len__(self):\n        return len(self.samples)\n\n    @property\n    def class_num(self):\n        return 2\n\n\nclass MRPC(paddle.io.Dataset):\n    \"\"\"The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of\n    sentence pairs automatically extracted from online news sources, with human annotations\n    for whether the sentences in the pair are semantically equivalent.\"\"\"\n\n    # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/mrpc.html#MRPC\n\n    URL = {\n        \"train\":\n        \"https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt\",\n        \"test\":\n        \"https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt\",\n    }\n\n    MD5 = {\n        \"train\": \"793daf7b6224281e75fe61c1f80afe35\",\n        \"test\": \"e437fdddb92535b820fe8852e2df8a49\",\n    }\n\n    NUM_LINES = {\n        \"train\": 4076,\n        \"test\": 1725,\n    }\n\n    DATASET_NAME = \"MRPC\"\n\n    _EXTRACTED_FILES = {\n        \"train\": \"msr_paraphrase_train.txt\",\n        \"test\": \"msr_paraphrase_test.txt\",\n    }\n\n    def __init__(self, root, split, max_length=128):\n\n        self.root = root\n        self.split = split\n        if os.path.exists(self.root):\n            assert os.path.isdir(self.root)\n        cached_path(self.URL[split], cache_dir=os.path.abspath(self.root))\n\n        self.path = os.path.join(self.root, self._EXTRACTED_FILES[split])\n        assert os.path.exists(self.path), f\"{self.path} is not exists!\"\n        
self.max_length = max_length\n\n        self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        assert split in ['train', 'test']\n\n        def _modify_res(x):\n            return (x[3], x[4], int(x[0]))\n\n        self.samples = parse_csv(\n            self.path, skip_lines=1, delimiter=\"\\t\", map_funcs=_modify_res)\n\n    def __getitem__(self, idx):\n        sample = self.samples[idx]\n\n        encoded_inputs = self.tokenizer(\n            sample[0],\n            text_pair=sample[1],\n            padding=\"max_length\",\n            truncation=\"longest_first\",\n            max_length=self.max_length,\n            return_token_type_ids=False)\n        input_ids = encoded_inputs['input_ids']\n        input_ids = paddle.to_tensor(input_ids)\n        return input_ids, sample[2]\n\n    def __len__(self):\n        return len(self.samples)\n\n    @property\n    def class_num(self):\n        return 2\n\n\nclass QQP(paddle.io.Dataset):\n    \"\"\"The Quora Question Pairs2 dataset is a collection of question pairs from the\n    community question-answering website Quora. 
The task is to determine whether a\n    pair of questions are semantically equivalent.\"\"\"\n\n    # ref https://huggingface.co/datasets/glue/blob/main/glue.py#L212-L239\n\n    URL = \"https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip\"\n    MD5 = \"884bf26e39c783d757acc510a2a516ef\"\n\n    NUM_LINES = {\n        \"train\": 363846,\n        \"dev\": 40430,\n        \"test\": 390961,\n    }\n\n    _PATH = \"QQP-clean.zip\"\n\n    DATASET_NAME = \"QQP\"\n\n    _EXTRACTED_FILES = {\n        \"train\": \"train.tsv\",\n        \"dev\": \"dev.tsv\",\n        \"test\": \"test.tsv\",\n    }\n\n    MAP_LABELS = {\"not_duplicate\": 0, \"duplicate\": 1}\n\n    def __init__(self, root, split, max_length=128):\n\n        self.root = root\n        self.split = split\n        if os.path.exists(self.root):\n            assert os.path.isdir(self.root)\n        else:\n            zip_path = cached_path(\n                self.URL, cache_dir=os.path.abspath(self.root))\n            unzip(\n                zip_path,\n                mode=\"r\",\n                out_dir=os.path.join(self.root, '..'),\n                delete=True)\n\n        self.path = os.path.join(self.root, self._EXTRACTED_FILES[split])\n        assert os.path.exists(self.path), f\"{self.path} is not exists!\"\n        self.max_length = max_length\n\n        self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        assert split in ['train', 'dev', 'test']\n\n        def _modify_res(x):\n            if split == 'test':\n                # test split for QQP doesn't have labels\n                return (x[1], x[2])\n            else:\n                return (x[3], x[4], int(x[5]))\n\n        self.samples = parse_csv(\n            self.path, skip_lines=1, delimiter=\"\\t\", map_funcs=_modify_res)\n\n    def __getitem__(self, idx):\n        sample = self.samples[idx]\n\n        encoded_inputs = self.tokenizer(\n            sample[0],\n            text_pair=sample[1],\n            padding=\"max_length\",\n     
       truncation=\"longest_first\",\n            max_length=self.max_length,\n            return_token_type_ids=False)\n        input_ids = encoded_inputs['input_ids']\n        input_ids = paddle.to_tensor(input_ids)\n        if self.split != 'test':\n            return input_ids, sample[2]\n        else:\n            return input_ids\n\n    def __len__(self):\n        return len(self.samples)\n\n    @property\n    def class_num(self):\n        return 2\n\n\nclass STSB(paddle.io.Dataset):\n    \"\"\"The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of\n    sentence pairs drawn from news headlines, video and image captions, and natural\n    language inference data. Each pair is human-annotated with a similarity score\n    from 1 to 5.\"\"\"\n\n    # ref https://huggingface.co/datasets/glue/blob/main/glue.py#L240-L267\n\n    URL = \"https://dl.fbaipublicfiles.com/glue/data/STS-B.zip\"\n    MD5 = \"d573676be38f1a075a5702b90ceab3de\"\n\n    NUM_LINES = {\n        \"train\": 5749,\n        \"dev\": 1500,\n        \"test\": 1379,\n    }\n\n    _PATH = \"STS-B.zip\"\n\n    DATASET_NAME = \"STSB\"\n\n    _EXTRACTED_FILES = {\n        \"train\": \"train.tsv\",\n        \"dev\": \"dev.tsv\",\n        \"test\": \"test.tsv\",\n    }\n\n    def __init__(self, root, split, max_length=128):\n\n        self.root = root\n        self.split = split\n        if os.path.exists(self.root):\n            assert os.path.isdir(self.root)\n        else:\n            zip_path = cached_path(\n                self.URL, cache_dir=os.path.abspath(self.root))\n            unzip(\n                zip_path,\n                mode=\"r\",\n                out_dir=os.path.join(self.root, '..'),\n                delete=True)\n\n        self.path = os.path.join(self.root, self._EXTRACTED_FILES[split])\n        assert os.path.exists(self.path), f\"{self.path} is not exists!\"\n        self.max_length = max_length\n\n        self.tokenizer = 
GPTTokenizer.from_pretrained(\"gpt2\")\n\n        assert split in ['train', 'dev', 'test']\n\n        def _modify_res(x):\n            if split == 'test':\n                # test split for STSB doesn't have labels\n                return (x[7], x[8])\n            else:\n                return (x[7], x[8], float(x[9]))\n\n        self.samples = parse_csv(\n            self.path, skip_lines=1, delimiter=\"\\t\", map_funcs=_modify_res)\n\n    def __getitem__(self, idx):\n        sample = self.samples[idx]\n\n        encoded_inputs = self.tokenizer(\n            sample[0],\n            text_pair=sample[1],\n            padding=\"max_length\",\n            truncation=\"longest_first\",\n            max_length=self.max_length,\n            return_token_type_ids=False)\n        input_ids = encoded_inputs['input_ids']\n        input_ids = paddle.to_tensor(input_ids)\n        if self.split != 'test':\n            # Note(GuoxiaWang): We need return shape [1] value,\n            # so that we can attain a batched label with shape [batchsize, 1].\n            # Because the logits shape is [batchsize, 1], and feed into MSE loss.\n            return input_ids, np.array([sample[2]], dtype=np.float32)\n        else:\n            return input_ids\n\n    def __len__(self):\n        return len(self.samples)\n\n    @property\n    def class_num(self):\n        return 2\n"
  },
  {
    "path": "ppfleetx/data/dataset/gpt_dataset.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport time\nimport numpy as np\nimport re\nimport math\nimport json\n\nimport paddle\n\nfrom ppfleetx.distributed.apis import env\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.data.tokenizers import GPTTokenizer\n\n# TODO(haohongxiang): to solve the problem of cross-reference\nimport paddlenlp\nfrom paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer\n\nmode_to_index = {\"Train\": 0, \"Eval\": 1, \"Test\": 2}\n\nMODEL_CLASSES = {\n    \"GPT\": (GPTTokenizer, \"gpt2\"),\n    \"MoE\": (GPTTokenizer, \"gpt2\"),\n    \"GPT-cn\": (GPTChineseTokenizer, \"gpt-cpm-large-cn\"),\n}\n\n\nclass GPTDataset(paddle.io.Dataset):\n    def __init__(self,\n                 input_dir,\n                 split,\n                 max_seq_len,\n                 num_samples,\n                 mode,\n                 model_type=\"GPT\",\n                 seed=1234):\n\n        files = get_train_data_file(input_dir)\n        files.sort()\n        input_dir = [files[0]]\n\n        local_rank = int(os.getenv(\"PADDLE_RANK_IN_NODE\", 0))\n\n        if local_rank == 0:\n            try:\n                import ppfleetx.data.data_tools.cpp.fast_index_map_helpers\n            except Exception as e:\n                start_time = time.time()\n                print('> compiling dataset index builder ...')\n                
from ppfleetx.data.data_tools.cpp.compile import compile_helper\n                compile_helper()\n                print(\n                    '>>> done with dataset index builder. Compilation time: {:.3f} '\n                    'seconds'.format(time.time() - start_time),\n                    flush=True)\n\n        device_world_size = paddle.distributed.get_world_size()\n\n        if device_world_size > 1 and local_rank != 0:\n            while True:\n                try:\n                    import ppfleetx.data.data_tools.cpp.fast_index_map_helpers\n                    break\n                except Exception as e:\n                    print(\"> wait for helpers to be compiled!\")\n                    time.sleep(1)\n\n        try:\n            data_world_size = env.get_data_world_size()\n\n            logger.info(\n                \"The distributed run, total device num:{}, distinct dataflow num:{}.\".\n                format(device_world_size, data_world_size))\n        except AttributeError:\n            pass\n\n        assert len(input_dir) == 1, \"GPT only support one dataset for now.\"\n\n        input_prefix = input_dir[0]\n\n        if os.path.isfile(input_prefix + \"_ids.npz\"):\n            logger.warning(\n                \"You are using compatible dataset, please make new dataset as the readme!\"\n            )\n            process_data = np.load(\n                input_prefix + \"_ids.npz\", mmap_mode=\"r+\", allow_pickle=True)\n            sample_ids = process_data[\"ids\"]\n            sample_lens = process_data[\"lens\"].astype(\"int32\")\n        else:\n            for suffix in [\"_ids.npy\", \"_idx.npz\"]:\n                if not os.path.isfile(input_prefix + suffix):\n                    raise ValueError(\"File Not found, %s\" %\n                                     (input_prefix + suffix))\n\n            sample_ids = np.load(\n                input_prefix + \"_ids.npy\", mmap_mode=\"r\", allow_pickle=True)\n            # All documment ids, 
extend as 1-D array.\n\n            process_data = np.load(input_prefix + \"_idx.npz\")\n            # The len(sample_lens) num of docs\n            # The sum(sample_lens) should equal len(sample_ids)\n            sample_lens = process_data[\"lens\"]\n\n        splits = get_train_valid_test_split_(split, len(sample_lens))\n        assert len(sample_lens) >= splits[\n            -1], \"The document nums should larger than max of splits, but %s < %s\" % (\n                len(sample_lens), splits[-1])\n\n        tokenizer_class, pretrained_name = MODEL_CLASSES[model_type]\n        tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n        self.input_dir = input_dir\n        self.max_seq_len = max_seq_len\n        self.mode = mode\n        self.name = \"gpt_\" + mode\n        self.eos_id = tokenizer.eos_token_id\n        self.sample_ids = sample_ids\n        self.sample_lens = sample_lens\n        self.build_data_file = (local_rank == 0)\n\n        if mode in mode_to_index.keys():\n            index = mode_to_index[mode]\n        else:\n            raise ValueError(\"valid str value for 'mode'\")\n\n        documents = np.arange(splits[index], splits[index + 1])\n        if documents is None:\n            document_ids = np.arange(0, self.sample_lens.shape[0])\n        else:\n            document_ids = documents\n\n        self.doc_idx, self.sample_idx, self.shuffle_idx = \\\n            construct_samples_and_shuffle_data(self.name, input_prefix, document_ids,\\\n                self.sample_lens, num_samples, max_seq_len, seed, self.build_data_file)\n\n        # The doc cumsum start pos\n        self.start_pos = [0] + np.cumsum(self.sample_lens).tolist()\n\n    def _construct_sample(self, tokens):\n        tokens = np.array(tokens).astype(\"int64\").tolist()\n        labels = tokens[1:]\n        tokens = tokens[:-1]\n        seq_length = len(tokens)\n        # Attention mask for the attention calulate\n        # attention_mask = np.tri(seq_length, 
seq_length).reshape((1, seq_length,\n        #  seq_length))\n        # The pad and eos tokens do not contribute the loss\n        loss_mask = np.ones(seq_length, dtype=\"float32\")\n        loss_mask[tokens == self.eos_id] = 0.0\n        position_ids = np.arange(0, seq_length, dtype=\"int64\")\n\n        labels = np.array(labels).astype(\"int64\")\n        tokens = np.array(tokens).astype(\"int64\")\n        if self.mode == \"Test\":\n            return [tokens, position_ids]\n        else:\n            return [tokens, position_ids, labels, loss_mask]\n\n    def _get_single_sample_from_idx(self, doc_index_f, doc_index_l, offset_f,\n                                    offset_l):\n        \"\"\"\n        The input means:\n            doc_index_f: data from the first doc.\n            doc_index_l: data from the last doc.\n            offset_f: offset of the first doc.\n            offset_l: offset of the last doc.\n        \"\"\"\n        # Data from the sample doc. just select the needed ids.\n        if doc_index_f == doc_index_l:\n            current_start_pos = self.start_pos[self.doc_idx[doc_index_f]]\n            return self.sample_ids[current_start_pos+offset_f:\\\n                       current_start_pos+offset_l+1].tolist()\n\n        # Data from multi docs.\n        else:\n            current_start_pos = self.start_pos[self.doc_idx[doc_index_f]]\n            next_start_pos = self.start_pos[self.doc_idx[doc_index_f] + 1]\n            tokens = self.sample_ids[current_start_pos + offset_f:\n                                     next_start_pos].tolist()\n            for i in range(doc_index_f + 1, doc_index_l):\n                current_start_pos = self.start_pos[self.doc_idx[i]]\n                next_start_pos = self.start_pos[self.doc_idx[i] + 1]\n                tokens.extend(self.sample_ids[current_start_pos:next_start_pos]\n                              .tolist())\n            last_start_pos = self.start_pos[self.doc_idx[doc_index_l]]\n            
tokens.extend(self.sample_ids[last_start_pos:last_start_pos +\n                                          offset_l + 1].tolist())\n\n        return tokens\n\n    def __getitem__(self, index):\n        idx = self.shuffle_idx[index]\n        # Start and end documents and offsets.\n        doc_index_f = self.sample_idx[idx][0]\n        doc_index_l = self.sample_idx[idx + 1][0]\n        offset_f = self.sample_idx[idx][1]\n        offset_l = self.sample_idx[idx + 1][1]\n        tokens = self._get_single_sample_from_idx(doc_index_f, doc_index_l,\n                                                  offset_f, offset_l)\n        return self._construct_sample(tokens)\n\n    def __len__(self):\n        return self.sample_idx.shape[0] - 1\n\n\ndef get_train_data_file(input_dir):\n    files = [\n        os.path.join(input_dir, f) for f in os.listdir(input_dir)\n        if (os.path.isfile(os.path.join(input_dir, f)) and str(f)\n            .endswith(\"_idx.npz\"))\n    ]\n    files = [x.replace(\"_idx.npz\", \"\") for x in files]\n    if len(files) == 0:\n        logger.warning(\n            \"Not found dataset with name of xxx_ids.npy and xxx_idx.npz! Try to found old compatible xxx_ids.npz file.\"\n        )\n    else:\n        return files\n\n    files = [\n        os.path.join(input_dir, f) for f in os.listdir(input_dir)\n        if (os.path.isfile(os.path.join(input_dir, f)) and str(f)\n            .endswith(\"_ids.npz\"))\n    ]\n\n    files = [x.replace(\"_ids.npz\", \"\") for x in files]\n\n    if len(files) == 0:\n        raise RuntimeError(\n            \"Not found dataset with name of xxx_ids.npz in given input_dir '{}'! 
\".\n            format(input_dir))\n    else:\n        return files\n\n\ndef get_train_valid_test_split_(splits, size):\n    \"\"\"\n    Get dataset splits from comma or '/' separated string list.\n    \"\"\"\n\n    splits = [float(s) for s in splits]\n    while len(splits) < 3:\n        splits.append(0.)\n    splits = splits[:3]\n    splits_sum = sum(splits)\n    assert splits_sum > 0.0\n    splits = [split / splits_sum for split in splits]\n    splits_index = [0]\n    for index, split in enumerate(splits):\n        splits_index.append(splits_index[index] + int(\n            round(split * float(size))))\n    diff = splits_index[-1] - size\n    for index in range(1, len(splits_index)):\n        splits_index[index] -= diff\n    assert len(splits_index) == 4\n    assert splits_index[-1] == size\n    return splits_index\n\n\ndef construct_samples_and_shuffle_data(name, data_prefix, documents, sizes,\n                                       num_samples, seq_length, seed,\n                                       build_data_file):\n    \"\"\"\n    documents: document index from 0 to len(docs)\n    sizes: the length list of all docs.\n    num_samples: total step*bs iterations of data.\n    seq_length: the sequence length.\n    sum(sizes) = tokens_per_epoch\n    data_nums = num_samples *  micro_batch_size\n    num_epochs = (data_nums + 1) // sum(sizes)\n    len(doc_idx) = num_epochs * sum(sizes)\n    \"\"\"\n    # Number of tokens in each epoch and number of required epochs.\n    tokens_per_epoch = _num_tokens(documents, sizes)\n    num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)\n    # Rng state\n    np_rng = np.random.RandomState(seed=seed)\n\n    # Filename of the index mappings.\n    _filename = data_prefix\n    _filename += '_{}_indexmap'.format(name)\n    _filename += '_{}ns'.format(num_samples)\n    _filename += '_{}sl'.format(seq_length)\n    doc_idx_filename = _filename + '_doc_idx.npy'\n    sample_idx_filename = _filename + '_sample_idx.npy'\n  
  shuffle_idx_filename = _filename + '_shuffle_idx.npy'\n\n    # Sava random state\n    savedState = np_rng.get_state()\n    # Build the indexed mapping if not exist.\n    if build_data_file:\n        if (not os.path.isfile(doc_idx_filename)) or \\\n           (not os.path.isfile(sample_idx_filename)) or \\\n           (not os.path.isfile(shuffle_idx_filename)):\n            if num_epochs == 1:\n                separate_last_epoch = False\n            else:\n                num_samples_from_epochs_minus_one = (\n                    (num_epochs - 1) * tokens_per_epoch - 1) // seq_length\n                last_epoch_num_samples = num_samples - \\\n                                         num_samples_from_epochs_minus_one\n                assert last_epoch_num_samples >= 0, \\\n                    'last epoch number of samples should be non-negative.'\n                num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length\n                assert last_epoch_num_samples < (num_samples_per_epoch + 1), \\\n                    'last epoch number of samples exceeded max value.'\n                separate_last_epoch = (\n                    last_epoch_num_samples < int(0.80 * num_samples_per_epoch))\n            # Note. len(doc_idx) = num_epochs * len(doc)\n            start_time = time.time()\n            doc_idx = _build_doc_idx(documents, num_epochs, np_rng,\n                                     separate_last_epoch)\n            np.save(doc_idx_filename, doc_idx, allow_pickle=True)\n            print(' > elasped time to build and save doc-idx mapping '\n                  '(seconds): {:4f}'.format(time.time() - start_time))\n            # sample-idx. 
pos of each seq_len of data.\n            start_time = time.time()\n            assert doc_idx.dtype == np.int32\n            assert sizes.dtype == np.int32\n\n            from ppfleetx.data.data_tools.cpp import fast_index_map_helpers\n\n            sample_idx = fast_index_map_helpers.build_sample_idx(\n                sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch)\n            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,\n            #                                num_epochs, tokens_per_epoch)\n\n            np.save(sample_idx_filename, sample_idx, allow_pickle=True)\n            print(' > elasped time to build and save sample-idx mapping '\n                  '(seconds): {:4f}'.format(time.time() - start_time))\n\n            # shuffle-idx.\n            start_time = time.time()\n\n            if separate_last_epoch:\n                num_samples_ = num_samples_from_epochs_minus_one\n            else:\n                num_samples_ = sample_idx.shape[0] - 1\n\n            # Shuffle all seq len data.\n            shuffle_idx = _build_shuffle_idx(num_samples_,\n                                             sample_idx.shape[0] - 1, np_rng)\n            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)\n            print(' > elasped time to build and save shuffle-idx mapping'\n                  ' (seconds): {:4f}'.format(time.time() - start_time))\n\n    else:\n        while True:\n            if (not os.path.isfile(doc_idx_filename)) or \\\n               (not os.path.isfile(sample_idx_filename)) or \\\n               (not os.path.isfile(shuffle_idx_filename)):\n                time.sleep(3)\n            else:\n                try:\n                    np.load(\n                        shuffle_idx_filename, allow_pickle=True, mmap_mode='r')\n                    break\n                except Exception as e:\n                    print(\n                        \"%s file is still writing or damaged, please wait a moment.\"\n         
               % shuffle_idx_filename)\n                    time.sleep(3)\n\n    # Restore random state\n    np_rng.set_state(savedState)\n\n    try:\n        if paddle.distributed.get_world_size() > 1:\n            if paddle.in_dynamic_mode():\n                paddle.distributed.barrier()\n    except AssertionError:\n        pass\n\n    # Load mappings.\n    doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')\n    sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')\n    shuffle_idx = np.load(\n        shuffle_idx_filename, allow_pickle=True, mmap_mode='r')\n    return doc_idx, sample_idx, shuffle_idx\n\n\ndef _num_tokens(documents, lens):\n    \"\"\"Total number of tokens in the dataset.\"\"\"\n    return np.sum(lens[documents])\n\n\ndef _num_epochs(tokens_per_epoch, seq_length, num_samples):\n    \"\"\"Based on number of samples and sequence lenght, calculate how many\n    epochs will be needed.\"\"\"\n    num_epochs = 0\n    total_tokens = 0\n    while True:\n        num_epochs += 1\n        total_tokens += tokens_per_epoch\n        if ((total_tokens - 1) // seq_length) >= num_samples:\n            return num_epochs\n\n\ndef _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):\n    \"\"\"\n    Build an array with length = number-of-epochs * number-of-documents.\n    Each index is mapped to a corresponding document.\n    \"\"\"\n    if not separate_last_epoch or num_epochs == 1:\n        doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]\n        doc_idx[:] = documents\n        # The documents repeat num_epochs times.\n        doc_idx = doc_idx.reshape(-1)\n        doc_idx = doc_idx.astype(np.int32)\n        np_rng.shuffle(doc_idx)\n        return doc_idx\n\n    doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False)\n    doc_idx_last = _build_doc_idx(documents, 1, np_rng, False)\n    return np.concatenate((doc_idx_first, doc_idx_last))\n\n\ndef _build_sample_idx(sizes, doc_idx, 
seq_length, num_epochs,\n                      tokens_per_epoch):\n    \"\"\"\n    num_samples + 1, pos of bs data\n    the distance between two points for sample idx is bs tokens.\n    \"\"\"\n    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length\n    sample_idx = np.zeros([int(num_samples) + 1, 2], dtype=np.int32)\n\n    sample_index = 0\n    doc_idx_index = 0\n    doc_offset = 0\n    sample_idx[sample_index][0] = doc_idx_index\n    sample_idx[sample_index][1] = doc_offset\n    sample_index += 1\n    while sample_index <= num_samples:\n        remaining_seq_length = seq_length + 1\n        while remaining_seq_length != 0:\n            doc_id = doc_idx[doc_idx_index]\n            doc_length = sizes[doc_id] - doc_offset\n            remaining_seq_length -= doc_length\n            if remaining_seq_length <= 0:\n                doc_offset += (remaining_seq_length + doc_length - 1)\n                remaining_seq_length = 0\n            else:\n                doc_idx_index += 1\n                doc_offset = 0\n        sample_idx[sample_index][0] = doc_idx_index\n        sample_idx[sample_index][1] = doc_offset\n        sample_index += 1\n\n    return sample_idx\n\n\ndef _build_shuffle_idx(num_samples, total_size, np_rng):\n    dtype_ = np.uint32\n    if total_size >= (np.iinfo(np.uint32).max - 1):\n        dtype_ = np.int64\n\n    shuffle_idx_first = np.arange(\n        start=0, stop=num_samples, step=1, dtype=dtype_)\n    np_rng.shuffle(shuffle_idx_first)\n    if num_samples == total_size:\n        return shuffle_idx_first\n\n    shuffle_idx_last = np.arange(\n        start=num_samples, stop=total_size, step=1, dtype=dtype_)\n    np_rng.shuffle(shuffle_idx_last)\n\n    return np.concatenate((shuffle_idx_first, shuffle_idx_last))\n\n\nclass LM_Eval_Dataset(paddle.io.Dataset):\n    def __init__(self,\n                 input_dir,\n                 max_seq_len,\n                 overlapping_eval=None,\n                 model_type=\"GPT\",\n                 
**kwargs):\n        tokenizer_class, pretrained_name = MODEL_CLASSES[model_type]\n        tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n        with open(input_dir, \"rb\") as reader:\n            entire_data = reader.read().decode('utf-8')\n\n        self.num_original_tokens = len(entire_data.strip().split(\" \"))\n        entire_data = self._wikitext_detokenizer(entire_data)\n        self.tokens = tokenizer.encode(entire_data)\n        self.num_tokenized_tokens = len(self.tokens)\n        print('Original Tokens: %d, Detokenized tokens: %d' %\n              (self.num_original_tokens, self.num_tokenized_tokens))\n\n        self.seq_len = max_seq_len\n        self.pad_idx = tokenizer.eos_token_id\n        self.overlapping_eval = overlapping_eval\n        if self.overlapping_eval is None:\n            self.overlapping_eval = self.seq_len\n        self.overlapping_eval = max(1, self.overlapping_eval)\n\n        self.total_targets = len(self.tokens) - 1\n        # remove first sequence tokens\n        targets = max(self.total_targets - self.overlapping_eval, 0)\n        self.total_sequences = max(\n            math.ceil(targets / self.overlapping_eval) + 1, 1)\n\n    def __len__(self):\n        return self.total_sequences\n\n    def _construct_sample(self, tokens):\n        tokens = np.array(tokens).astype(\"int64\").tolist()\n        labels = tokens[1:]\n        tokens = tokens[:-1]\n        seq_length = len(tokens)\n        # attention mask for the attention calulate\n        attention_mask = np.tri(seq_length, seq_length).reshape(\n            (1, seq_length, seq_length))\n\n        # the pad and eos tokens do not contribute the loss\n        loss_mask = np.ones(seq_length, dtype=\"float32\")\n        loss_mask[tokens == self.pad_idx] = 0.0\n        position_ids = np.arange(0, seq_length, dtype=\"int64\")\n\n        # -INF mask value as default\n        # attention_mask = (attention_mask - 1.0) * 1e9\n        # Bool mask of attention\n        
attention_mask = attention_mask.astype(\"float32\")\n        return [tokens, loss_mask, attention_mask, position_ids, labels]\n\n    def __getitem__(self, idx):\n        start_idx = idx * self.overlapping_eval\n        end_idx = start_idx + self.seq_len\n        tokens = self.tokens[start_idx:end_idx + 1]\n        num_tokens = len(tokens)\n        if num_tokens < self.seq_len + 1:\n            num_pad = (self.seq_len + 1 - num_tokens)\n            tokens += [self.pad_idx] * num_pad\n        [tokens, loss_mask, attention_mask, position_ids,\n         labels] = self._construct_sample(tokens)\n        if self.overlapping_eval != self.seq_len and idx != 0:\n            loss_mask[:-self.overlapping_eval] *= 0\n\n        return [tokens, loss_mask, attention_mask, position_ids, labels, \\\n            np.array([self.num_original_tokens, self.num_tokenized_tokens])]\n\n    def _wikitext_detokenizer(self, string):\n        # contractions\n        string = string.replace(\"s '\", \"s'\")\n        string = re.sub(r\"/' [0-9]/\", r\"/'[0-9]/\", string)\n        # number separators\n        string = string.replace(\" @-@ \", \"-\")\n        string = string.replace(\" @,@ \", \",\")\n        string = string.replace(\" @.@ \", \".\")\n        # punctuation\n        string = string.replace(\" : \", \": \")\n        string = string.replace(\" ; \", \"; \")\n        string = string.replace(\" . \", \". \")\n        string = string.replace(\" ! \", \"! \")\n        string = string.replace(\" ? \", \"? 
\")\n        string = string.replace(\" , \", \", \")\n        # double brackets\n        string = re.sub(r\"\\(\\s*([^\\)]*?)\\s*\\)\", r\"(\\1)\", string)\n        string = re.sub(r\"\\[\\s*([^\\]]*?)\\s*\\]\", r\"[\\1]\", string)\n        string = re.sub(r\"{\\s*([^}]*?)\\s*}\", r\"{\\1}\", string)\n        string = re.sub(r\"\\\"\\s*([^\\\"]*?)\\s*\\\"\", r'\"\\1\"', string)\n        string = re.sub(r\"'\\s*([^']*?)\\s*'\", r\"'\\1'\", string)\n        # miscellaneous\n        string = string.replace(\"= = = =\", \"====\")\n        string = string.replace(\"= = =\", \"===\")\n        string = string.replace(\"= =\", \"==\")\n        string = string.replace(\" \" + chr(176) + \" \", chr(176))\n        string = string.replace(\" \\n\", \"\\n\")\n        string = string.replace(\"\\n \", \"\\n\")\n        string = string.replace(\" N \", \" 1 \")\n        string = string.replace(\" 's\", \"'s\")\n        return string\n\n\nclass Lambada_Eval_Dataset(paddle.io.Dataset):\n    def __init__(self, input_dir, max_seq_len, model_type=\"GPT\", **kwargs):\n        tokenizer_class, pretrained_name = MODEL_CLASSES[model_type]\n        tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n        tokenized_data = []\n        tokenized_label = []\n        with open(input_dir, 'r') as f:\n            for line in f.readlines():\n                text = json.loads(line)['text']\n                tokens, labels = self._get_tokens(tokenizer, text)\n                tokenized_data.append(tokens)\n                tokenized_label.append(labels)\n\n        self.pad_idx = tokenizer.eos_token_id\n        self.seq_len = max_seq_len\n        self.tokens = tokenized_data\n        self.labels = tokenized_label\n\n    def __len__(self):\n        return len(self.tokens)\n\n    def _construct_sample(self, tokens):\n        tokens = np.array(tokens).astype(\"int64\").tolist()\n        labels = tokens[1:]\n        tokens = tokens[:-1]\n\n        seq_length = len(tokens)\n        # 
attention mask for the attention calulate\n        attention_mask = np.tri(seq_length, seq_length).reshape(\n            (1, seq_length, seq_length))\n\n        # the pad and eos tokens do not contribute the loss\n        position_ids = np.arange(0, seq_length, dtype=\"int64\")\n\n        # -INF mask value as default\n        #attention_mask = (attention_mask - 1.0) * 1e9\n        # Bool mask of attention\n        attention_mask = attention_mask.astype(\"float32\")\n        return [tokens, attention_mask, position_ids, labels]\n\n    def __getitem__(self, idx):\n        tokens = self.tokens[idx][:self.seq_len]\n        labels = self.labels[idx]\n        tokens = tokens + labels\n        num_tokens = len(tokens)\n        if num_tokens < self.seq_len + 1:\n            num_pad = (self.seq_len + 1 - num_tokens)\n            tokens += [self.pad_idx] * num_pad\n        loss_mask = np.zeros(self.seq_len, dtype=\"float32\")\n        loss_mask[num_tokens - len(labels) - 1:num_tokens - 1] = 1.\n        [tokens, attention_mask, position_ids,\n         labels] = self._construct_sample(tokens)\n        return [\n            tokens, loss_mask, attention_mask, position_ids, labels,\n            np.array([self.__len__()])\n        ]\n\n    def _get_tokens(self, tokenizer, text, strict=True):\n        if not strict:\n            tokens = tokenizer.encode(text)\n            return tokens[:-1], [tokens[-1]]\n        last_token = text.split()[-1]\n        start_idx = text.rfind(last_token)\n        beginning_tokens = tokenizer.encode(text[:start_idx].strip())\n        last_token = tokenizer.encode(' ' + last_token)\n        return beginning_tokens, last_token\n"
  },
  {
    "path": "ppfleetx/data/dataset/multimodal_dataset.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport time\nimport gzip\n\nimport random\nimport base64\nimport numpy as np\nimport blobfile as bf\n\nfrom random import randint, choice\nfrom tqdm import tqdm\nfrom io import BytesIO\nfrom pathlib import Path\nfrom copy import deepcopy\nimport PIL\nfrom PIL import Image, ImageFile\n\nimport paddle\nfrom paddle.io import Dataset, DataLoader\nfrom paddle.distributed import get_world_size\nfrom paddle.vision import transforms as T\n\nfrom ppfleetx.utils.log import logger\n\n\ndef get_keys(data_path, gpu_num):\n    files = [\n        file.strip() for file in open(data_path).readlines()\n        if file.strip() != \"\"\n    ]\n    local_rank = paddle.distributed.get_rank()\n\n    if len(files) % gpu_num == 0:\n        keys_extend = list(files)\n    else:\n        added_num = gpu_num - (len(files) % gpu_num)\n        try:\n            keys_extend = files + random.sample(files, added_num)\n        except:\n            keys_extend = files + random.sample(files, 1) * added_num\n\n    keys = keys_extend[local_rank::gpu_num]\n    logger.info(\"keys: {} {}\".format(keys, local_rank))\n\n    return keys\n\n\nclass ImagenDataset(Dataset):\n    def __init__(self,\n                 input_path,\n                 image_format='base64',\n                 shuffle=False,\n                 image_size=64,\n                 
text_max_len=128,\n                 filter_image_resolution=128,\n                 tokenizer=None,\n                 sr=False,\n                 split='train',\n                 interpolation=\"bicubic\",\n                 flip_p=0.5):\n        super().__init__()\n        device_world_size = paddle.distributed.get_world_size()\n        self.filename = get_keys(input_path, gpu_num=device_world_size)\n        if shuffle:\n            random.shuffle(self.filename)\n        self.filter_image_resolution = filter_image_resolution\n        self.text_max_len = text_max_len\n        self.split = split\n        self.tokenizer = tokenizer\n        self.sr = sr\n        if sr:\n            self.transform = T.Compose([T.Resize(image_size), T.ToTensor()])\n\n        self.for_line = self.get_line_for_line(self.filename).__iter__()\n\n        self.good_index = []\n\n        self.interpolation = {\n            \"linear\": PIL.Image.LINEAR,\n            \"bilinear\": PIL.Image.BILINEAR,\n            \"bicubic\": PIL.Image.BICUBIC,\n            \"lanczos\": PIL.Image.LANCZOS,\n        }[interpolation]\n        self.flip = T.RandomHorizontalFlip(prob=flip_p)\n        self.image_size = image_size\n\n    def load_path(self, data_path, f_index=None):\n        if f_index is None:\n            offset = 0\n            with open(data_path, 'rb') as f:\n                for line in tqdm(f, desc='Loading data'):\n                    self.indexes.append((offset, len(line)))\n                    offset += len(line)\n        else:\n            offset = 0\n            with open(data_path, 'rb') as f:\n                for line in tqdm(f, desc='Loading data'):\n                    self.indexes.append(((offset, len(line)), f_index))\n                    offset += len(line)\n\n        if self.split == 'train':\n            random.shuffle(self.indexes)\n        return\n\n    @staticmethod\n    def base64_to_image(base64_str):\n        byte_data = base64.b64decode(base64_str)\n        image_data = 
BytesIO(byte_data)\n        img = Image.open(image_data)\n        if img.mode != 'RGB':\n            img = img.convert('RGB')\n        return img\n\n    def get_line_for_line(self, filename):\n        while True:\n            for fname in filename:\n                if fname[-2:] != \"gz\":\n                    file = open(fname)\n                    for line in file:\n                        if line != \"\":\n                            data = line.strip().split('\\t')\n                            image_base64 = data[4]\n                            image_item = self.base64_to_image(image_base64)\n                            if min(image_item.size) >= self.image_size:\n                                yield line\n                else:\n                    file = gzip.GzipFile(fname, \"r\")\n                    for line in file:\n                        if line != \"\":\n                            line = line.decode()\n                            data = line.strip().split('\\t')\n                            image_base64 = data[4]\n                            image_item = self.base64_to_image(image_base64)\n                            if min(image_item.size) >= self.image_size:\n                                yield line\n\n    def __getitem__(self, index):\n        if not isinstance(self.filename, list):\n            data = self.for_line.__next__()\n        else:\n            data = self.for_line.__next__()\n\n        data = data.strip().split('\\t')\n\n        # For laion 400m\n        if len(data) == 6:\n            image_base64 = data[4]\n            caption = data[2]\n\n        image_item = self.base64_to_image(image_base64)\n\n        # Filter image resolution\n        if min(image_item.size) < self.filter_image_resolution:\n            return None\n\n        if not self.sr:\n            self.transform = T.Compose([\n                T.CenterCrop([min(image_item.size), min(image_item.size)]),\n                T.Resize(64), T.ToTensor()\n            ])\n           
 image_item = self.transform(image_item)\n        else:\n            img = np.array(image_item).astype(np.uint8)\n\n            crop = min(img.shape[0], img.shape[1])\n            h, w, = img.shape[0], img.shape[1]\n\n            if img.shape[0] > img.shape[1]:\n                img = img[0:crop, (w - crop) // 2:(w + crop) // 2]\n            else:\n                img = img[(h - crop) // 2:(h + crop) // 2, (w - crop) // 2:(\n                    w + crop) // 2]\n\n            image = Image.fromarray(img)\n            image = image.resize(\n                (self.image_size, self.image_size),\n                resample=self.interpolation)\n\n            image_item = self.transform(image)\n\n        example = {'id': index, 'image': image_item, 'caption': caption}\n        return example\n\n    def __len__(self):\n        #return len(self.indexes)\n        if self.sr:\n            return 300000000\n        return 5000000\n"
  },
  {
    "path": "ppfleetx/data/dataset/vision_dataset.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport os.path\nimport copy\nimport numpy as np\nfrom typing import Any, Callable, cast, Dict, List, Optional, Tuple\n\nimport paddle\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.data.transforms.utils import create_preprocess_operators, transform\n\n__all__ = [\n    \"GeneralClsDataset\",\n    \"ImageFolder\",\n    \"CIFAR10\",\n    \"ContrativeLearningDataset\",\n]\n\n\nclass GeneralClsDataset(paddle.io.Dataset):\n    def __init__(self,\n                 image_root,\n                 cls_label_path,\n                 transform_ops=None,\n                 delimiter=\" \",\n                 multi_label=False,\n                 class_num=None):\n        if multi_label:\n            assert class_num is not None, \"Must set class_num when multi_label=True\"\n        self.multi_label = multi_label\n        self.classes_num = class_num\n\n        self._img_root = image_root\n        self._cls_path = cls_label_path\n        self.delimiter = delimiter\n        self._transform_ops = None\n        if transform_ops:\n            self._transform_ops = create_preprocess_operators(transform_ops)\n\n        self.images = []\n        self.labels = []\n        self._load_anno()\n\n    def _load_anno(self):\n        assert os.path.exists(\n            self._cls_path), f\"{self._cls_path} does not exists\"\n        assert 
os.path.exists(\n            self._img_root), f\"{self._img_root} does not exists\"\n        self.images = []\n        self.labels = []\n\n        with open(self._cls_path) as fd:\n            lines = fd.readlines()\n            for l in lines:\n                l = l.strip().split(self.delimiter)\n                self.images.append(os.path.join(self._img_root, l[0]))\n                if self.multi_label:\n                    self.labels.append(l[1])\n                else:\n                    self.labels.append(np.int32(l[1]))\n                assert os.path.exists(self.images[\n                    -1]), f\"{self.images[-1]} does not exists\"\n\n    def __getitem__(self, idx):\n        try:\n            with open(self.images[idx], 'rb') as f:\n                img = f.read()\n            if self._transform_ops:\n                img = transform(img, self._transform_ops)\n            if self.multi_label:\n                one_hot = np.zeros([self.classes_num], dtype=np.float32)\n                cls_idx = [int(e) for e in self.labels[idx].split(',')]\n                for idx in cls_idx:\n                    one_hot[idx] = 1.0\n                return (img, one_hot)\n            else:\n                return (img, np.int32(self.labels[idx]))\n\n        except Exception as ex:\n            logger.error(\"Exception occured when parse line: {} with msg: {}\".\n                         format(self.images[idx], ex))\n            rnd_idx = np.random.randint(self.__len__())\n            return self.__getitem__(rnd_idx)\n\n    def __len__(self):\n        return len(self.images)\n\n    @property\n    def class_num(self):\n        if self.multi_label:\n            return self.classes_num\n        return len(set(self.labels))\n\n\nIMG_EXTENSIONS = (\".jpg\", \".jpeg\", \".png\", \".ppm\", \".bmp\", \".pgm\", \".tif\",\n                  \".tiff\", \".webp\")\n\n\nclass ImageFolder(paddle.io.Dataset):\n    \"\"\" Code ref from 
https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py\n    \n    A generic data loader where the images are arranged in this way by default: ::\n\n        root/dog/xxx.png\n        root/dog/xxy.png\n        root/dog/[...]/xxz.png\n\n        root/cat/123.png\n        root/cat/nsdf3.png\n        root/cat/[...]/asd932_.png\n\n    This class inherits from :class:`~torchvision.datasets.DatasetFolder` so\n    the same methods can be overridden to customize the dataset.\n\n    Args:\n        root (string): Root directory path.\n        transform (callable, optional): A function/transform that  takes in an PIL image\n            and returns a transformed version. E.g, ``transforms.RandomCrop``\n        target_transform (callable, optional): A function/transform that takes in the\n            target and transforms it.\n        loader (callable, optional): A function to load an image given its path.\n        is_valid_file (callable, optional): A function that takes path of an Image file\n            and check if the file is a valid file (used to check of corrupt files)\n\n     Attributes:\n        classes (list): List of the class names sorted alphabetically.\n        class_to_idx (dict): Dict with items (class_name, class_index).\n        imgs (list): List of (image path, class_index) tuples\n    \"\"\"\n\n    def __init__(self, root, extensions=IMG_EXTENSIONS, transform_ops=None):\n\n        self.root = root\n        classes, class_to_idx = self.find_classes(self.root)\n        samples = self.make_dataset(self.root, class_to_idx, extensions)\n        logger.info(\n            f'find total {len(classes)} classes and {len(samples)} images.')\n\n        self.extensions = extensions\n\n        self.classes = classes\n        self.class_to_idx = class_to_idx\n        self.imgs = samples\n        self.targets = [s[1] for s in samples]\n\n        self._transform_ops = None\n        if transform_ops:\n            self._transform_ops = 
create_preprocess_operators(transform_ops)\n\n    @staticmethod\n    def make_dataset(\n            directory,\n            class_to_idx,\n            extensions=None,\n            is_valid_file=None, ):\n        \"\"\"Generates a list of samples of a form (path_to_sample, class).\n\n        Args:\n            directory (str): root dataset directory, corresponding to ``self.root``.\n            class_to_idx (Dict[str, int]): Dictionary mapping class name to class index.\n            extensions (optional): A list of allowed extensions.\n                Either extensions or is_valid_file should be passed. Defaults to None.\n            is_valid_file (optional): A function that takes path of a file\n                and checks if the file is a valid file\n                (used to check of corrupt files) both extensions and\n                is_valid_file should not be passed. Defaults to None.\n\n        Raises:\n            ValueError: In case ``class_to_idx`` is empty.\n            ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None.\n            FileNotFoundError: In case no valid file was found for any class.\n\n        Returns:\n            List[Tuple[str, int]]: samples of a form (path_to_sample, class)\n        \"\"\"\n        if class_to_idx is None:\n            # prevent potential bug since make_dataset() would use the class_to_idx logic of the\n            # find_classes() function, instead of using that of the find_classes() method, which\n            # is potentially overridden and thus could have a different logic.\n            raise ValueError(\"The class_to_idx parameter cannot be None.\")\n\n        directory = os.path.expanduser(directory)\n\n        both_none = extensions is None and is_valid_file is None\n        both_something = extensions is not None and is_valid_file is not None\n        if both_none or both_something:\n            raise ValueError(\n                \"Both extensions and is_valid_file cannot be 
None or not None at the same time\"\n            )\n\n        if extensions is not None:\n\n            def is_valid_file(filename: str) -> bool:\n                return filename.lower().endswith(\n                    extensions\n                    if isinstance(extensions, str) else tuple(extensions))\n\n        is_valid_file = cast(Callable[[str], bool], is_valid_file)\n\n        instances = []\n        available_classes = set()\n        for target_class in sorted(class_to_idx.keys()):\n            class_index = class_to_idx[target_class]\n            target_dir = os.path.join(directory, target_class)\n            if not os.path.isdir(target_dir):\n                continue\n            for root, _, fnames in sorted(\n                    os.walk(\n                        target_dir, followlinks=True)):\n                for fname in sorted(fnames):\n                    path = os.path.join(root, fname)\n                    if is_valid_file(path):\n                        item = path, class_index\n                        instances.append(item)\n\n                        if target_class not in available_classes:\n                            available_classes.add(target_class)\n\n        empty_classes = set(class_to_idx.keys()) - available_classes\n        if empty_classes:\n            msg = f\"Found no valid file for the classes {', '.join(sorted(empty_classes))}. 
\"\n            if extensions is not None:\n                msg += f\"Supported extensions are: {extensions if isinstance(extensions, str) else ', '.join(extensions)}\"\n            raise FileNotFoundError(msg)\n\n        return instances\n\n    def find_classes(self, directory):\n        \"\"\"Find the class folders in a dataset structured as follows::\n\n            directory/\n            ├── class_x\n            │   ├── xxx.ext\n            │   ├── xxy.ext\n            │   └── ...\n            │       └── xxz.ext\n            └── class_y\n                ├── 123.ext\n                ├── nsdf3.ext\n                └── ...\n                └── asd932_.ext\n\n        This method can be overridden to only consider\n        a subset of classes, or to adapt to a different dataset directory structure.\n\n        Args:\n            directory(str): Root directory path, corresponding to ``self.root``\n\n        Raises:\n            FileNotFoundError: If ``dir`` has no class folders.\n\n        Returns:\n            (Tuple[List[str], Dict[str, int]]): List of all classes and dictionary mapping each class to an index.\n        \"\"\"\n\n        classes = sorted(\n            entry.name for entry in os.scandir(directory) if entry.is_dir())\n        if not classes:\n            raise FileNotFoundError(\n                f\"Couldn't find any class folder in {directory}.\")\n\n        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}\n        return classes, class_to_idx\n\n    def __getitem__(self, idx):\n        try:\n            path, target = self.imgs[idx]\n            with open(path, 'rb') as f:\n                img = f.read()\n            if self._transform_ops:\n                img = transform(img, self._transform_ops)\n\n            return (img, np.int32(target))\n\n        except Exception as ex:\n            logger.error(\"Exception occured when parse line: {} with msg: {}\".\n                         format(path, ex))\n            rnd_idx = 
np.random.randint(self.__len__())\n            return self.__getitem__(rnd_idx)\n\n    def __len__(self) -> int:\n        return len(self.imgs)\n\n    @property\n    def class_num(self):\n        return len(set(self.classes))\n\n\nclass CIFAR10(paddle.io.Dataset):\n    def __init__(\n            self,\n            root,\n            mode='train',\n            transform_ops=None, ):\n        self.root = root\n        self.mode = mode\n        assert self.mode in ['train', 'test']\n        self._transform_ops = None\n\n        self.URL = 'https://dataset.bj.bcebos.com/cifar/cifar-10-python.tar.gz'\n\n        if transform_ops:\n            self._transform_ops = create_preprocess_operators(transform_ops)\n\n        if not os.path.exists(os.path.join(self.root, f'data_batch_1')):\n            from ppfleetx.utils.download import cached_path\n            from ppfleetx.utils.file import untar\n            zip_path = cached_path(\n                self.URL, cache_dir=os.path.abspath(self.root))\n            untar(\n                zip_path,\n                mode=\"r:gz\",\n                out_dir=os.path.join(self.root, '..'),\n                delete=True)\n\n        # wait to download dataset\n        if paddle.distributed.get_world_size() > 1:\n            paddle.distributed.barrier()\n\n        self.images = []\n        self.labels = []\n        self._load_anno()\n\n    def _load_anno(self):\n        def unpickle(file):\n            import pickle\n            with open(file, 'rb') as fo:\n                dict = pickle.load(fo, encoding='bytes')\n            return dict\n\n        if self.mode == 'train':\n            for idx in range(1, 6):\n                path = os.path.join(self.root, f'data_batch_{idx}')\n                ret = unpickle(path)\n                data = ret[b'data']\n                labels = ret[b'labels']\n                for i in range(len(data)):\n                    img = data[i].reshape((3, 32, 32)).transpose((1, 2, 0))\n                    
self.images.append(img)\n                    self.labels.append(labels[i])\n        else:\n            path = os.path.join(self.root, f'test_batch')\n            ret = unpickle(path)\n            data = ret[b'data']\n            labels = ret[b'labels']\n            for i in range(len(data)):\n                img = data[i].reshape((3, 32, 32)).transpose((1, 2, 0))\n                self.images.append(img)\n                self.labels.append(labels[i])\n\n    def __getitem__(self, idx):\n        img = self.images[idx]\n        if self._transform_ops:\n            img = transform(img, self._transform_ops)\n\n        return (img, np.int32(self.labels[idx]))\n\n    def __len__(self):\n        return len(self.images)\n\n    @property\n    def class_num(self):\n        return len(set(self.labels))\n\n\nclass ContrativeLearningDataset(ImageFolder):\n    \"\"\" Code ref from https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py\n    \n    A generic data loader where the images are arranged in this way by default: ::\n\n        root/dog/xxx.png\n        root/dog/xxy.png\n        root/dog/[...]/xxz.png\n\n        root/cat/123.png\n        root/cat/nsdf3.png\n        root/cat/[...]/asd932_.png\n    \"\"\"\n\n    def __init__(self, root, extensions=IMG_EXTENSIONS, transform_ops=None):\n        super(ContrativeLearningDataset, self).__init__(\n            root, extensions=extensions, transform_ops=transform_ops)\n\n        # remove unused attr\n        del self.classes\n        del self.class_to_idx\n        del self.targets\n        # only use image path\n        self.imgs = [s[0] for s in self.imgs]\n\n    def __getitem__(self, idx):\n        try:\n            path = self.imgs[idx]\n            with open(path, 'rb') as f:\n                img = f.read()\n            if self._transform_ops:\n                img1 = transform(img, self._transform_ops)\n                img2 = transform(img, self._transform_ops)\n\n            return img1, img2\n\n        except 
Exception as ex:\n            logger.error(\"Exception occured when parse line: {} with msg: {}\".\n                         format(path, ex))\n            rnd_idx = np.random.randint(self.__len__())\n            return self.__getitem__(rnd_idx)\n\n    def __len__(self) -> int:\n        return len(self.imgs)\n\n    @property\n    def class_num(self):\n        raise NotImplementedError\n"
  },
  {
    "path": "ppfleetx/data/sampler/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .batch_sampler import *\nfrom .collate import Stack, Pad, Tuple, Dict\n"
  },
  {
    "path": "ppfleetx/data/sampler/batch_sampler.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import print_function\nfrom __future__ import division\n\nimport os\nimport sys\nimport numpy as np\nimport math\n\nimport paddle\nfrom paddle.io import DistributedBatchSampler\n\nfrom ppfleetx.distributed.apis import env\n\n__all__ = [\"GPTBatchSampler\", \"DistributedBatchSampler\"]\n\n\nclass GPTBatchSampler(paddle.io.BatchSampler):\n    \"\"\"Sampler that restricts data loading to a subset of the dataset.\n    In such case, each process can pass a DistributedBatchSampler instance \n    as a DataLoader sampler, and load a subset of the original dataset that \n    is exclusive to it.\n    .. 
note::\n        Dataset is assumed to be of constant size.\n        \n    Args:\n        dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement\n                     or other python object which implemented\n                     `__len__` for BatchSampler to get sample\n                     number of data source.\n        batch_size(int): sample indice number in a mini-batch indices.\n        num_replicas(int, optional): porcess number in distributed training.\n            If :attr:`num_replicas` is None, :attr:`num_replicas` will be\n            retrieved from :code:`paddle.distributed.ParallenEnv`.\n            Default None.\n        rank(int, optional): the rank of the current process among :attr:`num_replicas`\n            processes. If :attr:`rank` is None, :attr:`rank` is retrieved from\n            :code:`paddle.distributed.ParallenEnv`. Default None.\n        shuffle(bool): whther to shuffle indices order before genrating\n            batch indices. Default False.\n        drop_last(bool): whether drop the last incomplete batch dataset size\n            is not divisible by the batch size. Default False\n    Examples:\n        .. 
code-block:: python\n            import numpy as np\n            from paddle.io import Dataset, DistributedBatchSampler\n            # init with dataset\n            class RandomDataset(Dataset):\n                def __init__(self, num_samples):\n                    self.num_samples = num_samples\n            \n                def __getitem__(self, idx):\n                    image = np.random.random([784]).astype('float32')\n                    label = np.random.randint(0, 9, (1, )).astype('int64')\n                    return image, label\n                \n                def __len__(self):\n                    return self.num_samples\n  \n            dataset = RandomDataset(100)\n            sampler = DistributedBatchSampler(dataset, batch_size=64)\n            for data in sampler:\n                # do something\n                break\n    \"\"\"\n\n    def __init__(self,\n                 dataset,\n                 batch_size,\n                 num_replicas=None,\n                 rank=None,\n                 shuffle=False,\n                 drop_last=False,\n                 consumed_samples=0):\n        self.dataset = dataset\n\n        assert isinstance(batch_size, int) and batch_size > 0, \\\n                \"batch_size should be a positive integer\"\n        self.batch_size = batch_size\n        assert isinstance(shuffle, bool), \\\n                \"shuffle should be a boolean value\"\n        self.shuffle = shuffle\n        assert isinstance(drop_last, bool), \\\n                \"drop_last should be a boolean number\"\n\n        from paddle.distributed import ParallelEnv\n\n        if num_replicas is not None:\n            assert isinstance(num_replicas, int) and num_replicas > 0, \\\n                    \"num_replicas should be a positive integer\"\n            self.nranks = num_replicas\n        else:\n            self.nranks = env.get_data_world_size()\n\n        if rank is not None:\n            assert isinstance(rank, int) and rank >= 0, \\\n      
              \"rank should be a non-negative integer\"\n            self.local_rank = rank\n        else:\n            self.local_rank = env.get_data_world_rank()\n\n        self.drop_last = drop_last\n        self.epoch = 0\n\n        self.consumed_samples = consumed_samples\n        self.num_samples = int(\n            math.ceil(len(self.dataset) * 1.0 / self.nranks))\n        self.total_size = self.num_samples * self.nranks\n\n    def get_start_end_idx(self):\n        start_idx = self.local_rank * self.batch_size\n        end_idx = start_idx + self.batch_size\n        return start_idx, end_idx\n\n    def __iter__(self):\n        assert self.consumed_samples % self.nranks == 0, \\\n            \"The consumed_samples should be divided by nranks. consumed_samples=%d, nranks=%s\" % (\n            self.consumed_samples, self.nranks)\n        self.remain_num_samples = int(\n            math.ceil((len(self.dataset) - self.consumed_samples) * 1.0 /\n                      self.nranks))\n        self.remain_total_size = self.remain_num_samples * self.nranks\n        self.batch_size_times_rank_size = self.batch_size * self.nranks\n\n        num_samples = len(self.dataset)\n        batch_indices = []\n        for idx in range(self.consumed_samples, self.total_size):\n            if idx >= num_samples:\n                batch_indices.append(idx - num_samples)\n            else:\n                batch_indices.append(idx)\n            if len(batch_indices) == self.batch_size_times_rank_size:\n                start_idx, end_idx = self.get_start_end_idx()\n                yield batch_indices[start_idx:end_idx]\n                batch_indices = []\n        if not self.drop_last and len(batch_indices) > 0:\n            yield batch_indices\n\n    def __len__(self):\n        num_samples = self.num_samples\n        num_samples += int(not self.drop_last) * (self.batch_size - 1)\n        return num_samples // self.batch_size\n\n    def set_epoch(self, epoch=0, consumed_samples=0):\n     
   \"\"\"\n        Sets the epoch number. When :attr:`shuffle=True`, this number is used\n        as seeds of random numbers. By default, users may not set this, all\n        replicas (workers) use a different random ordering for each epoch.\n        If set same number at each epoch, this sampler will yield the same\n        ordering at all epoches.\n        Arguments:\n            epoch (int): Epoch number.\n        Examples:\n            .. code-block:: python\n    \n                from paddle.io import Dataset, DistributedBatchSampler\n    \n                # init with dataset\n                class RandomDataset(Dataset):\n                    def __init__(self, num_samples):\n                        self.num_samples = num_samples\n                \n                    def __getitem__(self, idx):\n                        image = np.random.random([784]).astype('float32')\n                        label = np.random.randint(0, 9, (1, )).astype('int64')\n                        return image, label\n                    \n                    def __len__(self):\n                        return self.num_samples\n      \n                dataset = RandomDataset(100)\n                sampler = DistributedBatchSampler(dataset, batch_size=64)\n    \n                for epoch in range(10):\n                    sampler.set_epoch(epoch)\n        \"\"\"\n        self.epoch = epoch\n        # if we reset the epoch, the consumed_samples should be set to 0.\n        self.consumed_samples = consumed_samples\n"
  },
  {
    "path": "ppfleetx/data/sampler/collate.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.\n\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport paddle\n\n__all__ = [\n    'Stack',\n    'Pad',\n    'Tuple',\n    'Dict',\n]\n\n\nclass Stack(object):\n    \"\"\"\n    Stacks the input data samples to construct the batch. The N input samples\n    must have the same shape/length and will be stacked to construct a batch.\n    Args:\n        axis (int, optional): The axis in the result data along which the input\n            data are stacked. Default: 0.\n        dtype (str|numpy.dtype, optional): The value type of the output. If it\n            is set to None, the type of input data is used. Default: None.\n    \"\"\"\n\n    def __init__(self, axis=0, dtype=None):\n        self._axis = axis\n        self._dtype = dtype\n\n    def __call__(self, data):\n        \"\"\"\n        Batchifies the input data by stacking.\n        Args:\n            data (list[numpy.ndarray]): The input data samples. It is a list. \n                Each element is a numpy.ndarray or list.\n        Returns:\n            numpy.ndarray: Stacked batch data.\n        Example:\n            .. 
code-block:: python\n                from paddlenlp.data import Stack\n                a = [1, 2, 3, 4]\n                b = [3, 4, 5, 6]\n                c = [5, 6, 7, 8]\n                result = Stack()([a, b, c])\n                '''\n                [[1, 2, 3, 4],\n                 [3, 4, 5, 6],\n                 [5, 6, 7, 8]]\n                '''\n        \"\"\"\n        data = np.stack(\n            data,\n            axis=self._axis).astype(self._dtype) if self._dtype else np.stack(\n                data, axis=self._axis)\n        return data\n\n\nclass Pad(object):\n    \"\"\"\n    Pads the input data samples to the largest length at `axis`.\n    Args:\n        pad_val (float|int, optional): The padding value. Default: 0.\n        axis (int, optional): The axis to pad the arrays. The arrays will be\n            padded to the largest length at `axis`. For example, assume the \n            input arrays have shape (10, 8, 5), (6, 8, 5), (3, 8, 5) and the \n            axis is 0. Each input will be padded into (10, 8, 5) and then \n            stacked to form the final output, which has shape (3, 10, 8, 5). \n            Default: 0.\n        ret_length (bool|numpy.dtype, optional): If it is bool, indicate whether\n            to return the valid length in the output, and the data type of\n            returned length is int32 if True. If it is numpy.dtype, indicate the\n            data type of returned length. Default: None.\n        dtype (numpy.dtype, optional): The value type of the output. If it is\n            set to None, the input data type is used. Default: None.\n        pad_right (bool, optional): Whether the padding direction is right-side. \n            If True, it indicates we pad to the right side, while False indicates \n            we pad to the left side. 
Default: True.\n     \"\"\"\n\n    def __init__(self,\n                 pad_val=0,\n                 axis=0,\n                 ret_length=None,\n                 dtype=None,\n                 pad_right=True):\n        self._pad_val = pad_val\n        self._axis = axis\n        self._ret_length = ret_length\n        self._dtype = dtype\n        self._pad_right = pad_right\n\n    def __call__(self, data):\n        \"\"\"\n        Batchifies the input data by padding. The input will be padded to the \n        largest dimension at `axis` and then stacked to form the final output. \n        In addition, the function will output the original dimensions at the \n        `axis` if `ret_length` is not None or False.\n        Args:\n            data (list[numpy.ndarray|list]): The input data samples. It is a \n                list. Each element is a numpy.ndarray or list.\n        Returns:\n            numpy.ndarray|tuple[numpy.ndarray]: If `ret_length` is False, it \n            is a numpy.ndarray representing the padded batch data and the \n            shape is (N, …). Otherwise, it is a tuple, besides the padded batch \n            data, the tuple also includes a numpy.ndarray representing original \n            length at `axis` of all input samples, which shaped `(N,)`. \n        Example:\n            .. 
code-block:: python\n                from paddlenlp.data import Pad\n                a = [1, 2, 3, 4]\n                b = [5, 6, 7]\n                c = [8, 9]\n                result = Pad(pad_val=0)([a, b, c])\n                '''\n                [[1, 2, 3, 4],\n                 [5, 6, 7, 0],\n                 [8, 9, 0, 0]]\n                '''\n        \"\"\"\n\n        # return data itself for rare unexpected cases when 1-D array is passed to Pad\n        if not isinstance(data[0], list) and not isinstance(data[0],\n                                                            np.ndarray):\n            return np.asarray(\n                data,\n                dtype=self._dtype if self._dtype is not None else np.int64)\n\n        arrs = [np.asarray(ele) for ele in data]\n        original_length = [ele.shape[self._axis] for ele in arrs]\n        max_size = max(original_length)\n        ret_shape = list(arrs[0].shape)\n        ret_shape[self._axis] = max_size\n        ret_shape = (len(arrs), ) + tuple(ret_shape)\n        ret = np.full(\n            shape=ret_shape,\n            fill_value=self._pad_val,\n            dtype=arrs[0].dtype if self._dtype is None else self._dtype)\n        for i, arr in enumerate(arrs):\n            if arr.shape[self._axis] == max_size:\n                ret[i] = arr\n            else:\n                slices = [slice(None) for _ in range(arr.ndim)]\n                if self._pad_right:\n                    slices[self._axis] = slice(0, arr.shape[self._axis])\n                else:\n                    slices[self._axis] = slice(\n                        max_size - arr.shape[self._axis], max_size)\n\n                if slices[self._axis].start != slices[self._axis].stop:\n                    slices = [slice(i, i + 1)] + slices\n                    ret[tuple(slices)] = arr\n        if self._ret_length:\n            return ret, np.asarray(\n                original_length,\n                dtype=\"int32\") if self._ret_length == True else 
np.asarray(\n                    original_length, self._ret_length)\n        else:\n            return ret\n\n\nclass Tuple(object):\n    \"\"\"\n    Wraps multiple batchify functions together. The input functions will be applied\n    to the corresponding input fields.\n    \n    Each sample should be a list or tuple containing multiple fields. The i'th\n    batchify function stored in Tuple will be applied on the i'th field. \n    \n    For example, when data sample is (nd_data, label), you can wrap two batchify\n    functions using `Tuple(DataBatchify, LabelBatchify)` to batchify nd_data and\n    label correspondingly.\n    Args:\n        fn (callable|list[callable]|tuple[callable]): The batchify functions to \n            wrap. It is a callable function or a list/tuple of callable functions.\n        args (tuple[callable]): The additional batchify functions to wrap.\n    \"\"\"\n\n    def __init__(self, fn, *args):\n        if isinstance(fn, (list, tuple)):\n            assert len(args) == 0, 'Input pattern not understood. The input of Tuple can be ' \\\n                                   'Tuple(A, B, C) or Tuple([A, B, C]) or Tuple((A, B, C)). ' \\\n                                   'Received fn=%s, args=%s' % (str(fn), str(args))\n            self._fn = fn\n        else:\n            self._fn = (fn, ) + args\n        for i, ele_fn in enumerate(self._fn):\n            assert callable(\n                ele_fn\n            ), 'Batchify functions must be callable! type(fn[%d]) = %s' % (\n                i, str(type(ele_fn)))\n\n    def __call__(self, data):\n        \"\"\"\n        Batchifies data samples by applying each function on the corresponding \n        data field, and each data field is produced by stacking the field data \n        of samples.\n        Args:\n            data (list|tuple): The samples to batchfy. 
Each sample in list/tuple\n                should contain `N` fields.\n        Returns:\n            tuple: A tuple composed of results from all including batchifying \n            functions.\n        Example:\n            .. code-block:: python\n                \n                from paddlenlp.data import Stack, Pad, Tuple\n                data = [\n                        [[1, 2, 3, 4], [1]],\n                        [[5, 6, 7], [0]],\n                        [[8, 9], [1]],\n                       ]\n                batchify_fn = Tuple(Pad(pad_val=0), Stack())\n                ids, label = batchify_fn(data)\n                '''\n                ids:\n                [[1, 2, 3, 4],\n                [5, 6, 7, 0],\n                [8, 9, 0, 0]]\n                label: [[1], [0], [1]]\n                '''\n        \"\"\"\n\n        assert len(data[0]) == len(self._fn),\\\n            'The number of attributes in each data sample should contain' \\\n            ' {} elements'.format(len(self._fn))\n        ret = []\n        for i, ele_fn in enumerate(self._fn):\n            result = ele_fn([ele[i] for ele in data])\n            if isinstance(result, (tuple, list)):\n                ret.extend(result)\n            else:\n                ret.append(result)\n        return tuple(ret)\n\n\nclass Dict(object):\n    \"\"\"\n    Wraps multiple batchify functions together. The input functions will be \n    applied to the corresponding input fields.\n    \n    Each sample should be a dict containing multiple fields. Each batchify \n    function with key stored in `Dict` will be applied on the field which has \n    the same key. \n    \n    For example, when data sample is {'tokens': tokens, 'labels': labels}, you \n    can wrap two batchify functions using \n    `Dict({'tokens': DataBatchify, 'labels': LabelBatchify})` to batchify tokens \n    and labels correspondingly.\n    Args:\n        fn (dict): The batchify functions to wrap. 
It is a dict, which values is \n            callable functions.\n    \"\"\"\n\n    def __init__(self, fn):\n        assert isinstance(fn, (dict)), 'Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn ' \\\n                                   'Received fn=%s' % (str(fn))\n\n        self._fn = fn\n\n        for col_name, ele_fn in self._fn.items():\n            assert callable(\n                ele_fn\n            ), 'Batchify functions must be callable! type(fn[%d]) = %s' % (\n                col_name, str(type(ele_fn)))\n\n    def __call__(self, data):\n        \"\"\"\n        Batchifies data samples by applying each function on the corresponding \n        data field, and each data field is produced by stacking the field data \n        with the same key as batchify functions of all samples.\n        Args:\n            data (list[dict]|tuple[dict]): The samples to batchfy. Each sample \n                in list/tuple is a dict with `N` key-values.\n                \n        Returns:\n            tuple: A tuple composed of results from all including batchifying \n            functions.\n            \n        Example:\n            .. 
code-block:: python\n                from paddlenlp.data import Stack, Pad, Dict\n                data = [\n                        {'labels':[1], 'token_ids':[1, 2, 3, 4]},\n                        {'labels':[0], 'token_ids':[5, 6, 7]},\n                        {'labels':[1], 'token_ids':[8, 9]},\n                       ]\n                batchify_fn = Dict({'token_ids':Pad(pad_val=0), 'labels':Stack()})\n                ids, label = batchify_fn(data)\n                '''\n                ids:\n                [[1, 2, 3, 4],\n                [5, 6, 7, 0],\n                [8, 9, 0, 0]]\n                label: [[1], [0], [1]]\n                '''\n        \"\"\"\n\n        ret = []\n        for col_name, ele_fn in self._fn.items():\n            result = ele_fn([ele[col_name] for ele in data])\n            if isinstance(result, (tuple, list)):\n                ret.extend(result)\n            else:\n                ret.append(result)\n        return tuple(ret)\n"
  },
  {
    "path": "ppfleetx/data/tokenizers/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .gpt_tokenizer import GPTTokenizer, GPTChineseTokenizer\nfrom .ernie_tokenizer import get_ernie_tokenizer\nfrom .t5_tokenizer import get_t5_tokenizer\nfrom .debertav2_tokenizer import get_debertav2_tokenizer\n"
  },
  {
    "path": "ppfleetx/data/tokenizers/debertav2_tokenizer.py",
    "content": "# coding=utf-8\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Tokenization for DebertaV2.\"\"\"\n\nfrom __future__ import (absolute_import, division, print_function,\n                        unicode_literals)\n\nimport os\nimport json\nimport copy\nimport logging\nimport warnings\nimport regex as re\nimport unicodedata\nimport sentencepiece as sp\nfrom collections import OrderedDict, UserDict\nfrom typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union\n\nfrom ppfleetx.utils.download import cached_path\nfrom ppfleetx.data.tokenizers.tokenization_utils_base import (\n    _LazyConfigMapping, AddedToken, TruncationStrategy, PaddingStrategy,\n    BatchEncoding, SpecialTokensMixin)\n\nlogger = logging.getLogger(__name__)\n\nMAX_LENGTH = 256\n\nDEFAULT_DebertaV2_NAME = \"projects/imagen/cache/deberta-v-xxlarge\"\n\n# Slow tokenizers used to be saved in three separated files\nSPECIAL_TOKENS_MAP_FILE = \"special_tokens_map.json\"\nADDED_TOKENS_FILE = \"added_tokens.json\"\nTOKENIZER_CONFIG_FILE = \"tokenizer_config.json\"\n\n# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file\nFULL_TOKENIZER_FILE = \"tokenizer.json\"\n_re_tokenizer_file = re.compile(r\"tokenizer\\.(.*)\\.json\")\n\nCONFIG_NAME = \"config.json\"\n\n\ndef 
get_debertav2_tokenizer(name):\n    tokenizer = DebertaV2Tokenizer.from_pretrained(name)\n    return tokenizer\n\n\ndef debertav2_tokenize(texts, tokenizer):\n    encoded = tokenizer.batch_encode_plus(\n        texts,\n        return_tensors=\"paddle\",\n        padding='longest',\n        max_length=MAX_LENGTH,\n        truncation=True)\n\n    input_ids = encoded.input_ids\n    attn_mask = encoded.attention_mask\n    return input_ids, attn_mask\n\n\nPRETRAINED_VOCAB_FILES_MAP = {\n    \"vocab_file\": {\n        \"microsoft/deberta-v2-xlarge\":\n        \"https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model\",\n        \"microsoft/deberta-v2-xxlarge\":\n        \"https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model\",\n        \"microsoft/deberta-v2-xlarge-mnli\":\n        (\"https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model\"\n         ),\n        \"microsoft/deberta-v2-xxlarge-mnli\":\n        (\"https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model\"\n         ),\n    }\n}\n\nPRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {\n    \"microsoft/deberta-v2-xlarge\": 512,\n    \"microsoft/deberta-v2-xxlarge\": 512,\n    \"microsoft/deberta-v2-xlarge-mnli\": 512,\n    \"microsoft/deberta-v2-xxlarge-mnli\": 512,\n}\n\nPRETRAINED_INIT_CONFIGURATION = {\n    \"microsoft/deberta-v2-xlarge\": {\n        \"do_lower_case\": False\n    },\n    \"microsoft/deberta-v2-xxlarge\": {\n        \"do_lower_case\": False\n    },\n    \"microsoft/deberta-v2-xlarge-mnli\": {\n        \"do_lower_case\": False\n    },\n    \"microsoft/deberta-v2-xxlarge-mnli\": {\n        \"do_lower_case\": False\n    },\n}\n\nVOCAB_FILES_NAMES = {\"vocab_file\": \"spm.model\"}\n\n\nclass DebertaV2Tokenizer(SpecialTokensMixin):\n    r\"\"\"\n    Constructs a DeBERTa-v2 tokenizer. 
Based on [SentencePiece](https://github.com/google/sentencepiece).\n\n    Args:\n        vocab_file (`str`):\n            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that\n            contains the vocabulary necessary to instantiate a tokenizer.\n        do_lower_case (`bool`, *optional*, defaults to `False`):\n            Whether or not to lowercase the input when tokenizing.\n        bos_token (`string`, *optional*, defaults to `\"[CLS]\"`):\n            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.\n            When building a sequence using special tokens, this is not the token that is used for the beginning of\n            sequence. The token used is the `cls_token`.\n        eos_token (`string`, *optional*, defaults to `\"[SEP]\"`):\n            The end of sequence token. When building a sequence using special tokens, this is not the token that is\n            used for the end of sequence. The token used is the `sep_token`.\n        unk_token (`str`, *optional*, defaults to `\"[UNK]\"`):\n            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this\n            token instead.\n        sep_token (`str`, *optional*, defaults to `\"[SEP]\"`):\n            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for\n            sequence classification or for a text and a question for question answering. 
It is also used as the last\n            token of a sequence built with special tokens.\n        pad_token (`str`, *optional*, defaults to `\"[PAD]\"`):\n            The token used for padding, for example when batching sequences of different lengths.\n        cls_token (`str`, *optional*, defaults to `\"[CLS]\"`):\n            The classifier token which is used when doing sequence classification (classification of the whole sequence\n            instead of per-token classification). It is the first token of the sequence when built with special tokens.\n        mask_token (`str`, *optional*, defaults to `\"[MASK]\"`):\n            The token used for masking values. This is the token used when training this model with masked language\n            modeling. This is the token which the model will try to predict.\n        sp_model_kwargs (`dict`, *optional*):\n            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for\n            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,\n            to set:\n\n            - `enable_sampling`: Enable subword regularization.\n            - `nbest_size`: Sampling parameters for unigram. 
Invalid for BPE-Dropout.\n\n              - `nbest_size = {0,1}`: No sampling is performed.\n              - `nbest_size > 1`: samples from the nbest_size results.\n              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)\n                using forward-filtering-and-backward-sampling algorithm.\n\n            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for\n              BPE-dropout.\n    \"\"\"\n\n    vocab_files_names = VOCAB_FILES_NAMES\n    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP\n    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION\n    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES\n    model_input_names = [\"input_ids\", \"token_type_ids\", \"attention_mask\"]\n    padding_side = \"right\"\n    truncation_side = \"right\"\n    slow_tokenizer_class = None\n\n    def __init__(self,\n                 vocab_file,\n                 do_lower_case=False,\n                 split_by_punct=False,\n                 bos_token=\"[CLS]\",\n                 eos_token=\"[SEP]\",\n                 unk_token=\"[UNK]\",\n                 sep_token=\"[SEP]\",\n                 pad_token=\"[PAD]\",\n                 cls_token=\"[CLS]\",\n                 mask_token=\"[MASK]\",\n                 sp_model_kwargs=None,\n                 **kwargs):\n        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs\n        self.added_tokens_encoder: Dict[str, int] = {}\n        self.added_tokens_decoder: Dict[int, str] = {}\n\n        super().__init__(\n            do_lower_case=do_lower_case,\n            bos_token=bos_token,\n            eos_token=eos_token,\n            unk_token=unk_token,\n            sep_token=sep_token,\n            pad_token=pad_token,\n            cls_token=cls_token,\n            mask_token=mask_token,\n            split_by_punct=split_by_punct,\n            
sp_model_kwargs=self.sp_model_kwargs,\n            **kwargs, )\n\n        if not os.path.isfile(vocab_file):\n            raise ValueError(\n                f\"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained\"\n                \" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`\"\n            )\n        self.do_lower_case = do_lower_case\n        self.split_by_punct = split_by_punct\n        self.vocab_file = vocab_file\n        self._tokenizer = SPMTokenizer(\n            vocab_file,\n            split_by_punct=split_by_punct,\n            sp_model_kwargs=self.sp_model_kwargs)\n\n    def __len__(self):\n        \"\"\"\n        Size of the full vocabulary with the added tokens.\n        \"\"\"\n        return self.vocab_size + len(self.added_tokens_encoder)\n\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs,\n                        **kwargs):\n        cache_dir = kwargs.pop(\"cache_dir\", None)\n        force_download = kwargs.pop(\"force_download\", False)\n        resume_download = kwargs.pop(\"resume_download\", False)\n        proxies = kwargs.pop(\"proxies\", None)\n        local_files_only = kwargs.pop(\"local_files_only\", False)\n        use_auth_token = kwargs.pop(\"use_auth_token\", None)\n        revision = kwargs.pop(\"revision\", None)\n        subfolder = kwargs.pop(\"subfolder\", None)\n        from_pipeline = kwargs.pop(\"_from_pipeline\", None)\n        from_auto_class = kwargs.pop(\"_from_auto\", False)\n        commit_hash = kwargs.pop(\"_commit_hash\", None)\n        _raise_exceptions_for_missing_entries = False\n\n        user_agent = {\n            \"file_type\": \"tokenizer\",\n            \"from_auto_class\": from_auto_class,\n            \"is_fast\": \"Fast\" in cls.__name__\n        }\n        if from_pipeline is not None:\n            user_agent[\"using_pipeline\"] = from_pipeline\n\n        
pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n        vocab_files = {}\n        init_configuration = {}\n\n        is_local = os.path.isdir(pretrained_model_name_or_path)\n        single_file_id = None\n        if os.path.isfile(\n                pretrained_model_name_or_path\n        ):  # or is_remote_url(pretrained_model_name_or_path):\n            if len(cls.vocab_files_names) > 1:\n                raise ValueError(\n                    f\"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not \"\n                    \"supported for this tokenizer. Use a model identifier or the path to a directory instead.\"\n                )\n            warnings.warn(\n                f\"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and \"\n                \"won't be possible anymore in v5. Use a model identifier or the path to a directory instead.\",\n                FutureWarning, )\n            file_id = list(cls.vocab_files_names.keys())[0]\n\n            vocab_files[file_id] = pretrained_model_name_or_path\n            single_file_id = file_id\n        else:\n            # At this point pretrained_model_name_or_path is either a directory or a model identifier name\n            additional_files_names = {\n                \"added_tokens_file\": ADDED_TOKENS_FILE,\n                \"special_tokens_map_file\": SPECIAL_TOKENS_MAP_FILE,\n                \"tokenizer_config_file\": TOKENIZER_CONFIG_FILE,\n            }\n            vocab_files = {\n                ** cls.vocab_files_names, ** additional_files_names\n            }\n\n            if \"tokenizer_file\" in vocab_files:\n                # Try to get the tokenizer config to see if there are versioned tokenizer files.\n                fast_tokenizer_file = FULL_TOKENIZER_FILE\n                resolved_config_file = cached_file(\n                    pretrained_model_name_or_path,\n                    
TOKENIZER_CONFIG_FILE,\n                    cache_dir=cache_dir,\n                    force_download=force_download,\n                    resume_download=resume_download,\n                    proxies=proxies,\n                    use_auth_token=use_auth_token,\n                    revision=revision,\n                    local_files_only=local_files_only,\n                    subfolder=subfolder,\n                    user_agent=user_agent,\n                    _raise_exceptions_for_missing_entries=False,\n                    _raise_exceptions_for_connection_errors=False,\n                    _commit_hash=commit_hash, )\n                commit_hash = extract_commit_hash(resolved_config_file,\n                                                  commit_hash)\n                if resolved_config_file is not None:\n                    with open(\n                            resolved_config_file, encoding=\"utf-8\") as reader:\n                        tokenizer_config = json.load(reader)\n                        if \"fast_tokenizer_files\" in tokenizer_config:\n                            fast_tokenizer_file = get_fast_tokenizer_file(\n                                tokenizer_config[\"fast_tokenizer_files\"])\n                vocab_files[\"tokenizer_file\"] = fast_tokenizer_file\n\n        # Get files from url, cache, or disk depending on the case\n        resolved_vocab_files = {}\n        unresolved_files = []\n        for file_id, file_path in vocab_files.items():\n            if file_path is None:\n                resolved_vocab_files[file_id] = None\n            elif single_file_id == file_id:\n                if os.path.isfile(file_path):\n                    resolved_vocab_files[file_id] = file_path\n                elif is_remote_url(file_path):\n                    resolved_vocab_files[file_id] = download_url(\n                        file_path, proxies=proxies)\n            else:\n                if subfolder is None:\n                    subfolder = \"\"\n        
        path_or_repo_id = str(pretrained_model_name_or_path)\n                if os.path.isdir(path_or_repo_id):\n                    resolved_file = os.path.join(\n                        os.path.join(path_or_repo_id, subfolder), file_path)\n                    if not os.path.isfile(resolved_file):\n                        if _raise_exceptions_for_missing_entries:\n                            raise EnvironmentError(\n                                f\"{path_or_repo_id} does not appear to have a file named {full_filename}. Checkout \"\n                                f\"'https://huggingface.co/{path_or_repo_id}/{revision}' for available files.\"\n                            )\n                        else:\n                            resolved_file = None\n                    resolved_vocab_files[file_id] = resolved_file\n\n                else:\n                    resolved_vocab_files[file_id] = cached_path(\n                        file_path,\n                        cache_dir=cache_dir, )\n\n        if len(unresolved_files) > 0:\n            logger.info(\n                f\"Can't load following files from cache: {unresolved_files} and cannot check if these \"\n                \"files are necessary for the tokenizer to operate.\")\n\n        if all(full_file_name is None\n               for full_file_name in resolved_vocab_files.values()):\n            raise EnvironmentError(\n                f\"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from \"\n                \"'https://huggingface.co/models', make sure you don't have a local directory with the same name. 
\"\n                f\"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory \"\n                f\"containing all relevant files for a {cls.__name__} tokenizer.\"\n            )\n\n        for file_id, file_path in vocab_files.items():\n            if file_id not in resolved_vocab_files:\n                continue\n\n            #if is_local:\n            #    logger.info(f\"loading file {file_path}\")\n            #else:\n            #    logger.info(f\"loading file {file_path} from cache at {resolved_vocab_files[file_id]}\")\n\n        return cls._from_pretrained(\n            resolved_vocab_files,\n            pretrained_model_name_or_path,\n            init_configuration,\n            *init_inputs,\n            use_auth_token=use_auth_token,\n            cache_dir=cache_dir,\n            local_files_only=local_files_only,\n            _commit_hash=commit_hash,\n            **kwargs, )\n\n    @classmethod\n    def _from_pretrained(cls,\n                         resolved_vocab_files,\n                         pretrained_model_name_or_path,\n                         init_configuration,\n                         *init_inputs,\n                         use_auth_token=None,\n                         cache_dir=None,\n                         local_files_only=False,\n                         _commit_hash=None,\n                         **kwargs):\n        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json\n        # file or if `from_slow` is set to True.\n        from_slow = kwargs.get(\"from_slow\", False)\n        has_tokenizer_file = resolved_vocab_files.get(\"tokenizer_file\",\n                                                      None) is not None\n        if from_slow:\n            slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(\n                copy.deepcopy(resolved_vocab_files),\n                pretrained_model_name_or_path,\n                
copy.deepcopy(init_configuration),\n                *init_inputs,\n                use_auth_token=use_auth_token,\n                cache_dir=cache_dir,\n                local_files_only=local_files_only,\n                _commit_hash=_commit_hash,\n                **(copy.deepcopy(kwargs)), )\n        else:\n            slow_tokenizer = None\n\n        # Prepare tokenizer initialization kwargs\n        # Did we saved some inputs and kwargs to reload ?\n        tokenizer_config_file = resolved_vocab_files.pop(\n            \"tokenizer_config_file\", None)\n        if tokenizer_config_file is not None:\n            with open(\n                    tokenizer_config_file,\n                    encoding=\"utf-8\") as tokenizer_config_handle:\n                init_kwargs = json.load(tokenizer_config_handle)\n            # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.\n            config_tokenizer_class = init_kwargs.get(\"tokenizer_class\")\n            init_kwargs.pop(\"tokenizer_class\", None)\n            init_kwargs.pop(\"auto_map\", None)\n            saved_init_inputs = init_kwargs.pop(\"init_inputs\", ())\n            if not init_inputs:\n                init_inputs = saved_init_inputs\n        else:\n            config_tokenizer_class = None\n            init_kwargs = init_configuration\n\n        if config_tokenizer_class is None:\n            try:\n                config_dict = resolved_vocab_files.pop(\"config_file\",\n                                                       CONFIG_NAME)\n                config_dict = os.path.join(pretrained_model_name_or_path,\n                                           config_dict)\n                config_dict = cls._dict_from_json_file(config_dict)\n                config_tokenizer_class = config_dict[\n                    \"tokenizer_class\"] if \"tokenizer_class\" in config_dict else None\n            except (OSError, ValueError, KeyError):\n                # skip if an 
error occurred.\n                config_dict = None\n            if config_tokenizer_class is None:\n                # Third attempt. If we have not yet found the original type of the tokenizer,\n                # we are loading we see if we can infer it from the type of the configuration file\n                from ppfleetx.data.tokenizers.tokenization_utils_base import TOKENIZER_MAPPING_NAMES  # tests_ignore\n\n                model_type = config_dict[\n                    \"model_type\"] if \"model_type\" in config_dict else None\n                if model_type is None:\n                    # Fallback: use pattern matching on the string.\n                    for pattern in TOKENIZER_MAPPING_NAMES.keys():\n                        if pattern in str(pretrained_model_name_or_path):\n                            model_type = pattern\n                            break\n\n                if model_type is not None:\n                    config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get(\n                        model_type, (None, None))\n                    if config_tokenizer_class is None:\n                        config_tokenizer_class = config_tokenizer_class_fast\n\n        if config_tokenizer_class is not None:\n            if cls.__name__.replace(\n                    \"Fast\", \"\") != config_tokenizer_class.replace(\"Fast\", \"\"):\n                logger.warning(\n                    \"The tokenizer class you load from this checkpoint is not the same type as the class this\"\n                    \" function is called from. It may result in unexpected tokenization. \\nThe tokenizer class you\"\n                    f\" load from this checkpoint is '{config_tokenizer_class}'. 
\\nThe class this function is called\"\n                    f\" from is '{cls.__name__}'.\")\n\n        # Update with newly provided kwargs\n        init_kwargs.update(kwargs)\n\n        # Convert AddedTokens serialized as dict to class instances\n        def convert_added_tokens(obj: Union[AddedToken, Any]):\n            if isinstance(obj, dict) and \"__type\" in obj and obj[\n                    \"__type\"] == \"AddedToken\":\n                obj.pop(\"__type\")\n                return AddedToken(**obj)\n            elif isinstance(obj, (list, tuple)):\n                return list(convert_added_tokens(o) for o in obj)\n            elif isinstance(obj, dict):\n                return {k: convert_added_tokens(v) for k, v in obj.items()}\n            return obj\n\n        init_kwargs = convert_added_tokens(init_kwargs)\n\n        # Set max length if needed\n        if pretrained_model_name_or_path in cls.max_model_input_sizes:\n            # if we're using a pretrained model, ensure the tokenizer\n            # wont index sequences longer than the number of positional embeddings\n\n            model_max_length = cls.max_model_input_sizes[\n                pretrained_model_name_or_path]\n            if model_max_length is not None and isinstance(model_max_length,\n                                                           (int, float)):\n\n                model_max_length = min(\n                    init_kwargs.get(\"model_max_length\", int(1e30)),\n                    model_max_length)\n                # TODO(PVP) - uncomment following line in Transformers v5\n                # init_kwargs[\"model_max_length\"] = model_max_length\n                # TODO(PVP) - remove in Transformers v5\n                # ---\n                init_kwargs[\n                    \"model_max_length\"] = cls._eventually_correct_t5_max_length(\n                        pretrained_model_name_or_path, model_max_length,\n                        init_kwargs.get(\"model_max_length\"))\n           
     # ---\n\n            # Merge resolved_vocab_files arguments in init_kwargs.\n        added_tokens_file = resolved_vocab_files.pop(\"added_tokens_file\", None)\n        for args_name, file_path in resolved_vocab_files.items():\n            if args_name not in init_kwargs:\n                init_kwargs[args_name] = file_path\n\n        if slow_tokenizer is not None:\n            init_kwargs[\"__slow_tokenizer\"] = slow_tokenizer\n\n        init_kwargs[\"name_or_path\"] = pretrained_model_name_or_path\n\n        # Instantiate tokenizer.\n        try:\n            tokenizer = cls(*init_inputs, **init_kwargs)\n        except OSError:\n            raise OSError(\n                \"Unable to load vocabulary from file. \"\n                \"Please check that the provided vocabulary is accessible and not corrupted.\"\n            )\n\n        # Save inputs and kwargs for saving and re-loading with ``save_pretrained``\n        # Removed: Now done at the base class level\n        # tokenizer.init_inputs = init_inputs\n        # tokenizer.init_kwargs = init_kwargs\n\n        # If there is a complementary special token map, load it\n        special_tokens_map_file = resolved_vocab_files.pop(\n            \"special_tokens_map_file\", None)\n        if special_tokens_map_file is not None:\n            with open(\n                    special_tokens_map_file,\n                    encoding=\"utf-8\") as special_tokens_map_handle:\n                special_tokens_map = json.load(special_tokens_map_handle)\n            for key, value in special_tokens_map.items():\n                if key in kwargs and kwargs[key]:\n                    # This value has already been redefined by the kwargs\n                    # We keep this new value and ignore the one stored in the special_tokens_map_file\n\n                    continue\n\n                if isinstance(value, dict):\n                    value = AddedToken(**value)\n                elif isinstance(value, list):\n                    
value = [\n                        AddedToken(**token)\n                        if isinstance(token, dict) else token\n                        for token in value\n                    ]\n                setattr(tokenizer, key, value)\n\n        # Add supplementary tokens.\n        special_tokens = tokenizer.all_special_tokens\n        if added_tokens_file is not None:\n            with open(\n                    added_tokens_file,\n                    encoding=\"utf-8\") as added_tokens_handle:\n                added_tok_encoder = json.load(added_tokens_handle)\n\n            # Sort added tokens by index\n            added_tok_encoder_sorted = list(\n                sorted(\n                    added_tok_encoder.items(), key=lambda x: x[1]))\n\n            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for\n            # individual tokens would repeatedly rebuild a trie, which can be slow.\n            is_last_special = None\n            tokens = []\n\n            for token, index in added_tok_encoder_sorted:\n                current_index = len(tokenizer) + len(tokens)\n                if has_tokenizer_file and index != current_index and tokenizer.convert_tokens_to_ids(\n                        token) != index:\n                    # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the\n                    # index is the current length of the tokenizer (not in vocabulary)\n                    raise ValueError(\n                        f\"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found \"\n                        f\"{index}.\")\n                elif not has_tokenizer_file and index != current_index:\n                    # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the\n                    # current length of the tokenizer.\n                    raise ValueError(\n                        
f\"Non-consecutive added token '{token}' found. \"\n                        f\"Should have index {current_index} but has index {index} in saved vocabulary.\"\n                    )\n\n                is_special = bool(token in special_tokens)\n                if is_last_special is None or is_last_special == is_special:\n                    tokens.append(token)\n                else:\n                    tokenizer.add_tokens(\n                        tokens, special_tokens=is_last_special)\n                    tokens = [token]\n                is_last_special = is_special\n\n            if tokens:\n                tokenizer.add_tokens(tokens, special_tokens=is_last_special)\n\n        # Check all our special tokens are registered as \"no split\" token (we don't cut them) and are in the vocab\n        added_tokens = tokenizer.sanitize_special_tokens()\n        #if added_tokens:\n        #    logger.warning_advice(\n        #        \"Special tokens have been added in the vocabulary, make sure the associated word embeddings are\"\n        #        \" fine-tuned or trained.\"\n        #    )\n\n        return tokenizer\n\n    @property\n    def vocab_size(self):\n        return len(self.vocab)\n\n    @property\n    def vocab(self):\n        return self._tokenizer.vocab\n\n    def get_vocab(self):\n        vocab = self.vocab.copy()\n        vocab.update(self.get_added_vocab())\n        return vocab\n\n    @classmethod\n    def _dict_from_json_file(cls, json_file):\n        with open(json_file, \"r\", encoding=\"utf-8\") as reader:\n            text = reader.read()\n        return json.loads(text)\n\n    def _tokenize(self, text: str) -> List[str]:\n        \"\"\"Take as input a string and return a list of strings (tokens) for words/sub-words\"\"\"\n        if self.do_lower_case:\n            text = text.lower()\n        return self._tokenizer.tokenize(text)\n\n    def _convert_token_to_id(self, token):\n        \"\"\"Converts a token (str) in an id using the 
vocab.\"\"\"\n        return self._tokenizer.spm.PieceToId(token)\n\n    def _convert_id_to_token(self, index):\n        \"\"\"Converts an index (integer) in a token (str) using the vocab.\"\"\"\n        return self._tokenizer.spm.IdToPiece(\n            index) if index < self.vocab_size else self.unk_token\n\n    def convert_tokens_to_string(self, tokens):\n        \"\"\"Converts a sequence of tokens (string) in a single string.\"\"\"\n        return self._tokenizer.decode(tokens)\n\n    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):\n        \"\"\"\n        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and\n        adding special tokens. A DeBERTa sequence has the following format:\n\n        - single sequence: [CLS] X [SEP]\n        - pair of sequences: [CLS] A [SEP] B [SEP]\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs to which the special tokens will be added.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n\n        Returns:\n            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.\n        \"\"\"\n\n        if token_ids_1 is None:\n            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]\n        cls = [self.cls_token_id]\n        sep = [self.sep_token_id]\n        return cls + token_ids_0 + sep + token_ids_1 + sep\n\n    def get_special_tokens_mask(self,\n                                token_ids_0,\n                                token_ids_1=None,\n                                already_has_special_tokens=False):\n        \"\"\"\n        Retrieves sequence ids from a token list that has no special tokens added. 
This method is called when adding\n        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n            already_has_special_tokens (`bool`, *optional*, defaults to `False`):\n                Whether or not the token list is already formatted with special tokens for the model.\n\n        Returns:\n            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.\n        \"\"\"\n\n        if already_has_special_tokens:\n            return super().get_special_tokens_mask(\n                token_ids_0=token_ids_0,\n                token_ids_1=token_ids_1,\n                already_has_special_tokens=True)\n\n        if token_ids_1 is not None:\n            return [1] + ([0] * len(token_ids_0)) + [1] + (\n                [0] * len(token_ids_1)) + [1]\n        return [1] + ([0] * len(token_ids_0)) + [1]\n\n    def create_token_type_ids_from_sequences(self,\n                                             token_ids_0,\n                                             token_ids_1=None):\n        \"\"\"\n        Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
A DeBERTa\n        sequence pair mask has the following format:\n\n        ```\n        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1\n        | first sequence    | second sequence |\n        ```\n\n        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n\n        Returns:\n            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).\n        \"\"\"\n        sep = [self.sep_token_id]\n        cls = [self.cls_token_id]\n        if token_ids_1 is None:\n            return len(cls + token_ids_0 + sep) * [0]\n        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 +\n                                                        sep) * [1]\n\n    def prepare_for_tokenization(self,\n                                 text,\n                                 is_split_into_words=False,\n                                 **kwargs):\n        add_prefix_space = kwargs.pop(\"add_prefix_space\", False)\n        if is_split_into_words or add_prefix_space:\n            text = \" \" + text\n        return (text, kwargs)\n\n    def save_vocabulary(self,\n                        save_directory: str,\n                        filename_prefix: Optional[str]=None) -> Tuple[str]:\n        return self._tokenizer.save_pretrained(\n            save_directory, filename_prefix=filename_prefix)\n\n    def _eventual_warn_about_too_long_sequence(self,\n                                               ids,\n                                               max_length,\n                                               verbose: bool):\n        \"\"\"\n        Depending on the input and internal state we might trigger a warning about a sequence that is too long for its\n        corresponding model\n\n        Args:\n  
          ids (`List[str]`): The ids produced by the tokenization
            max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set)
            verbose (`bool`): Whether or not to print more information and warnings.

        """
        if max_length is None and len(ids) > self.model_max_length and verbose:
            # Warn only once per process; the flag below suppresses repeats.
            if not self.deprecation_warnings.get(
                    "sequence-length-is-longer-than-the-specified-maximum",
                    False):
                logger.warning(
                    "Token indices sequence length is longer than the specified maximum sequence length "
                    f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
                    "will result in indexing errors")
            self.deprecation_warnings[
                "sequence-length-is-longer-than-the-specified-maximum"] = True

    def _get_padding_truncation_strategies(self,
                                           padding=False,
                                           truncation=False,
                                           max_length=None,
                                           pad_to_multiple_of=None,
                                           verbose=True,
                                           **kwargs):
        """
        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
        and pad_to_max_length) and behaviors.
        """
        # Legacy arguments from the pre-`padding`/`truncation` API; popped here
        # so they are not forwarded in the returned kwargs.
        old_truncation_strategy = kwargs.pop("truncation_strategy",
                                             "do_not_truncate")
        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

        # Backward compatibility for previous behavior, maybe we should deprecate it:
        # If you only set max_length, it activates truncation for max_length
        if max_length is not None and padding is False and truncation is False:
            if verbose:
                if not self.deprecation_warnings.get(
                        "Truncation-not-explicitly-activated", False):
                    logger.warning(
                        "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
                        " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
                        " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
                        " tokenizer you can select this strategy more precisely by providing a specific strategy to"
                        " `truncation`.")
                self.deprecation_warnings[
                    "Truncation-not-explicitly-activated"] = True
            truncation = "longest_first"

        # Get padding strategy
        # Precedence: the new `padding` argument wins; the legacy
        # `pad_to_max_length` flag is only honored when `padding` is False.
        if padding is False and old_pad_to_max_length:
            if verbose:
                warnings.warn(
                    "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
                    "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
                    "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
                    "maximal input size of the model (e.g. 512 for Bert).",
                    FutureWarning, )
            if max_length is None:
                padding_strategy = PaddingStrategy.LONGEST
            else:
                padding_strategy = PaddingStrategy.MAX_LENGTH
        elif padding is not False:
            if padding is True:
                if verbose:
                    if max_length is not None and (
                            truncation is False or
                            truncation == "do_not_truncate"):
                        warnings.warn(
                            "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
                            "To pad to max length, use `padding='max_length'`.")
                    if old_pad_to_max_length is not False:
                        warnings.warn(
                            "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`."
                        )
                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
            elif not isinstance(padding, PaddingStrategy):
                # Strings such as "max_length" are coerced to the enum here.
                padding_strategy = PaddingStrategy(padding)
            elif isinstance(padding, PaddingStrategy):
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Get truncation strategy
        if truncation is False and old_truncation_strategy != "do_not_truncate":
            if verbose:
                warnings.warn(
                    "The `truncation_strategy` argument is deprecated and will be removed in a future version, use"
                    " `truncation=True` to truncate examples to a max length. You can give a specific length with"
                    " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input"
                    " size of the model (e.g. 512 for Bert).  If you have pairs of inputs, you can give a specific"
                    " truncation strategy selected among `truncation='only_first'` (will only truncate the first"
                    " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the"
                    " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence"
                    " in the pairs).",
                    FutureWarning, )
            truncation_strategy = TruncationStrategy(old_truncation_strategy)
        elif truncation is not False:
            if truncation is True:
                truncation_strategy = (
                    TruncationStrategy.LONGEST_FIRST
                )  # Default to truncate the longest sequences in pairs of inputs
            elif not isinstance(truncation, TruncationStrategy):
                truncation_strategy = TruncationStrategy(truncation)
            elif isinstance(truncation, TruncationStrategy):
                truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

        # Set max length if needed
        # model_max_length > LARGE_INTEGER means "no predefined maximum", in
        # which case padding/truncation to max_length silently deactivates.
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get(
                                "Asking-to-pad-to-max_length", False):
                            logger.warning(
                                "Asking to pad to max_length but no maximum length is provided and the model has no"
                                " predefined maximum length. Default to no padding."
                            )
                        self.deprecation_warnings[
                            "Asking-to-pad-to-max_length"] = True
                    padding_strategy = PaddingStrategy.DO_NOT_PAD
                else:
                    max_length = self.model_max_length

            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get(
                                "Asking-to-truncate-to-max_length", False):
                            logger.warning(
                                "Asking to truncate to max_length but no maximum length is provided and the model has"
                                " no predefined maximum length. Default to no truncation."
                            )
                        self.deprecation_warnings[
                            "Asking-to-truncate-to-max_length"] = True
                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
                else:
                    max_length = self.model_max_length

        # Test if we have a padding token
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (
                not self.pad_token or self.pad_token_id < 0):
            raise ValueError(
                "Asking to pad but the tokenizer does not have a padding token. "
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
            )

        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
        if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and
                padding_strategy != PaddingStrategy.DO_NOT_PAD and
                pad_to_multiple_of is not None and max_length is not None and
            (max_length % pad_to_multiple_of != 0)):
            raise ValueError(
                "Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
            )

        # Returned kwargs have the legacy arguments stripped out.
        return padding_strategy, truncation_strategy, max_length, kwargs

    def _pad(self,
             encoded_inputs,
             max_length=None,
             padding_strategy=PaddingStrategy.DO_NOT_PAD,
             pad_to_multiple_of=None,
             return_attention_mask=None):
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': 
pads on the left of the sequences\n                    - 'right': pads on the right of the sequences\n            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.\n                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability\n                >= 7.5 (Volta).\n            return_attention_mask:\n                (optional) Set to False to avoid returning attention mask (default: set to model specifics)\n        \"\"\"\n        # Load from model defaults\n        if return_attention_mask is None:\n            return_attention_mask = \"attention_mask\" in self.model_input_names\n\n        required_input = encoded_inputs[self.model_input_names[0]]\n\n        if padding_strategy == PaddingStrategy.LONGEST:\n            max_length = len(required_input)\n\n        if max_length is not None and pad_to_multiple_of is not None and (\n                max_length % pad_to_multiple_of != 0):\n            max_length = (\n                (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of\n\n        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(\n            required_input) != max_length\n\n        # Initialize attention mask if not present.\n        if return_attention_mask and \"attention_mask\" not in encoded_inputs:\n            encoded_inputs[\"attention_mask\"] = [1] * len(required_input)\n\n        if needs_to_be_padded:\n            difference = max_length - len(required_input)\n\n            if self.padding_side == \"right\":\n                if return_attention_mask:\n\n                    encoded_inputs[\"attention_mask\"] = encoded_inputs[\n                        \"attention_mask\"] + [0] * difference\n                if \"token_type_ids\" in encoded_inputs:\n                    encoded_inputs[\"token_type_ids\"] = (\n                        encoded_inputs[\"token_type_ids\"] +\n                        
[self.pad_token_type_id] * difference)\n                if \"special_tokens_mask\" in encoded_inputs:\n                    encoded_inputs[\"special_tokens_mask\"] = encoded_inputs[\n                        \"special_tokens_mask\"] + [1] * difference\n                encoded_inputs[self.model_input_names[\n                    0]] = required_input + [self.pad_token_id] * difference\n            elif self.padding_side == \"left\":\n                if return_attention_mask:\n                    encoded_inputs[\"attention_mask\"] = [\n                        0\n                    ] * difference + encoded_inputs[\"attention_mask\"]\n                if \"token_type_ids\" in encoded_inputs:\n                    encoded_inputs[\"token_type_ids\"] = [\n                        self.pad_token_type_id\n                    ] * difference + encoded_inputs[\"token_type_ids\"]\n                if \"special_tokens_mask\" in encoded_inputs:\n                    encoded_inputs[\"special_tokens_mask\"] = [\n                        1\n                    ] * difference + encoded_inputs[\"special_tokens_mask\"]\n                encoded_inputs[self.model_input_names[\n                    0]] = [self.pad_token_id] * difference + required_input\n            else:\n                raise ValueError(\"Invalid padding strategy:\" + str(\n                    self.padding_side))\n\n        return encoded_inputs\n\n    def pad(\n            self,\n            encoded_inputs,\n            padding=True,\n            max_length=None,\n            pad_to_multiple_of=None,\n            return_attention_mask=None,\n            return_tensors=None,\n            verbose=True, ):\n        \"\"\"\n        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length\n        in the batch.\n\n        Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,\n        `self.pad_token_id` and 
`self.pad_token_type_id`)\n\n        <Tip>\n\n        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the\n        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of\n        PyTorch tensors, you will lose the specific device of your tensors however.\n\n        </Tip>\n\n        Args:\n            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):\n                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of\n                tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,\n                List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader\n                collate function.\n\n                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see\n                the note above for the return type.\n            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):\n                 Select a strategy to pad the returned sequences (according to the model's padding side and padding\n                 index) among:\n\n                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single\n                  sequence if provided).\n                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum\n                  acceptable input length for the model if that argument is not provided.\n                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different\n                  lengths).\n            max_length (`int`, *optional*):\n                Maximum length of the returned list and optionally 
padding length (see above).\n            pad_to_multiple_of (`int`, *optional*):\n                If set will pad the sequence to a multiple of the provided value.\n\n                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability\n                >= 7.5 (Volta).\n            return_attention_mask (`bool`, *optional*):\n                Whether to return the attention mask. If left to the default, will return the attention mask according\n                to the specific tokenizer's default, defined by the `return_outputs` attribute.\n\n                [What are attention masks?](../glossary#attention-mask)\n            return_tensors (`str` or [`~utils.TensorType`], *optional*):\n                If set, will return tensors instead of list of python integers. Acceptable values are:\n\n                - `'tf'`: Return TensorFlow `tf.constant` objects.\n                - `'pt'`: Return PyTorch `torch.Tensor` objects.\n                - `'np'`: Return Numpy `np.ndarray` objects.\n            verbose (`bool`, *optional*, defaults to `True`):\n                Whether or not to print more information and warnings.\n        \"\"\"\n        # If we have a list of dicts, let's convert it in a dict of lists\n        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader\n        if isinstance(encoded_inputs, (list, tuple)) and isinstance(\n                encoded_inputs[0], Mapping):\n            encoded_inputs = {\n                key: [example[key] for example in encoded_inputs]\n                for key in encoded_inputs[0].keys()\n            }\n\n        # The model's main input name, usually `input_ids`, has be passed for padding\n        if self.model_input_names[0] not in encoded_inputs:\n            raise ValueError(\n                \"You should supply an encoding or a list of encodings to this method \"\n                f\"that includes {self.model_input_names[0]}, but you provided 
{list(encoded_inputs.keys())}\"\n            )\n\n        required_input = encoded_inputs[self.model_input_names[0]]\n\n        if not required_input:\n            if return_attention_mask:\n                encoded_inputs[\"attention_mask\"] = []\n            return encoded_inputs\n\n        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects\n        # and rebuild them afterwards if no return_tensors is specified\n        # Note that we lose the specific device the tensor may be on for PyTorch\n\n        first_element = required_input[0]\n        if isinstance(first_element, (list, tuple)):\n            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.\n            for item in required_input:\n                if len(item) != 0:\n                    first_element = item[0]\n                    break\n        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.\n        if not isinstance(first_element, (int, list, tuple)):\n            if is_tf_available() and _is_tensorflow(first_element):\n                return_tensors = \"tf\" if return_tensors is None else return_tensors\n            elif is_torch_available() and _is_torch(first_element):\n                return_tensors = \"pt\" if return_tensors is None else return_tensors\n            elif isinstance(first_element, np.ndarray):\n                return_tensors = \"np\" if return_tensors is None else return_tensors\n            else:\n                raise ValueError(\n                    f\"type of {first_element} unknown: {type(first_element)}. 
\"\n                    \"Should be one of a python, numpy, pytorch or tensorflow object.\"\n                )\n\n            for key, value in encoded_inputs.items():\n                encoded_inputs[key] = to_py_obj(value)\n\n        # Convert padding_strategy in PaddingStrategy\n        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(\n            padding=padding, max_length=max_length, verbose=verbose)\n\n        required_input = encoded_inputs[self.model_input_names[0]]\n        if required_input and not isinstance(required_input[0], (list, tuple)):\n            encoded_inputs = self._pad(\n                encoded_inputs,\n                max_length=max_length,\n                padding_strategy=padding_strategy,\n                pad_to_multiple_of=pad_to_multiple_of,\n                return_attention_mask=return_attention_mask, )\n            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)\n\n        batch_size = len(required_input)\n        assert all(\n            len(v) == batch_size for v in encoded_inputs.values()\n        ), \"Some items in the output dictionary have a different batch size than others.\"\n\n        if padding_strategy == PaddingStrategy.LONGEST:\n            max_length = max(len(inputs) for inputs in required_input)\n            padding_strategy = PaddingStrategy.MAX_LENGTH\n\n        batch_outputs = {}\n        for i in range(batch_size):\n            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())\n            outputs = self._pad(\n                inputs,\n                max_length=max_length,\n                padding_strategy=padding_strategy,\n                pad_to_multiple_of=pad_to_multiple_of,\n                return_attention_mask=return_attention_mask, )\n\n            for key, value in outputs.items():\n                if key not in batch_outputs:\n                    batch_outputs[key] = []\n                batch_outputs[key].append(value)\n\n        return 
BatchEncoding(batch_outputs, tensor_type=return_tensors)\n\n    def create_token_type_ids_from_sequences(self,\n                                             token_ids_0,\n                                             token_ids_1=None):\n        \"\"\"\n        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make\n        use of token type ids, therefore a list of zeros is returned.\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n\n        Returns:\n            `List[int]` of zeros.\n        \"\"\"\n        eos = [self.eos_token_id]\n\n        if token_ids_1 is None:\n            return len(token_ids_0 + eos) * [0]\n        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]\n\n    def _add_eos_if_not_present(self, token_ids):\n        \"\"\"Do not add eos again if user already added it.\"\"\"\n        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:\n            warnings.warn(\n                f\"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated\"\n                \" eos tokens being added.\")\n            return token_ids\n        else:\n            return token_ids + [self.eos_token_id]\n\n    def truncate_sequences(self,\n                           ids,\n                           pair_ids=None,\n                           num_tokens_to_remove=0,\n                           truncation_strategy=\"longest_first\",\n                           stride=0):\n        \"\"\"\n        Truncates a sequence pair in-place following the strategy.\n\n        Args:\n            ids (`List[int]`):\n                Tokenized input ids of the first sequence. 
Can be obtained from a string by chaining the `tokenize` and\n                `convert_tokens_to_ids` methods.\n            pair_ids (`List[int]`, *optional*):\n                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`\n                and `convert_tokens_to_ids` methods.\n            num_tokens_to_remove (`int`, *optional*, defaults to 0):\n                Number of tokens to remove using the truncation strategy.\n            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):\n                The strategy to follow for truncation. Can be:\n\n                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the\n                  maximum acceptable input length for the model if that argument is not provided. This will truncate\n                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a\n                  batch of pairs) is provided.\n                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the\n                  maximum acceptable input length for the model if that argument is not provided. This will only\n                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.\n                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the\n                  maximum acceptable input length for the model if that argument is not provided. 
This will only\n                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.\n                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater\n                  than the model maximum admissible input size).\n            stride (`int`, *optional*, defaults to 0):\n                If set to a positive number, the overflowing tokens returned will contain some tokens from the main\n                sequence returned. The value of this argument defines the number of additional tokens.\n\n        Returns:\n            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of\n            overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair\n            of sequences (or a batch of pairs) is provided.\n        \"\"\"\n        if num_tokens_to_remove <= 0:\n            return ids, pair_ids, []\n\n        if not isinstance(truncation_strategy, TruncationStrategy):\n            truncation_strategy = TruncationStrategy(truncation_strategy)\n\n        overflowing_tokens = []\n        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (\n                truncation_strategy == TruncationStrategy.LONGEST_FIRST and\n                pair_ids is None):\n            if len(ids) > num_tokens_to_remove:\n                window_len = min(len(ids), stride + num_tokens_to_remove)\n                if self.truncation_side == \"left\":\n                    overflowing_tokens = ids[:window_len]\n                    ids = ids[num_tokens_to_remove:]\n                elif self.truncation_side == \"right\":\n                    overflowing_tokens = ids[-window_len:]\n                    ids = ids[:-num_tokens_to_remove]\n                else:\n                    raise ValueError(\n                        f\"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.\"\n  
                  )\n\n            else:\n                error_msg = (\n                    f\"We need to remove {num_tokens_to_remove} to truncate the input \"\n                    f\"but the first sequence has a length {len(ids)}. \")\n                if truncation_strategy == TruncationStrategy.ONLY_FIRST:\n                    error_msg = (\n                        error_msg +\n                        \"Please select another truncation strategy than \"\n                        f\"{truncation_strategy}, for instance 'longest_first' or 'only_second'.\"\n                    )\n                logger.error(error_msg)\n        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:\n            logger.warning(\n                \"Be aware, overflowing tokens are not returned for the setting you have chosen,\"\n                f\" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' \"\n                \"truncation strategy. So the returned list will always be empty even if some \"\n                \"tokens have been removed.\")\n            for _ in range(num_tokens_to_remove):\n                if pair_ids is None or len(ids) > len(pair_ids):\n                    if self.truncation_side == \"right\":\n                        ids = ids[:-1]\n                    elif self.truncation_side == \"left\":\n                        ids = ids[1:]\n                    else:\n                        raise ValueError(\"invalid truncation strategy:\" + str(\n                            self.truncation_side))\n                else:\n                    if self.truncation_side == \"right\":\n                        pair_ids = pair_ids[:-1]\n                    elif self.truncation_side == \"left\":\n                        pair_ids = pair_ids[1:]\n                    else:\n                        raise ValueError(\"invalid truncation strategy:\" + str(\n                            self.truncation_side))\n        elif truncation_strategy == 
TruncationStrategy.ONLY_SECOND and pair_ids is not None:\n            if len(pair_ids) > num_tokens_to_remove:\n                window_len = min(len(pair_ids), stride + num_tokens_to_remove)\n                if self.truncation_side == \"right\":\n                    overflowing_tokens = pair_ids[-window_len:]\n                    pair_ids = pair_ids[:-num_tokens_to_remove]\n                elif self.truncation_side == \"left\":\n                    overflowing_tokens = pair_ids[:window_len]\n                    pair_ids = pair_ids[num_tokens_to_remove:]\n                else:\n                    raise ValueError(\"invalid truncation strategy:\" + str(\n                        self.truncation_side))\n            else:\n                logger.error(\n                    f\"We need to remove {num_tokens_to_remove} to truncate the input \"\n                    f\"but the second sequence has a length {len(pair_ids)}. \"\n                    f\"Please select another truncation strategy than {truncation_strategy}, \"\n                    \"for instance 'longest_first' or 'only_first'.\")\n\n        return (ids, pair_ids, overflowing_tokens)\n\n    def num_special_tokens_to_add(self, pair: bool=False) -> int:\n        \"\"\"\n        Returns the number of added tokens when encoding a sequence with special tokens.\n\n        <Tip>\n\n        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. 
Do not put\n        this inside your training loop.\n\n        </Tip>\n\n        Args:\n            pair (`bool`, *optional*, defaults to `False`):\n                Whether the number of added tokens should be computed in the case of a sequence pair or a single\n                sequence.\n\n        Returns:\n            `int`: Number of special tokens added to sequences.\n        \"\"\"\n        token_ids_0 = []\n        token_ids_1 = []\n        return len(\n            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1\n                                                  if pair else None))\n\n    def prepare_for_model(self,\n                          ids,\n                          pair_ids=None,\n                          add_special_tokens=True,\n                          padding=False,\n                          truncation=False,\n                          max_length=None,\n                          stride=0,\n                          pad_to_multiple_of=None,\n                          return_tensors=None,\n                          return_token_type_ids=None,\n                          return_attention_mask=None,\n                          return_overflowing_tokens=False,\n                          return_special_tokens_mask=False,\n                          return_offsets_mapping=False,\n                          return_length=False,\n                          verbose=True,\n                          prepend_batch_axis=False,\n                          **kwargs):\n        \"\"\"\n        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It\n        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and\n        manages a moving window (with user defined stride) for overflowing tokens. 
Please Note, for *pair_ids*\n        different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return\n        overflowing tokens. Such a combination of arguments will raise an error.\n\n        Args:\n            ids (`List[int]`):\n                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and\n                `convert_tokens_to_ids` methods.\n            pair_ids (`List[int]`, *optional*):\n                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`\n                and `convert_tokens_to_ids` methods.\n        \"\"\"\n\n        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\n        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(\n            padding=padding,\n            truncation=truncation,\n            max_length=max_length,\n            pad_to_multiple_of=pad_to_multiple_of,\n            verbose=verbose,\n            **kwargs, )\n\n        pair = bool(pair_ids is not None)\n        len_ids = len(ids)\n        len_pair_ids = len(pair_ids) if pair else 0\n\n        if return_token_type_ids and not add_special_tokens:\n            raise ValueError(\n                \"Asking to return token_type_ids while setting add_special_tokens to False \"\n                \"results in an undefined behavior. Please set add_special_tokens to True or \"\n                \"set return_token_type_ids to None.\")\n\n        if (return_overflowing_tokens and\n                truncation_strategy == TruncationStrategy.LONGEST_FIRST and\n                pair_ids is not None):\n            raise ValueError(\n                \"Not possible to return overflowing tokens for pair of sequences with the \"\n                \"`longest_first`. 
Please select another truncation strategy than `longest_first`, \"\n                \"for instance `only_second` or `only_first`.\")\n\n        # Load from model defaults\n        if return_token_type_ids is None:\n            return_token_type_ids = \"token_type_ids\" in self.model_input_names\n        if return_attention_mask is None:\n            return_attention_mask = \"attention_mask\" in self.model_input_names\n\n        encoded_inputs = {}\n\n        # Compute the total size of the returned encodings\n        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(\n            pair=pair) if add_special_tokens else 0)\n\n        # Truncation: Handle max sequence length\n        overflowing_tokens = []\n        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:\n            ids, pair_ids, overflowing_tokens = self.truncate_sequences(\n                ids,\n                pair_ids=pair_ids,\n                num_tokens_to_remove=total_len - max_length,\n                truncation_strategy=truncation_strategy,\n                stride=stride, )\n\n        if return_overflowing_tokens:\n            encoded_inputs[\"overflowing_tokens\"] = overflowing_tokens\n            encoded_inputs[\"num_truncated_tokens\"] = total_len - max_length\n\n        # Add special tokens\n        if add_special_tokens:\n            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)\n            token_type_ids = self.create_token_type_ids_from_sequences(\n                ids, pair_ids)\n        else:\n            sequence = ids + pair_ids if pair else ids\n            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids)\n                                               if pair else [])\n\n        # Build output dictionary\n        encoded_inputs[\"input_ids\"] = sequence\n        if return_token_type_ids:\n            encoded_inputs[\"token_type_ids\"] = token_type_ids\n        if 
return_special_tokens_mask:\n            if add_special_tokens:\n                encoded_inputs[\n                    \"special_tokens_mask\"] = self.get_special_tokens_mask(\n                        ids, pair_ids)\n            else:\n                encoded_inputs[\"special_tokens_mask\"] = [0] * len(sequence)\n\n        # Check lengths\n        self._eventual_warn_about_too_long_sequence(\n            encoded_inputs[\"input_ids\"], max_length, verbose)\n\n        # Padding\n        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:\n            encoded_inputs = self.pad(\n                encoded_inputs,\n                max_length=max_length,\n                padding=padding_strategy.value,\n                pad_to_multiple_of=pad_to_multiple_of,\n                return_attention_mask=return_attention_mask, )\n\n        if return_length:\n            encoded_inputs[\"length\"] = len(encoded_inputs[\"input_ids\"])\n\n        batch_outputs = BatchEncoding(\n            encoded_inputs,\n            tensor_type=return_tensors,\n            prepend_batch_axis=prepend_batch_axis)\n        return batch_outputs\n\n    def _batch_prepare_for_model(\n            self,\n            batch_ids_pairs,\n            add_special_tokens=True,\n            padding_strategy=PaddingStrategy.DO_NOT_PAD,\n            truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,\n            max_length=None,\n            stride=0,\n            pad_to_multiple_of=None,\n            return_tensors=None,\n            return_token_type_ids=None,\n            return_attention_mask=None,\n            return_overflowing_tokens=False,\n            return_special_tokens_mask=False,\n            return_length=False,\n            verbose=True, ):\n        \"\"\"\n        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. 
It\n        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and\n        manages a moving window (with user defined stride) for overflowing tokens\n\n        Args:\n            batch_ids_pairs: list of tokenized input ids or input ids pairs\n        \"\"\"\n\n        batch_outputs = {}\n        for first_ids, second_ids in batch_ids_pairs:\n            outputs = self.prepare_for_model(\n                first_ids,\n                second_ids,\n                add_special_tokens=add_special_tokens,\n                padding=PaddingStrategy.DO_NOT_PAD.\n                value,  # we pad in batch afterward\n                truncation=truncation_strategy.value,\n                max_length=max_length,\n                stride=stride,\n                pad_to_multiple_of=None,  # we pad in batch afterward\n                return_attention_mask=False,  # we pad in batch afterward\n                return_token_type_ids=return_token_type_ids,\n                return_overflowing_tokens=return_overflowing_tokens,\n                return_special_tokens_mask=return_special_tokens_mask,\n                return_length=return_length,\n                return_tensors=None,  # We convert the whole batch to tensors at the end\n                prepend_batch_axis=False,\n                verbose=verbose, )\n\n            for key, value in outputs.items():\n                if key not in batch_outputs:\n                    batch_outputs[key] = []\n                batch_outputs[key].append(value)\n\n        batch_outputs = self.pad(\n            batch_outputs,\n            padding=padding_strategy.value,\n            max_length=max_length,\n            pad_to_multiple_of=pad_to_multiple_of,\n            return_attention_mask=return_attention_mask, )\n\n        batch_outputs = BatchEncoding(\n            batch_outputs, tensor_type=return_tensors)\n\n        return batch_outputs\n\n    def _get_padding_truncation_strategies(self,\n               
                            padding=False,\n                                           truncation=False,\n                                           max_length=None,\n                                           pad_to_multiple_of=None,\n                                           verbose=True,\n                                           **kwargs):\n        \"\"\"\n        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy\n        and pad_to_max_length) and behaviors.\n        \"\"\"\n        old_truncation_strategy = kwargs.pop(\"truncation_strategy\",\n                                             \"do_not_truncate\")\n        old_pad_to_max_length = kwargs.pop(\"pad_to_max_length\", False)\n\n        # Backward compatibility for previous behavior, maybe we should deprecate it:\n        # If you only set max_length, it activates truncation for max_length\n        if max_length is not None and padding is False and truncation is False:\n            if verbose:\n                if not self.deprecation_warnings.get(\n                        \"Truncation-not-explicitly-activated\", False):\n                    logger.warning(\n                        \"Truncation was not explicitly activated but `max_length` is provided a specific value, please\"\n                        \" use `truncation=True` to explicitly truncate examples to max length. Defaulting to\"\n                        \" 'longest_first' truncation strategy. 
If you encode pairs of sequences (GLUE-style) with the\"\n                        \" tokenizer you can select this strategy more precisely by providing a specific strategy to\"\n                        \" `truncation`.\")\n                self.deprecation_warnings[\n                    \"Truncation-not-explicitly-activated\"] = True\n            truncation = \"longest_first\"\n\n        # Get padding strategy\n        if padding is False and old_pad_to_max_length:\n            if verbose:\n                warnings.warn(\n                    \"The `pad_to_max_length` argument is deprecated and will be removed in a future version, \"\n                    \"use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or \"\n                    \"use `padding='max_length'` to pad to a max length. In this case, you can give a specific \"\n                    \"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the \"\n                    \"maximal input size of the model (e.g. 512 for Bert).\",\n                    FutureWarning, )\n            if max_length is None:\n                padding_strategy = PaddingStrategy.LONGEST\n            else:\n                padding_strategy = PaddingStrategy.MAX_LENGTH\n        elif padding is not False:\n            if padding is True:\n                if verbose:\n                    if max_length is not None and (\n                            truncation is False or\n                            truncation == \"do_not_truncate\"):\n                        warnings.warn(\n                            \"`max_length` is ignored when `padding`=`True` and there is no truncation strategy. 
\"\n                            \"To pad to max length, use `padding='max_length'`.\")\n                    if old_pad_to_max_length is not False:\n                        warnings.warn(\n                            \"Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.\"\n                        )\n                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch\n            elif not isinstance(padding, PaddingStrategy):\n                padding_strategy = PaddingStrategy(padding)\n            elif isinstance(padding, PaddingStrategy):\n                padding_strategy = padding\n        else:\n            padding_strategy = PaddingStrategy.DO_NOT_PAD\n\n        # Get truncation strategy\n        if truncation is False and old_truncation_strategy != \"do_not_truncate\":\n            if verbose:\n                warnings.warn(\n                    \"The `truncation_strategy` argument is deprecated and will be removed in a future version, use\"\n                    \" `truncation=True` to truncate examples to a max length. You can give a specific length with\"\n                    \" `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input\"\n                    \" size of the model (e.g. 512 for Bert).  
If you have pairs of inputs, you can give a specific\"\n                    \" truncation strategy selected among `truncation='only_first'` (will only truncate the first\"\n                    \" sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the\"\n                    \" pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence\"\n                    \" in the pairs).\",\n                    FutureWarning, )\n            truncation_strategy = TruncationStrategy(old_truncation_strategy)\n        elif truncation is not False:\n            if truncation is True:\n                truncation_strategy = (\n                    TruncationStrategy.LONGEST_FIRST\n                )  # Default to truncate the longest sequences in pairs of inputs\n            elif not isinstance(truncation, TruncationStrategy):\n                truncation_strategy = TruncationStrategy(truncation)\n            elif isinstance(truncation, TruncationStrategy):\n                truncation_strategy = truncation\n        else:\n            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE\n\n        # Set max length if needed\n        if max_length is None:\n            if padding_strategy == PaddingStrategy.MAX_LENGTH:\n                if self.model_max_length > LARGE_INTEGER:\n                    if verbose:\n                        if not self.deprecation_warnings.get(\n                                \"Asking-to-pad-to-max_length\", False):\n                            logger.warning(\n                                \"Asking to pad to max_length but no maximum length is provided and the model has no\"\n                                \" predefined maximum length. 
Default to no padding.\"\n                            )\n                        self.deprecation_warnings[\n                            \"Asking-to-pad-to-max_length\"] = True\n                    padding_strategy = PaddingStrategy.DO_NOT_PAD\n                else:\n                    max_length = self.model_max_length\n\n            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:\n                if self.model_max_length > LARGE_INTEGER:\n                    if verbose:\n                        if not self.deprecation_warnings.get(\n                                \"Asking-to-truncate-to-max_length\", False):\n                            logger.warning(\n                                \"Asking to truncate to max_length but no maximum length is provided and the model has\"\n                                \" no predefined maximum length. Default to no truncation.\"\n                            )\n                        self.deprecation_warnings[\n                            \"Asking-to-truncate-to-max_length\"] = True\n                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE\n                else:\n                    max_length = self.model_max_length\n\n        # Test if we have a padding token\n        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (\n                not self.pad_token or self.pad_token_id < 0):\n            raise ValueError(\n                \"Asking to pad but the tokenizer does not have a padding token. 
\"\n                \"Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` \"\n                \"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.\"\n            )\n\n        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided\n        if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and\n                padding_strategy != PaddingStrategy.DO_NOT_PAD and\n                pad_to_multiple_of is not None and max_length is not None and\n            (max_length % pad_to_multiple_of != 0)):\n            raise ValueError(\n                \"Truncation and padding are both activated but \"\n                f\"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of}).\"\n            )\n\n        return padding_strategy, truncation_strategy, max_length, kwargs\n\n    def batch_encode_plus(self,\n                          batch_text_or_text_pairs,\n                          add_special_tokens=True,\n                          padding=False,\n                          truncation=False,\n                          max_length=None,\n                          stride=0,\n                          is_split_into_words=False,\n                          pad_to_multiple_of=None,\n                          return_tensors=None,\n                          return_token_type_ids=None,\n                          return_attention_mask=None,\n                          return_overflowing_tokens=False,\n                          return_special_tokens_mask=False,\n                          return_offsets_mapping=False,\n                          return_length=False,\n                          verbose=True,\n                          **kwargs):\n        \"\"\"\n        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.\n\n        <Tip warning={true}>\n\n        This method is deprecated, `__call__` should 
be used instead.\n\n        </Tip>\n\n        Args:\n            batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):\n                Batch of sequences or pair of sequences to be encoded. This can be a list of\n                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see\n                details in `encode_plus`).\n        \"\"\"\n\n        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\n        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(\n            padding=padding,\n            truncation=truncation,\n            max_length=max_length,\n            pad_to_multiple_of=pad_to_multiple_of,\n            verbose=verbose,\n            **kwargs, )\n\n        return self._batch_encode_plus(\n            batch_text_or_text_pairs=batch_text_or_text_pairs,\n            add_special_tokens=add_special_tokens,\n            padding_strategy=padding_strategy,\n            truncation_strategy=truncation_strategy,\n            max_length=max_length,\n            stride=stride,\n            is_split_into_words=is_split_into_words,\n            pad_to_multiple_of=pad_to_multiple_of,\n            return_tensors=return_tensors,\n            return_token_type_ids=return_token_type_ids,\n            return_attention_mask=return_attention_mask,\n            return_overflowing_tokens=return_overflowing_tokens,\n            return_special_tokens_mask=return_special_tokens_mask,\n            return_offsets_mapping=return_offsets_mapping,\n            return_length=return_length,\n            verbose=verbose,\n            **kwargs, )\n\n    def _batch_encode_plus(\n            self,\n            batch_text_or_text_pairs,\n            add_special_tokens=True,\n            
padding_strategy=PaddingStrategy.DO_NOT_PAD,\n            truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,\n            max_length=None,\n            stride=0,\n            is_split_into_words=False,\n            pad_to_multiple_of=None,\n            return_tensors=None,\n            return_token_type_ids=None,\n            return_attention_mask=None,\n            return_overflowing_tokens=False,\n            return_special_tokens_mask=False,\n            return_offsets_mapping=False,\n            return_length=False,\n            verbose=True,\n            **kwargs):\n        def get_input_ids(text):\n            if isinstance(text, str):\n                tokens = self.tokenize(text, **kwargs)\n                return self.convert_tokens_to_ids(tokens)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], str):\n                if is_split_into_words:\n                    tokens = list(\n                        itertools.chain(*(self.tokenize(\n                            t, is_split_into_words=True, **kwargs)\n                                          for t in text)))\n                    return self.convert_tokens_to_ids(tokens)\n                else:\n                    return self.convert_tokens_to_ids(text)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], int):\n                return text\n            else:\n                raise ValueError(\n                    \"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.\"\n                )\n\n        if return_offsets_mapping:\n            raise NotImplementedError(\n                \"return_offset_mapping is not available when using Python tokenizers. 
\"\n                \"To use this feature, change your tokenizer to one deriving from \"\n                \"transformers.PreTrainedTokenizerFast.\")\n\n        input_ids = []\n        for ids_or_pair_ids in batch_text_or_text_pairs:\n            if not isinstance(ids_or_pair_ids, (list, tuple)):\n                ids, pair_ids = ids_or_pair_ids, None\n            elif is_split_into_words and not isinstance(ids_or_pair_ids[0],\n                                                        (list, tuple)):\n                ids, pair_ids = ids_or_pair_ids, None\n            else:\n                ids, pair_ids = ids_or_pair_ids\n\n            first_ids = get_input_ids(ids)\n            second_ids = get_input_ids(\n                pair_ids) if pair_ids is not None else None\n            input_ids.append((first_ids, second_ids))\n\n        batch_outputs = self._batch_prepare_for_model(\n            input_ids,\n            add_special_tokens=add_special_tokens,\n            padding_strategy=padding_strategy,\n            truncation_strategy=truncation_strategy,\n            max_length=max_length,\n            stride=stride,\n            pad_to_multiple_of=pad_to_multiple_of,\n            return_attention_mask=return_attention_mask,\n            return_token_type_ids=return_token_type_ids,\n            return_overflowing_tokens=return_overflowing_tokens,\n            return_special_tokens_mask=return_special_tokens_mask,\n            return_length=return_length,\n            return_tensors=return_tensors,\n            verbose=verbose, )\n\n        return BatchEncoding(batch_outputs)\n\n    def tokenize(self, text, **kwargs):\n        \"\"\"\n        Converts a string in a sequence of tokens, using the tokenizer.\n\n        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies\n        (BPE/SentencePieces/WordPieces). 
Takes care of added tokens.\n\n        Args:\n            text (`str`):\n                The sequence to be encoded.\n            **kwargs (additional keyword arguments):\n                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.\n\n        Returns:\n            `List[str]`: The list of tokens.\n        \"\"\"\n        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors\n        all_special_tokens_extended = dict(\n            (str(t), t) for t in self.all_special_tokens_extended\n            if isinstance(t, AddedToken))\n\n        text, kwargs = self.prepare_for_tokenization(text, **kwargs)\n\n        if kwargs:\n            logger.warning(f\"Keyword arguments {kwargs} not recognized.\")\n\n        # TODO: should this be in the base class?\n        if hasattr(self, \"do_lower_case\") and self.do_lower_case:\n            # convert non-special tokens to lowercase\n            escaped_special_toks = [\n                re.escape(s_tok)\n                for s_tok in (self.unique_no_split_tokens +\n                              self.all_special_tokens)\n            ]\n            pattern = r\"(\" + r\"|\".join(escaped_special_toks) + r\")|\" + r\"(.+?)\"\n            text = re.sub(pattern,\n                          lambda m: m.groups()[0] or m.groups()[1].lower(),\n                          text)\n\n        no_split_token = set(self.unique_no_split_tokens)\n        tokens = self.tokens_trie.split(text)\n        # [\"This is something\", \"<special_token_1>\", \"  else\"]\n        for i, token in enumerate(tokens):\n            if token in no_split_token:\n                tok_extended = all_special_tokens_extended.get(token, None)\n                left = tokens[i - 1] if i > 0 else None\n                right = tokens[i + 1] if i < len(tokens) - 1 else None\n                if isinstance(tok_extended, AddedToken):\n                    if tok_extended.rstrip and right:\n                  
      # A bit counter-intuitive but we strip the left of the string\n                        # since tok_extended.rstrip means the special token is eating all white spaces on its right\n                        tokens[i + 1] = right.lstrip()\n                    # Strip white spaces on the left\n                    if tok_extended.lstrip and left:\n                        tokens[i - 1] = left.rstrip()  # Opposite here\n                else:\n                    # We strip left and right by default\n                    if right:\n                        tokens[i + 1] = right.lstrip()\n                    if left:\n                        tokens[i - 1] = left.rstrip()\n        # [\"This is something\", \"<special_token_1>\", \"else\"]\n        tokenized_text = []\n        for token in tokens:\n            # Need to skip eventual empty (fully stripped) tokens\n            if not token:\n                continue\n            if token in no_split_token:\n                tokenized_text.append(token)\n            else:\n                tokenized_text.extend(self._tokenize(token))\n        # [\"This\", \" is\", \" something\", \"<special_token_1>\", \"else\"]\n        return tokenized_text\n\n\nclass SPMTokenizer:\n    r\"\"\"\n    Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).\n\n    Args:\n        vocab_file (`str`):\n            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that\n            contains the vocabulary necessary to instantiate a tokenizer.\n        sp_model_kwargs (`dict`, *optional*):\n            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for\n            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,\n            to set:\n\n            - `enable_sampling`: Enable subword regularization.\n            - `nbest_size`: Sampling parameters for unigram. 
Invalid for BPE-Dropout.\n\n              - `nbest_size = {0,1}`: No sampling is performed.\n              - `nbest_size > 1`: samples from the nbest_size results.\n              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)\n                using forward-filtering-and-backward-sampling algorithm.\n\n            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for\n              BPE-dropout.\n    \"\"\"\n\n    def __init__(self,\n                 vocab_file,\n                 split_by_punct=False,\n                 sp_model_kwargs: Optional[Dict[str, Any]]=None):\n        self.split_by_punct = split_by_punct\n        self.vocab_file = vocab_file\n        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs\n        spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)\n        if not os.path.exists(vocab_file):\n            raise FileNotFoundError(f\"{vocab_file} does not exist!\")\n        spm.load(vocab_file)\n        bpe_vocab_size = spm.GetPieceSize()\n        # Token map\n        # <unk> 0+1\n        # <s> 1+1\n        # </s> 2+1\n        self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}\n        self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]\n        # self.vocab['[PAD]'] = 0\n        # self.vocab['[CLS]'] = 1\n        # self.vocab['[SEP]'] = 2\n        # self.vocab['[UNK]'] = 3\n\n        self.spm = spm\n\n    def __getstate__(self):\n        state = self.__dict__.copy()\n        state[\"spm\"] = None\n        return state\n\n    def __setstate__(self, d):\n        self.__dict__ = d\n\n        # for backward compatibility\n        if not hasattr(self, \"sp_model_kwargs\"):\n            self.sp_model_kwargs = {}\n\n        self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)\n        self.spm.Load(self.vocab_file)\n\n    def tokenize(self, text):\n        return 
self._encode_as_pieces(text)\n\n    def convert_ids_to_tokens(self, ids):\n        tokens = []\n        for i in ids:\n            tokens.append(self.ids_to_tokens[i])\n        return tokens\n\n    def decode(self, tokens, start=-1, end=-1, raw_text=None):\n        if raw_text is None:\n            return self.spm.decode_pieces([t for t in tokens])\n        else:\n            words = self.split_to_words(raw_text)\n            word_tokens = [self.tokenize(w) for w in words]\n            token2words = [0] * len(tokens)\n            tid = 0\n            for i, w in enumerate(word_tokens):\n                for k, t in enumerate(w):\n                    token2words[tid] = i\n                    tid += 1\n            word_start = token2words[start]\n            word_end = token2words[end] if end < len(tokens) else len(words)\n            text = \"\".join(words[word_start:word_end])\n            return text\n\n    def add_special_token(self, token):\n        if token not in self.special_tokens:\n            self.special_tokens.append(token)\n            if token not in self.vocab:\n                self.vocab[token] = len(self.vocab) - 1\n                self.ids_to_tokens.append(token)\n        return self.id(token)\n\n    def part_of_whole_word(self, token, is_bos=False):\n        if is_bos:\n            return True\n        if (len(token) == 1 and\n            (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or\n             _is_punctuation(list(token)[0]))) or token in self.special_tokens:\n            return False\n\n        word_start = b\"\\xe2\\x96\\x81\".decode(\"utf-8\")\n        return not token.startswith(word_start)\n\n    def pad(self):\n        return \"[PAD]\"\n\n    def bos(self):\n        return \"[CLS]\"\n\n    def eos(self):\n        return \"[SEP]\"\n\n    def unk(self):\n        return \"[UNK]\"\n\n    def mask(self):\n        return \"[MASK]\"\n\n    def sym(self, id):\n        return self.ids_to_tokens[id]\n\n    def id(self, sym):\n   
     return self.vocab[sym] if sym in self.vocab else 1\n\n    def _encode_as_pieces(self, text):\n        text = convert_to_unicode(text)\n        if self.split_by_punct:\n            words = self._run_split_on_punc(text)\n            pieces = [self.spm.encode(w, out_type=str) for w in words]\n            return [p for w in pieces for p in w]\n        else:\n            return self.spm.encode(text, out_type=str)\n\n    def split_to_words(self, text):\n        pieces = self._encode_as_pieces(text)\n        word_start = b\"\\xe2\\x96\\x81\".decode(\"utf-8\")\n        words = []\n        offset = 0\n        prev_end = 0\n        for i, p in enumerate(pieces):\n            if p.startswith(word_start):\n                if offset > prev_end:\n                    words.append(text[prev_end:offset])\n                prev_end = offset\n                w = p.replace(word_start, \"\")\n            else:\n                w = p\n            try:\n                s = text.index(w, offset)\n                pn = \"\"\n                k = i + 1\n                while k < len(pieces):\n                    pn = pieces[k].replace(word_start, \"\")\n                    if len(pn) > 0:\n                        break\n                    k += 1\n\n                if len(pn) > 0 and pn in text[offset:s]:\n                    offset = offset + 1\n                else:\n                    offset = s + len(w)\n            except Exception:\n                offset = offset + 1\n\n        if prev_end < offset:\n            words.append(text[prev_end:offset])\n\n        return words\n\n    def _run_strip_accents(self, text):\n        \"\"\"Strips accents from a piece of text.\"\"\"\n        text = unicodedata.normalize(\"NFD\", text)\n        output = []\n        for char in text:\n            cat = unicodedata.category(char)\n            if cat == \"Mn\":\n                continue\n            output.append(char)\n        return \"\".join(output)\n\n    def _run_split_on_punc(self, text):\n  
      \"\"\"Splits punctuation on a piece of text.\"\"\"\n        chars = list(text)\n        i = 0\n        start_new_word = True\n        output = []\n        while i < len(chars):\n            char = chars[i]\n            if _is_punctuation(char):\n                output.append([char])\n                start_new_word = True\n            else:\n                if start_new_word:\n                    output.append([])\n                start_new_word = False\n                output[-1].append(char)\n            i += 1\n\n        return [\"\".join(x) for x in output]\n\n    def save_pretrained(self, path: str, filename_prefix: str=None):\n        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]\n        if filename_prefix is not None:\n            filename = filename_prefix + \"-\" + filename\n        full_path = os.path.join(path, filename)\n        with open(full_path, \"wb\") as fs:\n            fs.write(self.spm.serialized_model_proto())\n        return (full_path, )\n\n\ndef _is_whitespace(char):\n    \"\"\"Checks whether `chars` is a whitespace character.\"\"\"\n    # \\t, \\n, and \\r are technically control characters but we treat them\n    # as whitespace since they are generally considered as such.\n    if char == \" \" or char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n        return True\n    cat = unicodedata.category(char)\n    if cat == \"Zs\":\n        return True\n    return False\n\n\ndef _is_control(char):\n    \"\"\"Checks whether `chars` is a control character.\"\"\"\n    # These are technically control characters but we count them as whitespace\n    # characters.\n    if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n        return False\n    cat = unicodedata.category(char)\n    if cat.startswith(\"C\"):\n        return True\n    return False\n\n\ndef _is_punctuation(char):\n    \"\"\"Checks whether `chars` is a punctuation character.\"\"\"\n    cp = ord(char)\n    # We treat all non-letter/number ASCII as 
punctuation.\n    # Characters such as \"^\", \"$\", and \"`\" are not in the Unicode\n    # Punctuation class but we treat them as punctuation anyways, for\n    # consistency.\n    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (\n            cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):\n        return True\n    cat = unicodedata.category(char)\n    if cat.startswith(\"P\"):\n        return True\n    return False\n\n\ndef convert_to_unicode(text):\n    \"\"\"Converts `text` to Unicode (if it's not already), assuming utf-8 input.\"\"\"\n    if isinstance(text, str):\n        return text\n    elif isinstance(text, bytes):\n        return text.decode(\"utf-8\", \"ignore\")\n    else:\n        raise ValueError(f\"Unsupported string type: {type(text)}\")\n"
  },
  {
    "path": "ppfleetx/data/tokenizers/ernie_tokenizer.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom paddlenlp.transformers import ErnieTokenizer\n\ntokenizer = None\n\n\ndef get_ernie_tokenizer(tokenizer_type):\n    global tokenizer\n    if tokenizer is None:\n        tokenizer = ErnieTokenizer.from_pretrained(tokenizer_type)\n\n    return tokenizer\n"
  },
  {
    "path": "ppfleetx/data/tokenizers/gpt_tokenizer.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Tokenization classes for OpenAI GPT.\"\"\"\n\nfrom __future__ import (absolute_import, division, print_function,\n                        unicode_literals)\n\nimport sys\nimport json\nimport logging\nimport warnings\nimport os\nimport regex as re\nfrom io import open\n\nfrom ppfleetx.utils.download import cached_path\n\ntry:\n    from functools import lru_cache\nexcept ImportError:\n    # Just a dummy decorator to get the checks to run on python2\n    # because honestly I don't want to support a byte-level unicode BPE\n    # tokenizer on python 2 right now.\n    def lru_cache():\n        return lambda func: func\n\n\nfrom ppfleetx.utils.log import logger\n\ntry:\n    import paddlenlp\n    from paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer\nexcept ImportError:\n    raise ImportError(\"Please import paddlenlp before running the GPT tasks.\")\n\nPRETRAINED_VOCAB_ARCHIVE_MAP = {\n    'gpt2': \"http://fleet.bj.bcebos.com/datasets/gpt/gpt2-vocab.json\",\n}\nPRETRAINED_MERGES_ARCHIVE_MAP = {\n    'gpt2': \"http://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt\",\n}\nPRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {'gpt2': 1024, }\nVOCAB_NAME = 'vocab.json'\nMERGES_NAME = 'merges.txt'\nSPECIAL_TOKENS_NAME = 
'special_tokens.txt'\n\n\n@lru_cache()\ndef bytes_to_unicode():\n    \"\"\"\n    Returns list of utf-8 byte and a corresponding list of unicode strings.\n    The reversible bpe codes work on unicode strings.\n    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.\n    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.\n    This is a signficant percentage of your normal, say, 32K bpe vocab.\n    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.\n    And avoids mapping to whitespace/control characters the bpe code barfs on.\n    \"\"\"\n    _chr = unichr if sys.version_info[0] == 2 else chr\n    bs = list(range(ord(\"!\"), ord(\"~\") + 1)) + list(range(ord(\"¡\"), ord(\"¬\") + 1)) + \\\n        list(range(ord(\"®\"), ord(\"ÿ\") + 1))\n    cs = bs[:]\n    n = 0\n    for b in range(2**8):\n        if b not in bs:\n            bs.append(b)\n            cs.append(2**8 + n)\n            n += 1\n    cs = [_chr(n) for n in cs]\n    return dict(zip(bs, cs))\n\n\ndef get_pairs(word):\n    \"\"\"Return set of symbol pairs in a word.\n    Word is represented as tuple of symbols (symbols being variable-length strings).\n    \"\"\"\n    pairs = set()\n    prev_char = word[0]\n    for char in word[1:]:\n        pairs.add((prev_char, char))\n        prev_char = char\n    return pairs\n\n\nclass GPTTokenizer(object):\n    \"\"\"\n    GPT-2 BPE tokenizer. 
Peculiarities:\n        - Byte-level BPE\n    \"\"\"\n\n    padding_side = \"right\"\n    truncation_side = \"right\"\n    model_input_names = [\"input_ids\", \"token_type_ids\", \"attention_mask\"]\n    pad_token_type_id = 0\n    pad_token_id = 0\n\n    @classmethod\n    def from_pretrained(cls,\n                        pretrained_model_name_or_path,\n                        cache_dir=None,\n                        *inputs,\n                        **kwargs):\n        \"\"\"\n        Instantiate a PreTrainedBertModel from a pre-trained model file.\n        Download and cache the pre-trained model file if needed.\n        \"\"\"\n        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:\n            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[\n                pretrained_model_name_or_path]\n            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[\n                pretrained_model_name_or_path]\n            special_tokens_file = None\n        else:\n            vocab_file = os.path.join(pretrained_model_name_or_path,\n                                      VOCAB_NAME)\n            merges_file = os.path.join(pretrained_model_name_or_path,\n                                       MERGES_NAME)\n            special_tokens_file = os.path.join(pretrained_model_name_or_path,\n                                               SPECIAL_TOKENS_NAME)\n            if not os.path.exists(special_tokens_file):\n                special_tokens_file = None\n            else:\n                logger.info(\"loading special tokens file {}\".format(\n                    special_tokens_file))\n        # redirect to the cache, if necessary\n        try:\n            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)\n            resolved_merges_file = cached_path(\n                merges_file, cache_dir=cache_dir)\n        except Exception as e:\n            logger.info(e)\n            logger.error(\n                \"Model name '{}' was not found in model name 
list ({}). \"\n                \"We assumed '{}' was a path or url but couldn't find files {} and {} \"\n                \"at this path or url.\".format(\n                    pretrained_model_name_or_path, ', '.join(\n                        PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),\n                    pretrained_model_name_or_path, vocab_file, merges_file))\n            return None\n        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:\n            logger.info(\"loading vocabulary file {}\".format(vocab_file))\n            logger.info(\"loading merges file {}\".format(merges_file))\n        else:\n            logger.info(\"loading vocabulary file {} from cache at {}\".format(\n                vocab_file, resolved_vocab_file))\n            logger.info(\"loading merges file {} from cache at {}\".format(\n                merges_file, resolved_merges_file))\n        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:\n            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer\n            # than the number of positional embeddings\n            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[\n                pretrained_model_name_or_path]\n            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)\n        # Instantiate tokenizer.\n        if special_tokens_file and 'special_tokens' not in kwargs:\n            special_tokens = open(\n                special_tokens_file, encoding='utf-8').read().split('\\n')[:-1]\n        else:\n            special_tokens = kwargs.pop('special_tokens', [])\n        tokenizer = cls(resolved_vocab_file,\n                        resolved_merges_file,\n                        special_tokens=special_tokens,\n                        *inputs,\n                        **kwargs)\n        return tokenizer\n\n    def __init__(self,\n                 vocab_file,\n                 merges_file,\n                 
errors='replace',\n                 special_tokens=None,\n                 max_len=None,\n                 **kwargs):\n\n        self.padding_side = kwargs.pop(\"padding_side\", self.padding_side)\n        if self.padding_side not in [\"right\", \"left\"]:\n            raise ValueError(\n                f\"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}\"\n            )\n\n        self.truncation_side = kwargs.pop(\"truncation_side\",\n                                          self.truncation_side)\n        if self.truncation_side not in [\"right\", \"left\"]:\n            raise ValueError(\n                f\"Padding side should be selected between 'right' and 'left', current value: {self.truncation_side}\"\n            )\n\n        self.max_len = max_len if max_len is not None else int(1e12)\n        self.encoder = json.load(open(vocab_file))\n        self.decoder = {v: k for k, v in self.encoder.items()}\n        self.errors = errors  # how to handle errors in decoding\n        self.byte_encoder = bytes_to_unicode()\n        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}\n        bpe_data = open(merges_file, encoding='utf-8').read().split('\\n')[1:-1]\n        bpe_merges = [tuple(merge.split()) for merge in bpe_data]\n        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))\n        self.cache = {}\n\n        # Should haved added re.IGNORECASE so BPE merges can happen for\n        # capitalized versions of contractions\n        self.eod_id = self.encoder['<|endoftext|>']\n        self.pat = re.compile(\n            r\"\"\"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+\"\"\"\n        )\n\n        self.special_tokens = {}\n        self.special_tokens_decoder = {}\n        self.set_special_tokens(special_tokens)\n\n    def __call__(self,\n                 text,\n                 text_pair=None,\n                 add_special_tokens=True,\n               
  padding=False,\n                 truncation=False,\n                 max_length=None,\n                 pad_to_multiple_of=None,\n                 return_token_type_ids=None,\n                 return_attention_mask=None,\n                 return_overflowing_tokens=False,\n                 return_length=False):\n        assert padding in [True, False, \"longest\", \"max_length\", \"do_not_pad\"]\n\n        if max_length is not None and padding is False and truncation is False:\n            truncation = \"longest_first\"\n\n        if padding is True:\n            padding = \"longest\"\n        elif padding is False:\n            padding = \"do_not_pad\"\n\n        assert truncation in [\n            True, False, \"only_first\", \"only_second\", \"longest_first\",\n            \"do_not_truncate\"\n        ]\n        if truncation is True:\n            truncation = \"longest_first\"\n        elif truncation is False:\n            truncation = \"do_not_truncate\"\n\n        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided\n        if (truncation != \"do_not_truncate\" and padding != \"do_not_pad\" and\n                pad_to_multiple_of is not None and max_length is not None and\n            (max_length % pad_to_multiple_of != 0)):\n            raise ValueError(\n                \"Truncation and padding are both activated but \"\n                f\"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of}).\"\n            )\n\n        is_batched = isinstance(text, (list, tuple))\n        if is_batched:\n            raise NotImplementedError\n        else:\n            return self.encode_plus(\n                text=text,\n                text_pair=text_pair,\n                add_special_tokens=add_special_tokens,\n                padding=padding,\n                truncation=truncation,\n                max_length=max_length,\n                pad_to_multiple_of=pad_to_multiple_of,\n              
  return_token_type_ids=return_token_type_ids,\n                return_attention_mask=return_attention_mask,\n                return_overflowing_tokens=return_overflowing_tokens,\n                return_length=return_length)\n\n    def encode_plus(self,\n                    text,\n                    text_pair,\n                    add_special_tokens=True,\n                    padding=\"do_not_pad\",\n                    truncation=\"do_not_truncate\",\n                    max_length=None,\n                    pad_to_multiple_of=None,\n                    return_token_type_ids=None,\n                    return_attention_mask=None,\n                    return_overflowing_tokens=False,\n                    return_length=False,\n                    **kwargs):\n        def get_input_ids(text):\n            if isinstance(text, str):\n                tokens = self.tokenize(text, **kwargs)\n                return self.convert_tokens_to_ids(tokens)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], str):\n                if is_split_into_words:\n                    tokens = list(\n                        itertools.chain(*(self.tokenize(\n                            t, is_split_into_words=True, **kwargs)\n                                          for t in text)))\n                    return self.convert_tokens_to_ids(tokens)\n                else:\n                    return self.convert_tokens_to_ids(text)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], int):\n                return text\n            else:\n                raise NotImplementedError\n\n        first_ids = get_input_ids(text)\n        second_ids = get_input_ids(\n            text_pair) if text_pair is not None else None\n\n        pair = bool(second_ids is not None)\n        len_ids = len(first_ids)\n 
       len_pair_ids = len(second_ids) if pair else 0\n\n        if return_token_type_ids and not add_special_tokens:\n            raise ValueError(\n                \"Asking to return token_type_ids while setting add_special_tokens to False \"\n                \"results in an undefined behavior. Please set add_special_tokens to True or \"\n                \"set return_token_type_ids to None.\")\n\n        # Load from model defaults\n        if return_token_type_ids is None:\n            return_token_type_ids = \"token_type_ids\" in self.model_input_names\n        if return_attention_mask is None:\n            return_attention_mask = \"attention_mask\" in self.model_input_names\n\n        encoded_inputs = {}\n        # Compute the total size of the returned encodings\n        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(\n            pair=pair) if add_special_tokens else 0)\n\n        # Truncation: Handle max sequence length\n        overflowing_tokens = []\n        if truncation != \"do_not_truncate\" and max_length and total_len > max_length:\n            first_ids, second_ids, overflowing_tokens = self.truncate_sequences(\n                first_ids,\n                pair_ids=second_ids,\n                num_tokens_to_remove=total_len - max_length,\n                truncation=truncation, )\n        if return_overflowing_tokens:\n            encoded_inputs[\"overflowing_tokens\"] = overflowing_tokens\n            encoded_inputs[\"num_truncated_tokens\"] = total_len - max_length\n\n        # Add special tokens\n        if add_special_tokens:\n            sequence = self.build_inputs_with_special_tokens(first_ids,\n                                                             second_ids)\n            token_type_ids = self.create_token_type_ids_from_sequences(\n                first_ids, second_ids)\n        else:\n            sequence = first_ids + second_ids if pair else first_ids\n            token_type_ids = [0] * len(first_ids) + ([0] * 
len(second_ids)\n                                                     if pair else [])\n\n        # Build output dictionary\n        encoded_inputs[\"input_ids\"] = sequence\n        if return_token_type_ids:\n            encoded_inputs[\"token_type_ids\"] = token_type_ids\n\n        # Padding\n        if padding != \"do_not_pad\" or return_attention_mask:\n            encoded_inputs = self.pad(\n                encoded_inputs,\n                max_length=max_length,\n                padding=padding,\n                pad_to_multiple_of=pad_to_multiple_of,\n                return_attention_mask=return_attention_mask, )\n\n        if return_length:\n            encoded_inputs[\"length\"] = len(encoded_inputs[\"input_ids\"])\n\n        return encoded_inputs\n\n    def num_special_tokens_to_add(self, pair: bool=False) -> int:\n        token_ids_0 = []\n        token_ids_1 = []\n        return len(\n            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1\n                                                  if pair else None))\n\n    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):\n        if token_ids_1 is None:\n            return token_ids_0\n        return token_ids_0 + token_ids_1\n\n    def create_token_type_ids_from_sequences(self,\n                                             token_ids_0,\n                                             token_ids_1=None):\n        if token_ids_1 is None:\n            return len(token_ids_0) * [0]\n        return [0] * len(token_ids_0) + [1] * len(token_ids_1)\n\n    def truncate_sequences(\n            self,\n            ids,\n            pair_ids=None,\n            num_tokens_to_remove=0,\n            truncation=\"longest_first\",\n            stride=0, ):\n        if num_tokens_to_remove <= 0:\n            return ids, pair_ids, []\n\n        overflowing_tokens = []\n        if truncation == \"only_first\" or (truncation == \"longest_first\" and\n                                          
pair_ids is None):\n            if len(ids) > num_tokens_to_remove:\n                window_len = min(len(ids), stride + num_tokens_to_remove)\n                if self.truncation_side == \"left\":\n                    overflowing_tokens = ids[:window_len]\n                    ids = ids[num_tokens_to_remove:]\n                elif self.truncation_side == \"right\":\n                    overflowing_tokens = ids[-window_len:]\n                    ids = ids[:-num_tokens_to_remove]\n                else:\n                    raise ValueError(\n                        f\"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.\"\n                    )\n\n            else:\n                error_msg = (\n                    f\"We need to remove {num_tokens_to_remove} to truncate the input \"\n                    f\"but the first sequence has a length {len(ids)}. \")\n                if truncation == \"only_first\":\n                    error_msg = (\n                        error_msg +\n                        \"Please select another truncation strategy than \"\n                        f\"{truncation}, for instance 'longest_first' or 'only_second'.\"\n                    )\n                logger.error(error_msg)\n        elif truncation == \"longest_first\":\n            warnings.warn(\n                \"Be aware, overflowing tokens are not returned for the setting you have chosen,\"\n                f\" i.e. sequence pairs with the '{truncation}' \"\n                \"truncation strategy. 
So the returned list will always be empty even if some \"\n                \"tokens have been removed.\")\n            for _ in range(num_tokens_to_remove):\n                if pair_ids is None or len(ids) > len(pair_ids):\n                    if self.truncation_side == \"right\":\n                        ids = ids[:-1]\n                    elif self.truncation_side == \"left\":\n                        ids = ids[1:]\n                    else:\n                        raise ValueError(\"invalid truncation strategy:\" + str(\n                            self.truncation_side))\n                else:\n                    if self.truncation_side == \"right\":\n                        pair_ids = pair_ids[:-1]\n                    elif self.truncation_side == \"left\":\n                        pair_ids = pair_ids[1:]\n                    else:\n                        raise ValueError(\"invalid truncation strategy:\" + str(\n                            self.truncation_side))\n        elif truncation == \"only_second\" and pair_ids is not None:\n            if len(pair_ids) > num_tokens_to_remove:\n                window_len = min(len(pair_ids), stride + num_tokens_to_remove)\n                if self.truncation_side == \"right\":\n                    overflowing_tokens = pair_ids[-window_len:]\n                    pair_ids = pair_ids[:-num_tokens_to_remove]\n                elif self.truncation_side == \"left\":\n                    overflowing_tokens = pair_ids[:window_len]\n                    pair_ids = pair_ids[num_tokens_to_remove:]\n                else:\n                    raise ValueError(\"invalid truncation strategy:\" + str(\n                        self.truncation_side))\n            else:\n                logger.error(\n                    f\"We need to remove {num_tokens_to_remove} to truncate the input \"\n                    f\"but the second sequence has a length {len(pair_ids)}. 
\"\n                    f\"Please select another truncation strategy than {truncation}, \"\n                    \"for instance 'longest_first' or 'only_first'.\")\n\n        return (ids, pair_ids, overflowing_tokens)\n\n    def pad(\n            self,\n            encoded_inputs,\n            padding=True,\n            max_length=None,\n            pad_to_multiple_of=None,\n            return_attention_mask=None,\n            return_tensors=None,\n            verbose=True, ):\n\n        # The model's main input name, usually `input_ids`, has be passed for padding\n        if self.model_input_names[0] not in encoded_inputs:\n            raise ValueError(\n                \"You should supply an encoding or a list of encodings to this method \"\n                f\"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}\"\n            )\n\n        required_input = encoded_inputs[self.model_input_names[0]]\n\n        if not required_input:\n            if return_attention_mask:\n                encoded_inputs[\"attention_mask\"] = []\n            return encoded_inputs\n\n        required_input = encoded_inputs[self.model_input_names[0]]\n\n        if required_input and not isinstance(required_input[0], (list, tuple)):\n            encoded_inputs = self._pad(\n                encoded_inputs,\n                max_length=max_length,\n                padding=padding,\n                pad_to_multiple_of=pad_to_multiple_of,\n                return_attention_mask=return_attention_mask, )\n            return encoded_inputs\n\n        batch_size = len(required_input)\n        assert all(\n            len(v) == batch_size for v in encoded_inputs.values()\n        ), \"Some items in the output dictionary have a different batch size than others.\"\n\n        if padding == \"longest\":\n            max_length = max(len(inputs) for inputs in required_input)\n            padding = \"max_length\"\n\n        batch_outputs = {}\n        for i in 
range(batch_size):\n            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())\n            outputs = self._pad(\n                inputs,\n                max_length=max_length,\n                padding=padding,\n                pad_to_multiple_of=pad_to_multiple_of,\n                return_attention_mask=return_attention_mask, )\n\n            for key, value in outputs.items():\n                if key not in batch_outputs:\n                    batch_outputs[key] = []\n                batch_outputs[key].append(value)\n\n        return encoded_inputs\n\n    def _pad(\n            self,\n            encoded_inputs,\n            max_length=None,\n            padding=\"do_not_pad\",\n            pad_to_multiple_of=None,\n            return_attention_mask=None, ) -> dict:\n        # Load from model defaults\n        if return_attention_mask is None:\n            return_attention_mask = \"attention_mask\" in self.model_input_names or \"attention_mask\" in encoded_inputs\n\n        required_input = encoded_inputs[self.model_input_names[0]]\n\n        if padding == \"longest\":\n            max_length = len(required_input)\n\n        if max_length is not None and pad_to_multiple_of is not None and (\n                max_length % pad_to_multiple_of != 0):\n            max_length = (\n                (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of\n\n        needs_to_be_padded = padding != \"do_not_pad\" and len(\n            required_input) != max_length\n\n        # Initialize attention mask if not present.\n        if return_attention_mask and \"attention_mask\" not in encoded_inputs:\n            encoded_inputs[\"attention_mask\"] = [1] * len(required_input)\n\n        if needs_to_be_padded:\n            difference = max_length - len(required_input)\n\n            if self.padding_side == \"right\":\n                if return_attention_mask:\n                    encoded_inputs[\"attention_mask\"] = encoded_inputs[\n                        
\"attention_mask\"] + [0] * difference\n                if \"token_type_ids\" in encoded_inputs:\n                    encoded_inputs[\"token_type_ids\"] = (\n                        encoded_inputs[\"token_type_ids\"] +\n                        [self.pad_token_type_id] * difference)\n                if \"special_tokens_mask\" in encoded_inputs:\n                    encoded_inputs[\"special_tokens_mask\"] = encoded_inputs[\n                        \"special_tokens_mask\"] + [1] * difference\n                if \"offset_mapping\" in encoded_inputs:\n                    encoded_inputs[\"offset_mapping\"] = encoded_inputs[\n                        \"offset_mapping\"] + [(0, 0)] * difference\n                if \"position_ids\" in encoded_inputs:\n                    encoded_inputs[\"position_ids\"] = encoded_inputs[\n                        \"position_ids\"] + [0] * difference\n                encoded_inputs[self.model_input_names[\n                    0]] = required_input + [self.pad_token_id] * difference\n            elif self.padding_side == \"left\":\n                if return_attention_mask:\n                    encoded_inputs[\"attention_mask\"] = [\n                        0\n                    ] * difference + encoded_inputs[\"attention_mask\"]\n                if \"token_type_ids\" in encoded_inputs:\n                    encoded_inputs[\"token_type_ids\"] = [\n                        self.pad_token_type_id\n                    ] * difference + encoded_inputs[\"token_type_ids\"]\n                if \"special_tokens_mask\" in encoded_inputs:\n                    encoded_inputs[\"special_tokens_mask\"] = [\n                        1\n                    ] * difference + encoded_inputs[\"special_tokens_mask\"]\n                if \"offset_mapping\" in encoded_inputs:\n                    encoded_inputs[\"offset_mapping\"] = [\n                        (0, 0)\n                    ] * difference + encoded_inputs[\"offset_mapping\"]\n                if 
\"position_ids\" in encoded_inputs:\n                    encoded_inputs[\"position_ids\"] = [\n                        0\n                    ] * difference + encoded_inputs[\"position_ids\"]\n                encoded_inputs[self.model_input_names[\n                    0]] = [self.pad_token_id] * difference + required_input\n            else:\n                raise ValueError(\"Invalid padding strategy:\" + str(\n                    self.padding_side))\n\n        return encoded_inputs\n\n    def __len__(self):\n        return len(self.encoder) + len(self.special_tokens)\n\n    def set_special_tokens(self, special_tokens):\n        \"\"\" Add a list of additional tokens to the encoder.\n            The additional tokens are indexed starting from the last index of the\n            current vocabulary in the order of the `special_tokens` list.\n        \"\"\"\n        if not special_tokens:\n            self.special_tokens = {}\n            self.special_tokens_decoder = {}\n            return\n        self.special_tokens = dict((tok, len(self.encoder) + i)\n                                   for i, tok in enumerate(special_tokens))\n        self.special_tokens_decoder = {\n            v: k\n            for k, v in self.special_tokens.items()\n        }\n        logger.info(\"Special tokens {}\".format(self.special_tokens))\n\n    def bpe(self, token):\n        if token in self.cache:\n            return self.cache[token]\n        word = tuple(token)\n        pairs = get_pairs(word)\n\n        if not pairs:\n            return token\n\n        while True:\n            bigram = min(\n                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))\n            if bigram not in self.bpe_ranks:\n                break\n            first, second = bigram\n            new_word = []\n            i = 0\n            while i < len(word):\n                try:\n                    j = word.index(first, i)\n                    new_word.extend(word[i:j])\n              
      i = j\n                except BaseException:\n                    new_word.extend(word[i:])\n                    break\n\n                if word[i] == first and i < len(word) - 1 and word[\n                        i + 1] == second:\n                    new_word.append(first + second)\n                    i += 2\n                else:\n                    new_word.append(word[i])\n                    i += 1\n            new_word = tuple(new_word)\n            word = new_word\n            if len(word) == 1:\n                break\n            else:\n                pairs = get_pairs(word)\n        word = ' '.join(word)\n        self.cache[token] = word\n        return word\n\n    def tokenize(self, text):\n        \"\"\" Tokenize a string. \"\"\"\n        bpe_tokens = []\n        for token in re.findall(self.pat, text):\n            if sys.version_info[0] == 2:\n                token = ''.join(self.byte_encoder[ord(b)] for b in token)\n            else:\n                token = ''.join(self.byte_encoder[b]\n                                for b in token.encode('utf-8'))\n            bpe_tokens.extend(\n                bpe_token for bpe_token in self.bpe(token).split(' '))\n        return bpe_tokens\n\n    def convert_tokens_to_ids(self, tokens):\n        \"\"\" Converts a sequence of tokens into ids using the vocab. 
\"\"\"\n        ids = []\n        if isinstance(tokens, str) or (sys.version_info[0] == 2 and\n                                       isinstance(tokens, unicode)):\n            if tokens in self.special_tokens:\n                return self.special_tokens[tokens]\n            else:\n                return self.encoder.get(tokens, 0)\n        for token in tokens:\n            if token in self.special_tokens:\n                ids.append(self.special_tokens[token])\n            else:\n                ids.append(self.encoder.get(token, 0))\n        if len(ids) > self.max_len:\n            warnings.warn(\n                \"Token indices sequence length is longer than the specified maximum \"\n                \" sequence length for this OpenAI GPT model ({} > {}). Running this\"\n                \" sequence through the model will result in indexing errors\".\n                format(len(ids), self.max_len))\n        return ids\n\n    def convert_ids_to_string(self, ids):\n        \"\"\"\n        Converts a single index or a sequence of indices to texts.\n        Args:\n            ids (int|List[int]):\n                The token id (or token ids) to be converted to text.\n        Returns:\n            str: The decoded text.\n        Example:\n            .. 
code-block::\n                from paddlenlp.transformers import GPTTokenizer\n                tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en')\n                print(tokenizer.convert_ids_to_string(tokenizer.convert_ids_to_string([14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930]))\n                # 'Welcome to use PaddlePaddle and PaddleNLP'\n        \"\"\"\n\n        text = ''.join([self.decoder[id] for id in ids])\n        text = bytearray([self.byte_decoder[c] for c in text]).decode(\n            'utf-8', errors=self.errors)\n        return text\n\n    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):\n        \"\"\"Converts a sequence of ids in BPE tokens using the vocab.\"\"\"\n        tokens = []\n        for i in ids:\n            if i in self.special_tokens_decoder:\n                if not skip_special_tokens:\n                    tokens.append(self.special_tokens_decoder[i])\n            else:\n                tokens.append(self.decoder[i])\n        return tokens\n\n    def encode(self, text):\n        return self.convert_tokens_to_ids(self.tokenize(text))\n\n    def decode(self, tokens):\n        text = ''.join([\n            self.decoder[token] if token in self.decoder.keys() else ''\n            for token in tokens\n        ])\n        text = bytearray([self.byte_decoder[c] for c in text]).decode(\n            'utf-8', errors=self.errors)\n        return text\n\n    def save_vocabulary(self, vocab_path):\n        \"\"\"Save the tokenizer vocabulary and merge files to a directory.\"\"\"\n        if not os.path.isdir(vocab_path):\n            logger.error(\"Vocabulary path ({}) should be a directory\".format(\n                vocab_path))\n            return\n        vocab_file = os.path.join(vocab_path, VOCAB_NAME)\n        merge_file = os.path.join(vocab_path, MERGES_NAME)\n        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)\n\n        with open(vocab_file, 'w', encoding='utf-8') as 
f:\n            f.write(json.dumps(self.encoder, ensure_ascii=False))\n\n        index = 0\n        with open(merge_file, \"w\", encoding=\"utf-8\") as writer:\n            writer.write(u'#version: 0.2\\n')\n            for bpe_tokens, token_index in sorted(\n                    self.bpe_ranks.items(), key=lambda kv: kv[1]):\n                if index != token_index:\n                    warnings.warn(\n                        \"Saving vocabulary to {}: BPE merge indices are not consecutive.\"\n                        \" Please check that the tokenizer is not corrupted!\".\n                        format(merge_file))\n                    index = token_index\n                writer.write(' '.join(bpe_tokens) + u'\\n')\n                index += 1\n\n        index = len(self.encoder)\n        with open(special_tokens_file, 'w', encoding='utf-8') as writer:\n            for token, token_index in sorted(\n                    self.special_tokens.items(), key=lambda kv: kv[1]):\n                if index != token_index:\n                    warnings.warn(\n                        \"Saving special tokens vocabulary to {}: BPE indices are not consecutive.\"\n                        \" Please check that the tokenizer is not corrupted!\".\n                        format(special_tokens_file))\n                    index = token_index\n                writer.write(token + u'\\n')\n                index += 1\n\n        return vocab_file, merge_file, special_tokens_file\n\n    @property\n    def vocab_size(self):\n        return len(self.encoder)\n\n    @property\n    def vocab(self):\n        return self.encoder\n\n    @property\n    def inv_vocab(self):\n        return self.decoder\n\n    @property\n    def eos_token_id(self):\n        return self.eod_id\n"
  },
  {
    "path": "ppfleetx/data/tokenizers/t5_tokenization_utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2020 The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\n Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see\n tokenization_utils_fast.py\n\"\"\"\nimport bisect\nimport itertools\nimport re\nimport unicodedata\nfrom collections import OrderedDict\nfrom typing import Any, Dict, List, Optional, Tuple, Union, overload\n\nfrom .tokenization_utils_base import (\n    ENCODE_KWARGS_DOCSTRING,\n    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,\n    INIT_TOKENIZER_DOCSTRING,\n    AddedToken,\n    BatchEncoding,\n    EncodedInput,\n    EncodedInputPair,\n    PreTokenizedInput,\n    PreTokenizedInputPair,\n    PreTrainedTokenizerBase,\n    TextInput,\n    TextInputPair,\n    TruncationStrategy, )\nfrom .utils import PaddingStrategy, TensorType, add_end_docstrings, logging\n\nlogger = logging.get_logger(__name__)\n\n# Slow tokenizers are saved in a vocabulary plus three separated files\nSPECIAL_TOKENS_MAP_FILE = \"special_tokens_map.json\"\nADDED_TOKENS_FILE = \"added_tokens.json\"\nTOKENIZER_CONFIG_FILE = \"tokenizer_config.json\"\n\n\nclass Trie:\n    \"\"\"\n    Trie in Python. Creates a Trie out of a list of words. 
The trie is used to split on `added_tokens` in one pass\n    Loose reference https://en.wikipedia.org/wiki/Trie\n    \"\"\"\n\n    def __init__(self):\n        self.data = {}\n\n    def add(self, word):\n        \"\"\"\n        Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.\n        The special key `\"\"` is used to represent termination.\n\n        This function is idempotent, adding twice the same word will leave the trie unchanged\n\n        Example:\n\n        ```python\n        >>> trie = Trie()\n        >>> trie.add(\"Hello 友達\")\n        >>> trie.data\n        {\"H\": {\"e\": {\"l\": {\"l\": {\"o\": {\" \": {\"友\": {\"達\": {\"\": 1}}}}}}}}}\n\n        >>> trie.add(\"Hello\")\n        >>> trie.data\n        {\"H\": {\"e\": {\"l\": {\"l\": {\"o\": {\"\": 1, \" \": {\"友\": {\"達\": {\"\": 1}}}}}}}}}\n        ```\n        \"\"\"\n        if not word:\n            # Prevent empty string\n            return\n        ref = self.data\n        for char in word:\n            ref[char] = char in ref and ref[char] or {}\n            ref = ref[char]\n        ref[\"\"] = 1\n\n    def split(self, text):\n        \"\"\"\n        Will look for the words added to the trie within `text`. 
Output is the original string splitted along the\n        boundaries of the words found.\n\n        This trie will match the longest possible word first !\n\n        Example:\n\n        ```python\n        >>> trie = Trie()\n        >>> trie.split(\"[CLS] This is a extra_id_100\")\n        [\"[CLS] This is a extra_id_100\"]\n\n        >>> trie.add(\"[CLS]\")\n        >>> trie.add(\"extra_id_1\")\n        >>> trie.add(\"extra_id_100\")\n        >>> trie.split(\"[CLS] This is a extra_id_100\")\n        [\"[CLS]\", \" This is a \", \"extra_id_100\"]\n        ```\n        \"\"\"\n        # indexes are counted left of the chars index.\n        # \"hello\", index 0, is left of h, index 1 is between h and e.\n        # index 5 is right of the \"o\".\n\n        # States are going to capture every possible start (indexes as above)\n        # as keys, and have as values, a pointer to the position in the trie\n        # where we're at. This is a partial match for now.\n        # This enables to keep track of multiple matches while we're iterating\n        # the string\n        # If the trie contains, \"blowing\", and \"lower\" and we encounter the\n        # string \"blower\", we need to split into [\"b\", \"lower\"].\n        # This is where we need to keep track of multiple possible starts.\n        states = OrderedDict()\n\n        # This will contain every indices where we need\n        # to cut.\n        # We force to cut at offset 0 and len(text) (added later)\n        offsets = [0]\n\n        # This is used by the lookahead which needs to skip over\n        # some text where the full match exceeded the place in the initial\n        # for loop\n        skip = 0\n        # Main loop, Giving this algorithm O(n) complexity\n        for current, current_char in enumerate(text):\n            if skip and current < skip:\n                # Prevents the lookahead for matching twice\n                # like extra_id_100 and id_100\n                continue\n\n            # This 
will track every state\n            # that stop matching, we need to stop tracking them.\n            # If we look at \"lowball\", we're going to match \"l\" (add it to states), \"o\", \"w\", then\n            # fail on \"b\", we need to remove 0 from the valid states.\n            to_remove = set()\n            # Whenever we found a match, we need to drop everything\n            # this is a greedy algorithm, it will match on the first found token\n            reset = False\n\n            # In this case, we already have partial matches (But unfinished)\n            for start, trie_pointer in states.items():\n                if \"\" in trie_pointer:\n                    # This is a final match, we need to reset and\n                    # store the results in `offsets`.\n\n                    # Lookahead to match longest first\n                    # Important in case of extra_id_1 vs extra_id_100\n                    # Here we are also actively looking for other earlier partial\n                    # matches\n                    # \"[CLS]\", \"L\", we need to match CLS even if L is special\n                    for lookstart, looktrie_pointer in states.items():\n                        if lookstart > start:\n                            # This partial match is later, we can stop looking\n                            break\n                        elif lookstart < start:\n                            # This partial match is earlier, the trie pointer\n                            # was already updated, so index is + 1\n                            lookahead_index = current + 1\n                            end = current + 1\n                        else:\n                            # Here lookstart == start and\n                            #      looktrie_pointer == trie_pointer\n                            # It wasn't updated yet so indices are current ones\n                            lookahead_index = current\n                            end = current\n                    
    next_char = text[\n                            lookahead_index] if lookahead_index < len(\n                                text) else None\n                        if \"\" in looktrie_pointer:\n                            start = lookstart\n                            end = lookahead_index\n                            skip = lookahead_index\n\n                        while next_char in looktrie_pointer:\n                            looktrie_pointer = looktrie_pointer[next_char]\n                            lookahead_index += 1\n                            if \"\" in looktrie_pointer:\n                                start = lookstart\n                                end = lookahead_index\n                                skip = lookahead_index\n\n                            if lookahead_index == len(text):\n                                # End of string\n                                break\n                            next_char = text[lookahead_index]\n                        # End lookahead\n\n                        # Storing and resetting\n                    offsets.append(start)\n                    offsets.append(end)\n                    reset = True\n                    break\n                elif current_char in trie_pointer:\n                    # The current character being looked at has a match within the trie\n                    # update the pointer (it will be stored back into states later).\n                    trie_pointer = trie_pointer[current_char]\n\n                    # Storing back the new pointer into the states.\n                    # Partial matches got longer by one.\n                    states[start] = trie_pointer\n                else:\n                    # The new character has not match in the trie, we need\n                    # to stop keeping track of this partial match.\n                    # We can't do it directly within the loop because of how\n                    # python iteration works\n                    
to_remove.add(start)\n\n            # Either clearing the full start (we found a real match)\n            # Or clearing only the partial matches that didn't work.\n            if reset:\n                states = {}\n            else:\n                for start in to_remove:\n                    del states[start]\n\n            # If this character is a starting character within the trie\n            # start keeping track of this partial match.\n            if current >= skip and current_char in self.data:\n                states[current] = self.data[current_char]\n\n        # We have a cut at the end with states.\n        for start, trie_pointer in states.items():\n            if \"\" in trie_pointer:\n                # This is a final match, we need to reset and\n                # store the results in `offsets`.\n                end = len(text)\n                offsets.append(start)\n                offsets.append(end)\n                # Longest cut is always the one with lower start so the first\n                # item so we need to break.\n                break\n\n        return self.cut_text(text, offsets)\n\n    def cut_text(self, text, offsets):\n        # We have all the offsets now, we just need to do the actual splitting.\n        # We need to eventually add the first part of the string and the eventual\n        # last part.\n        offsets.append(len(text))\n        tokens = []\n        start = 0\n        for end in offsets:\n            if start > end:\n                logger.error(\n                    \"There was a bug in Trie algorithm in tokenization. Attempting to recover. 
Please report it\"\n                    \" anyway.\")\n                continue\n            elif start == end:\n                # This might happen if there's a match at index 0\n                # we're also preventing zero-width cuts in case of two\n                # consecutive matches\n                continue\n            tokens.append(text[start:end])\n            start = end\n\n        return tokens\n\n\ndef _is_whitespace(char):\n    \"\"\"Checks whether `char` is a whitespace character.\"\"\"\n    # \\t, \\n, and \\r are technically control characters but we treat them\n    # as whitespace since they are generally considered as such.\n    if char == \" \" or char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n        return True\n    cat = unicodedata.category(char)\n    if cat == \"Zs\":\n        return True\n    return False\n\n\ndef _is_control(char):\n    \"\"\"Checks whether `char` is a control character.\"\"\"\n    # These are technically control characters but we count them as whitespace\n    # characters.\n    if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n        return False\n    cat = unicodedata.category(char)\n    if cat.startswith(\"C\"):\n        return True\n    return False\n\n\ndef _is_punctuation(char):\n    \"\"\"Checks whether `char` is a punctuation character.\"\"\"\n    cp = ord(char)\n    # We treat all non-letter/number ASCII as punctuation.\n    # Characters such as \"^\", \"$\", and \"`\" are not in the Unicode\n    # Punctuation class but we treat them as punctuation anyways, for\n    # consistency.\n    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (\n            cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):\n        return True\n    cat = unicodedata.category(char)\n    if cat.startswith(\"P\"):\n        return True\n    return False\n\n\ndef _is_end_of_word(text):\n    \"\"\"Checks whether the last character in text is one of a punctuation, control or whitespace character.\"\"\"\n    
last_char = text[-1]\n    return bool(\n        _is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(\n            last_char))\n\n\ndef _is_start_of_word(text):\n    \"\"\"Checks whether the first character in text is one of a punctuation, control or whitespace character.\"\"\"\n    first_char = text[0]\n    return bool(\n        _is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(\n            first_char))\n\n\ndef _insert_one_token_to_ordered_list(token_list, new_token):\n    \"\"\"\n    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.\n    \"\"\"\n    insertion_idx = bisect.bisect_left(token_list, new_token)\n    # Checks if new_token is already in the ordered token_list\n    if insertion_idx < len(token_list) and token_list[\n            insertion_idx] == new_token:\n        # new_token is in token_list, don't add\n        return\n    else:\n        token_list.insert(insertion_idx, new_token)\n\n\n@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)\nclass PreTrainedTokenizer(PreTrainedTokenizerBase):\n    \"\"\"\n    Base class for all slow tokenizers.\n\n    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].\n\n    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading\n    pretrained tokenizers as well as adding tokens to the vocabulary.\n\n    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the\n    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).\n    \"\"\"\n\n    def __init__(self, **kwargs):\n        super().__init__(**kwargs)\n\n        # Added tokens - We store this for both slow and fast tokenizers\n        # until the serialization of Fast tokenizers is updated\n        self.added_tokens_encoder = {}\n        self.added_tokens_decoder = {}\n        
self.unique_no_split_tokens = []\n        self.tokens_trie = Trie()\n\n        self._decode_use_source_tokenizer = False\n\n    @property\n    def is_fast(self):\n        return False\n\n    @property\n    def vocab_size(self):\n        \"\"\"\n        `int`: Size of the base vocabulary (without the added tokens).\n        \"\"\"\n        raise NotImplementedError\n\n    def get_added_vocab(self):\n        \"\"\"\n        Returns the added tokens in the vocabulary as a dictionary of token to index.\n\n        Returns:\n            `Dict[str, int]`: The added tokens.\n        \"\"\"\n        return self.added_tokens_encoder\n\n    def __len__(self):\n        \"\"\"\n        Size of the full vocabulary with the added tokens.\n        \"\"\"\n        return self.vocab_size + len(self.added_tokens_encoder)\n\n    def _add_tokens(self, new_tokens, special_tokens=False):\n        \"\"\"\n        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to\n        it with indices starting from length of the current vocabulary.\n\n        Args:\n            new_tokens (`List[str]`or `List[tokenizers.AddedToken]`):\n                Token(s) to add in vocabulary. 
A token is only added if it's not already in the vocabulary (tested by\n                checking if the tokenizer assign the index of the `unk_token` to them).\n            special_tokens (`bool`, *optional*, defaults to `False`):\n                Whether or not the tokens should be added as special tokens.\n\n        Returns:\n            `int`: The number of tokens actually added to the vocabulary.\n\n        Examples:\n\n        ```python\n        # Let's see how to increase the vocabulary of Bert model and tokenizer\n        tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n        model = BertModel.from_pretrained(\"bert-base-uncased\")\n\n        num_added_toks = tokenizer.add_tokens([\"new_tok1\", \"my_new-tok2\"])\n        print(\"We have added\", num_added_toks, \"tokens\")\n        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.\n        model.resize_token_embeddings(len(tokenizer))\n        ```\"\"\"\n        new_tokens = [str(tok) for tok in new_tokens]\n\n        tokens_to_add = []\n        for token in new_tokens:\n            if not isinstance(token, str):\n                raise TypeError(\n                    \"Token {token} is not a string but a {type(token)}.\")\n            if not special_tokens and hasattr(\n                    self, \"do_lower_case\") and self.do_lower_case:\n                token = token.lower()\n            if (token != self.unk_token and self.convert_tokens_to_ids(token)\n                    == self.convert_tokens_to_ids(self.unk_token) and\n                    token not in tokens_to_add):\n                tokens_to_add.append(token)\n                #if self.verbose:\n                #    logger.info(f\"Adding {token} to the vocabulary\")\n\n        added_tok_encoder = dict((tok, len(self) + i)\n                                 for i, tok in enumerate(tokens_to_add))\n        added_tok_decoder = {v: k for k, v in 
added_tok_encoder.items()}\n        self.added_tokens_encoder.update(added_tok_encoder)\n        self.added_tokens_decoder.update(added_tok_decoder)\n\n        # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)\n        if special_tokens:\n            if len(new_tokens) == 1:\n                _insert_one_token_to_ordered_list(self.unique_no_split_tokens,\n                                                  new_tokens[0])\n            else:\n                self.unique_no_split_tokens = sorted(\n                    set(self.unique_no_split_tokens).union(set(new_tokens)))\n        else:\n            # Or on the newly added tokens\n            if len(tokens_to_add) == 1:\n                _insert_one_token_to_ordered_list(self.unique_no_split_tokens,\n                                                  tokens_to_add[0])\n            else:\n                self.unique_no_split_tokens = sorted(\n                    set(self.unique_no_split_tokens).union(\n                        set(tokens_to_add)))\n        self._create_trie(self.unique_no_split_tokens)\n\n        return len(tokens_to_add)\n\n    def _create_trie(self, unique_no_split_tokens):\n        trie = Trie()\n        for token in unique_no_split_tokens:\n            if hasattr(\n                    self, \"do_lower_case\"\n            ) and self.do_lower_case and token not in self.all_special_tokens:\n                trie.add(token.lower())\n            else:\n                trie.add(token)\n        self.tokens_trie = trie\n\n    def num_special_tokens_to_add(self, pair):\n        \"\"\"\n        Returns the number of added tokens when encoding a sequence with special tokens.\n\n        <Tip>\n\n        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. 
Do not put\n        this inside your training loop.\n\n        </Tip>\n\n        Args:\n            pair (`bool`, *optional*, defaults to `False`):\n                Whether the number of added tokens should be computed in the case of a sequence pair or a single\n                sequence.\n\n        Returns:\n            `int`: Number of special tokens added to sequences.\n        \"\"\"\n        token_ids_0 = []\n        token_ids_1 = []\n        return len(\n            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1\n                                                  if pair else None))\n\n    def tokenize(self, text, **kwargs):\n        \"\"\"\n        Converts a string in a sequence of tokens, using the tokenizer.\n\n        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies\n        (BPE/SentencePieces/WordPieces). Takes care of added tokens.\n\n        Args:\n            text (`str`):\n                The sequence to be encoded.\n            **kwargs (additional keyword arguments):\n                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.\n\n        Returns:\n            `List[str]`: The list of tokens.\n        \"\"\"\n        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors\n        all_special_tokens_extended = dict(\n            (str(t), t) for t in self.all_special_tokens_extended\n            if isinstance(t, AddedToken))\n\n        text, kwargs = self.prepare_for_tokenization(text, **kwargs)\n\n        if kwargs:\n            logger.warning(\"Keyword arguments {kwargs} not recognized.\")\n\n        # TODO: should this be in the base class?\n        if hasattr(self, \"do_lower_case\") and self.do_lower_case:\n            # convert non-special tokens to lowercase\n            escaped_special_toks = [\n                re.escape(s_tok)\n                for s_tok in (self.unique_no_split_tokens +\n                          
    self.all_special_tokens)\n            ]\n            pattern = r\"(\" + r\"|\".join(escaped_special_toks) + r\")|\" + r\"(.+?)\"\n            text = re.sub(pattern,\n                          lambda m: m.groups()[0] or m.groups()[1].lower(),\n                          text)\n\n        no_split_token = set(self.unique_no_split_tokens)\n        tokens = self.tokens_trie.split(text)\n        # [\"This is something\", \"<special_token_1>\", \"  else\"]\n        for i, token in enumerate(tokens):\n            if token in no_split_token:\n                tok_extended = all_special_tokens_extended.get(token, None)\n                left = tokens[i - 1] if i > 0 else None\n                right = tokens[i + 1] if i < len(tokens) - 1 else None\n                if isinstance(tok_extended, AddedToken):\n                    if tok_extended.rstrip and right:\n                        # A bit counter-intuitive but we strip the left of the string\n                        # since tok_extended.rstrip means the special token is eating all white spaces on its right\n                        tokens[i + 1] = right.lstrip()\n                    # Strip white spaces on the left\n                    if tok_extended.lstrip and left:\n                        tokens[i - 1] = left.rstrip()  # Opposite here\n                else:\n                    # We strip left and right by default\n                    if right:\n                        tokens[i + 1] = right.lstrip()\n                    if left:\n                        tokens[i - 1] = left.rstrip()\n        # [\"This is something\", \"<special_token_1>\", \"else\"]\n        tokenized_text = []\n        for token in tokens:\n            # Need to skip eventual empty (fully stripped) tokens\n            if not token:\n                continue\n            if token in no_split_token:\n                tokenized_text.append(token)\n            else:\n                tokenized_text.extend(self._tokenize(token))\n        # [\"This\", \" is\", 
\" something\", \"<special_token_1>\", \"else\"]\n        return tokenized_text\n\n    def _tokenize(self, text, **kwargs):\n        \"\"\"\n        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based\n        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).\n\n        Do NOT take care of added tokens.\n        \"\"\"\n        raise NotImplementedError\n\n    def convert_tokens_to_ids(self, tokens):\n        \"\"\"\n        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the\n        vocabulary.\n\n        Args:\n            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).\n\n        Returns:\n            `int` or `List[int]`: The token id or list of token ids.\n        \"\"\"\n        if tokens is None:\n            return None\n\n        if isinstance(tokens, str):\n            return self._convert_token_to_id_with_added_voc(tokens)\n\n        ids = []\n        for token in tokens:\n            ids.append(self._convert_token_to_id_with_added_voc(token))\n        return ids\n\n    def _convert_token_to_id_with_added_voc(self, token):\n        if token is None:\n            return None\n\n        if token in self.added_tokens_encoder:\n            return self.added_tokens_encoder[token]\n        return self._convert_token_to_id(token)\n\n    def _convert_token_to_id(self, token):\n        raise NotImplementedError\n\n    def _encode_plus(self,\n                     text,\n                     text_pair=None,\n                     add_special_tokens=True,\n                     padding_strategy=PaddingStrategy.DO_NOT_PAD,\n                     truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,\n                     max_length=None,\n                     stride=0,\n                     is_split_into_words=False,\n                     pad_to_multiple_of=None,\n                     
return_tensors=None,\n                     return_token_type_ids=None,\n                     return_attention_mask=None,\n                     return_overflowing_tokens=False,\n                     return_special_tokens_mask=False,\n                     return_offsets_mapping=False,\n                     return_length=False,\n                     verbose=True,\n                     **kwargs):\n        def get_input_ids(text):\n            if isinstance(text, str):\n                tokens = self.tokenize(text, **kwargs)\n                return self.convert_tokens_to_ids(tokens)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], str):\n                if is_split_into_words:\n                    tokens = list(\n                        itertools.chain(*(self.tokenize(\n                            t, is_split_into_words=True, **kwargs)\n                                          for t in text)))\n                    return self.convert_tokens_to_ids(tokens)\n                else:\n                    return self.convert_tokens_to_ids(text)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], int):\n                return text\n            else:\n                if is_split_into_words:\n                    raise ValueError(\n                        \"Input {text} is not valid. Should be a string or a list/tuple of strings when\"\n                        \" `is_split_into_words=True`.\")\n                else:\n                    raise ValueError(\n                        \"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of\"\n                        \" integers.\")\n\n        if return_offsets_mapping:\n            raise NotImplementedError(\n                \"return_offset_mapping is not available when using Python tokenizers. 
\"\n                \"To use this feature, change your tokenizer to one deriving from \"\n                \"transformers.PreTrainedTokenizerFast. \"\n                \"More information on available tokenizers at \"\n                \"https://github.com/huggingface/transformers/pull/2674\")\n\n        first_ids = get_input_ids(text)\n        second_ids = get_input_ids(\n            text_pair) if text_pair is not None else None\n\n        return self.prepare_for_model(\n            first_ids,\n            pair_ids=second_ids,\n            add_special_tokens=add_special_tokens,\n            padding=padding_strategy.value,\n            truncation=truncation_strategy.value,\n            max_length=max_length,\n            stride=stride,\n            pad_to_multiple_of=pad_to_multiple_of,\n            return_tensors=return_tensors,\n            prepend_batch_axis=True,\n            return_attention_mask=return_attention_mask,\n            return_token_type_ids=return_token_type_ids,\n            return_overflowing_tokens=return_overflowing_tokens,\n            return_special_tokens_mask=return_special_tokens_mask,\n            return_length=return_length,\n            verbose=verbose, )\n\n    def _batch_encode_plus(\n            self,\n            batch_text_or_text_pairs,\n            add_special_tokens=True,\n            padding_strategy=PaddingStrategy.DO_NOT_PAD,\n            truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,\n            max_length=None,\n            stride=0,\n            is_split_into_words=False,\n            pad_to_multiple_of=None,\n            return_tensors=None,\n            return_token_type_ids=None,\n            return_attention_mask=None,\n            return_overflowing_tokens=False,\n            return_special_tokens_mask=False,\n            return_offsets_mapping=False,\n            return_length=False,\n            verbose=True,\n            **kwargs):\n        def get_input_ids(text):\n            if isinstance(text, str):\n    
            tokens = self.tokenize(text, **kwargs)\n                return self.convert_tokens_to_ids(tokens)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], str):\n                if is_split_into_words:\n                    tokens = list(\n                        itertools.chain(*(self.tokenize(\n                            t, is_split_into_words=True, **kwargs)\n                                          for t in text)))\n                    return self.convert_tokens_to_ids(tokens)\n                else:\n                    return self.convert_tokens_to_ids(text)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], int):\n                return text\n            else:\n                raise ValueError(\n                    \"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.\"\n                )\n\n        if return_offsets_mapping:\n            raise NotImplementedError(\n                \"return_offset_mapping is not available when using Python tokenizers. 
\"\n                \"To use this feature, change your tokenizer to one deriving from \"\n                \"transformers.PreTrainedTokenizerFast.\")\n\n        input_ids = []\n        for ids_or_pair_ids in batch_text_or_text_pairs:\n            if not isinstance(ids_or_pair_ids, (list, tuple)):\n                ids, pair_ids = ids_or_pair_ids, None\n            elif is_split_into_words and not isinstance(ids_or_pair_ids[0],\n                                                        (list, tuple)):\n                ids, pair_ids = ids_or_pair_ids, None\n            else:\n                ids, pair_ids = ids_or_pair_ids\n\n            first_ids = get_input_ids(ids)\n            second_ids = get_input_ids(\n                pair_ids) if pair_ids is not None else None\n            input_ids.append((first_ids, second_ids))\n\n        batch_outputs = self._batch_prepare_for_model(\n            input_ids,\n            add_special_tokens=add_special_tokens,\n            padding_strategy=padding_strategy,\n            truncation_strategy=truncation_strategy,\n            max_length=max_length,\n            stride=stride,\n            pad_to_multiple_of=pad_to_multiple_of,\n            return_attention_mask=return_attention_mask,\n            return_token_type_ids=return_token_type_ids,\n            return_overflowing_tokens=return_overflowing_tokens,\n            return_special_tokens_mask=return_special_tokens_mask,\n            return_length=return_length,\n            return_tensors=return_tensors,\n            verbose=verbose, )\n\n        return BatchEncoding(batch_outputs)\n\n    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING,\n                        ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)\n    def _batch_prepare_for_model(\n            self,\n            batch_ids_pairs,\n            add_special_tokens=True,\n            padding_strategy=PaddingStrategy.DO_NOT_PAD,\n            truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,\n            max_length=None,\n       
     stride=0,\n            pad_to_multiple_of=None,\n            return_tensors=None,\n            return_token_type_ids=None,\n            return_attention_mask=None,\n            return_overflowing_tokens=False,\n            return_special_tokens_mask=False,\n            return_length=False,\n            verbose=True, ):\n        \"\"\"\n        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It\n        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and\n        manages a moving window (with user defined stride) for overflowing tokens\n\n        Args:\n            batch_ids_pairs: list of tokenized input ids or input ids pairs\n        \"\"\"\n\n        batch_outputs = {}\n        for first_ids, second_ids in batch_ids_pairs:\n            outputs = self.prepare_for_model(\n                first_ids,\n                second_ids,\n                add_special_tokens=add_special_tokens,\n                padding=PaddingStrategy.DO_NOT_PAD.\n                value,  # we pad in batch afterward\n                truncation=truncation_strategy.value,\n                max_length=max_length,\n                stride=stride,\n                pad_to_multiple_of=None,  # we pad in batch afterward\n                return_attention_mask=False,  # we pad in batch afterward\n                return_token_type_ids=return_token_type_ids,\n                return_overflowing_tokens=return_overflowing_tokens,\n                return_special_tokens_mask=return_special_tokens_mask,\n                return_length=return_length,\n                return_tensors=None,  # We convert the whole batch to tensors at the end\n                prepend_batch_axis=False,\n                verbose=verbose, )\n\n            for key, value in outputs.items():\n                if key not in batch_outputs:\n                    batch_outputs[key] = []\n                
batch_outputs[key].append(value)\n\n        batch_outputs = self.pad(\n            batch_outputs,\n            padding=padding_strategy.value,\n            max_length=max_length,\n            pad_to_multiple_of=pad_to_multiple_of,\n            return_attention_mask=return_attention_mask, )\n\n        batch_outputs = BatchEncoding(\n            batch_outputs, tensor_type=return_tensors)\n\n        return batch_outputs\n\n    def prepare_for_tokenization(self,\n                                 text,\n                                 is_split_into_words=False,\n                                 **kwargs):\n        \"\"\"\n        Performs any necessary transformations before tokenization.\n\n        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the\n        `kwargs` at the end of the encoding process to be sure all the arguments have been used.\n\n        Args:\n            text (`str`):\n                The text to prepare.\n            is_split_into_words (`bool`, *optional*, defaults to `False`):\n                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the\n                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)\n                which it will tokenize. This is useful for NER or token classification.\n            kwargs:\n                Keyword arguments to use for the tokenization.\n\n        Returns:\n            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.\n        \"\"\"\n        return (text, kwargs)\n\n    def get_special_tokens_mask(self,\n                                token_ids_0,\n                                token_ids_1=None,\n                                already_has_special_tokens=False) -> List[int]:\n        \"\"\"\n        Retrieves sequence ids from a token list that has no special tokens added. 
This method is called when adding\n        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of ids of the first sequence.\n            token_ids_1 (`List[int]`, *optional*):\n                List of ids of the second sequence.\n            already_has_special_tokens (`bool`, *optional*, defaults to `False`):\n                Whether or not the token list is already formatted with special tokens for the model.\n\n        Returns:\n            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.\n        \"\"\"\n        if already_has_special_tokens:\n            if token_ids_1 is not None:\n                raise ValueError(\n                    \"You should not supply a second sequence if the provided sequence of \"\n                    \"ids is already formatted with special tokens for the model.\"\n                )\n\n            return super().get_special_tokens_mask(\n                token_ids_0=token_ids_0,\n                token_ids_1=token_ids_1,\n                already_has_special_tokens=True)\n        return [0] * ((len(token_ids_1)\n                       if token_ids_1 else 0) + len(token_ids_0))\n\n    @overload\n    def convert_ids_to_tokens(self, ids: int,\n                              skip_special_tokens: bool=False) -> str:\n        ...\n\n    @overload\n    def convert_ids_to_tokens(self,\n                              ids: List[int],\n                              skip_special_tokens: bool=False) -> List[str]:\n        ...\n\n    def convert_ids_to_tokens(\n            self, ids: Union[int, List[int]],\n            skip_special_tokens: bool=False) -> Union[str, List[str]]:\n        \"\"\"\n        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and\n        added tokens.\n\n        Args:\n            ids (`int` or `List[int]`):\n                
The token id (or token ids) to convert to tokens.\n            skip_special_tokens (`bool`, *optional*, defaults to `False`):\n                Whether or not to remove special tokens in the decoding.\n\n        Returns:\n            `str` or `List[str]`: The decoded token(s).\n        \"\"\"\n        if isinstance(ids, int):\n            if ids in self.added_tokens_decoder:\n                return self.added_tokens_decoder[ids]\n            else:\n                return self._convert_id_to_token(ids)\n        tokens = []\n        for index in ids:\n            index = int(index)\n            if skip_special_tokens and index in self.all_special_ids:\n                continue\n            if index in self.added_tokens_decoder:\n                tokens.append(self.added_tokens_decoder[index])\n            else:\n                tokens.append(self._convert_id_to_token(index))\n        return tokens\n\n    def _convert_id_to_token(self, index: int) -> str:\n        raise NotImplementedError\n\n    def convert_tokens_to_string(self, tokens: List[str]) -> str:\n        return \" \".join(tokens)\n\n    def _decode(self,\n                token_ids: List[int],\n                skip_special_tokens: bool=False,\n                clean_up_tokenization_spaces: bool=True,\n                spaces_between_special_tokens: bool=True,\n                **kwargs) -> str:\n        self._decode_use_source_tokenizer = kwargs.pop(\"use_source_tokenizer\",\n                                                       False)\n\n        filtered_tokens = self.convert_ids_to_tokens(\n            token_ids, skip_special_tokens=skip_special_tokens)\n\n        # To avoid mixing byte-level and unicode for byte-level BPT\n        # we need to build string separately for added tokens and byte-level tokens\n        # cf. 
https://github.com/huggingface/transformers/issues/1133\n        sub_texts = []\n        current_sub_text = []\n        for token in filtered_tokens:\n            if skip_special_tokens and token in self.all_special_ids:\n                continue\n            if token in self.added_tokens_encoder:\n                if current_sub_text:\n                    sub_texts.append(\n                        self.convert_tokens_to_string(current_sub_text))\n                    current_sub_text = []\n                sub_texts.append(token)\n            else:\n                current_sub_text.append(token)\n        if current_sub_text:\n            sub_texts.append(self.convert_tokens_to_string(current_sub_text))\n\n        if spaces_between_special_tokens:\n            text = \" \".join(sub_texts)\n        else:\n            text = \"\".join(sub_texts)\n\n        if clean_up_tokenization_spaces:\n            clean_text = self.clean_up_tokenization(text)\n            return clean_text\n        else:\n            return text\n"
  },
  {
    "path": "ppfleetx/data/tokenizers/t5_tokenizer.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Tokenization classes for Google T5.\"\"\"\n\nfrom __future__ import (absolute_import, division, print_function,\n                        unicode_literals)\n\nimport copy\nimport sys\nimport json\nimport logging\nimport warnings\nimport os\nimport regex as re\nfrom io import open\nfrom collections import OrderedDict\nfrom typing import Any, Dict, List, Optional, Tuple, Union, overload\n\nimport sentencepiece as spm\n\nfrom ppfleetx.utils.download import cached_path\nfrom ppfleetx.data.tokenizers.tokenization_utils_base import (\n    _LazyConfigMapping, AddedToken, TruncationStrategy, PaddingStrategy,\n    BatchEncoding, SpecialTokensMixin)\n\ntry:\n    from functools import lru_cache\nexcept ImportError:\n    # Just a dummy decorator to get the checks to run on python2\n    # because honestly I don't want to support a byte-level unicode BPE\n    # tokenizer on python 2 right now.\n    def lru_cache():\n        return lambda func: func\n\n\nfrom ppfleetx.utils.log import logger\n\nMAX_LENGTH = 256\n\nVOCAB_FILES_NAMES = {\"vocab_file\": \"spiece.model\"}\nMODEL_FILES_NAMES = {\"config_file\": \"config.json\"}\nCONFIG_MAPPING_NAMES = OrderedDict([(\"t5\", \"T5Config\")])\n\nCONFIG_MAPPING = 
_LazyConfigMapping(CONFIG_MAPPING_NAMES)\n\nPRETRAINED_VOCAB_FILES_MAP = {\n    \"vocab_file\": {\n        't5-11b': \"https://fleet.bj.bcebos.com/datasets/t5/spiece.model\",\n    }\n}\n\nPRETRAINED_MERGES_ARCHIVE_MAP = {\n    't5-11b': \"https://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt\",\n}\nPRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {\n    \"t5-small\": 512,\n    \"t5-base\": 512,\n    \"t5-large\": 512,\n    \"t5-3b\": 512,\n    \"t5-11b\": 512,\n}\n\n# Slow tokenizers used to be saved in three separated files\nDEFAULT_T5_NAME = \"projects/imagen/t5/t5-11b\"\nSPECIAL_TOKENS_MAP_FILE = \"special_tokens_map.json\"\nADDED_TOKENS_FILE = \"added_tokens.json\"\nTOKENIZER_CONFIG_FILE = \"tokenizer_config.json\"\n\n# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file\nFULL_TOKENIZER_FILE = \"tokenizer.json\"\n_re_tokenizer_file = re.compile(r\"tokenizer\\.(.*)\\.json\")\n\n\ndef get_t5_tokenizer(name=DEFAULT_T5_NAME):\n    tokenizer = T5Tokenizer.from_pretrained(name)\n    return tokenizer\n\n\ndef t5_tokenize(texts, tokenizer):\n    encoded = tokenizer.batch_encode_plus(\n        texts,\n        return_tensors=\"paddle\",\n        padding='longest',\n        max_length=MAX_LENGTH,\n        truncation=True)\n\n    input_ids = encoded.input_ids\n    attn_mask = encoded.attention_mask\n    return input_ids, attn_mask\n\n\nclass T5Tokenizer(SpecialTokensMixin):\n    \"\"\"\n    T5 tokenizer. 
\n    \"\"\"\n    vocab_files_names = VOCAB_FILES_NAMES\n    config_files_names = MODEL_FILES_NAMES\n    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP\n    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES\n    model_input_names = [\"input_ids\", \"attention_mask\"]\n    slow_tokenizer_class = None\n    padding_side = \"right\"\n    truncation_side = \"right\"\n\n    def __init__(self,\n                 vocab_file,\n                 eos_token=\"</s>\",\n                 unk_token=\"<unk>\",\n                 pad_token=\"<pad>\",\n                 extra_ids=100,\n                 additional_special_tokens=None,\n                 sp_model_kwargs=None,\n                 **kwargs):\n        # Add extra_ids to the special token list\n        if extra_ids > 0 and additional_special_tokens is None:\n            additional_special_tokens = [\n                f\"<extra_id_{i}>\" for i in range(extra_ids)\n            ]\n        elif extra_ids > 0 and additional_special_tokens is not None:\n            # Check that we have the right number of extra_id special tokens\n            extra_tokens = len(\n                set(\n                    filter(lambda x: bool(\"extra_id\" in str(x)),\n                           additional_special_tokens)))\n            if extra_tokens != extra_ids:\n                raise ValueError(\n                    f\"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are\"\n                    \" provided to T5Tokenizer. 
In this case the additional_special_tokens must include the extra_ids\"\n                    \" tokens\")\n\n        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs\n        super().__init__(\n            eos_token=eos_token,\n            unk_token=unk_token,\n            pad_token=pad_token,\n            extra_ids=extra_ids,\n            additional_special_tokens=additional_special_tokens,\n            sp_model_kwargs=self.sp_model_kwargs,\n            **kwargs)\n        self.vocab_file = vocab_file\n        self._extra_ids = extra_ids\n\n        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)\n        self.sp_model.Load(vocab_file)\n        self.deprecation_warnings = ({})\n\n    @classmethod\n    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs,\n                        **kwargs):\n        cache_dir = kwargs.pop(\"cache_dir\", None)\n        force_download = kwargs.pop(\"force_download\", False)\n        resume_download = kwargs.pop(\"resume_download\", False)\n        proxies = kwargs.pop(\"proxies\", None)\n        local_files_only = kwargs.pop(\"local_files_only\", False)\n        use_auth_token = kwargs.pop(\"use_auth_token\", None)\n        revision = kwargs.pop(\"revision\", None)\n        subfolder = kwargs.pop(\"subfolder\", None)\n\n        pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n        vocab_files = {}\n        init_configuration = {}\n\n        if os.path.isfile(pretrained_model_name_or_path):\n            if len(cls.vocab_files_names) > 1:\n                raise ValueError(\n                    f\"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not \"\n                    \"supported for this tokenizer. 
Use a model identifier or the path to a directory instead.\"\n                )\n            warnings.warn(\n                f\"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and \"\n                \"won't be possible anymore in v5. Use a model identifier or the path to a directory instead.\",\n                FutureWarning, )\n            file_id = list(cls.vocab_files_names.keys())[0]\n            vocab_files[file_id] = pretrained_model_name_or_path\n        else:\n            # At this point pretrained_model_name_or_path is either a directory or a model identifier name\n            additional_files_names = {\n                \"added_tokens_file\": ADDED_TOKENS_FILE,\n                \"special_tokens_map_file\": SPECIAL_TOKENS_MAP_FILE,\n                \"tokenizer_config_file\": TOKENIZER_CONFIG_FILE,\n            }\n            vocab_files_target = {\n                ** cls.vocab_files_names, ** cls.config_files_names, **\n                additional_files_names\n            }\n\n            if \"tokenizer_file\" in vocab_files_target:\n                # Try to get the tokenizer config to see if there are versioned tokenizer files.\n                fast_tokenizer_file = FULL_TOKENIZER_FILE\n                resolved_config_file = get_file_from_repo(\n                    pretrained_model_name_or_path,\n                    TOKENIZER_CONFIG_FILE,\n                    cache_dir=cache_dir,\n                    force_download=force_download,\n                    resume_download=resume_download,\n                    proxies=proxies,\n                    use_auth_token=use_auth_token,\n                    revision=revision,\n                    local_files_only=local_files_only, )\n                if resolved_config_file is not None:\n                    with open(\n                            resolved_config_file, encoding=\"utf-8\") as reader:\n                        tokenizer_config = json.load(reader)\n                  
      if \"fast_tokenizer_files\" in tokenizer_config:\n                            fast_tokenizer_file = get_fast_tokenizer_file(\n                                tokenizer_config[\"fast_tokenizer_files\"])\n                vocab_files_target[\"tokenizer_file\"] = fast_tokenizer_file\n\n            # Look for the tokenizer files\n            for file_id, file_name in vocab_files_target.items():\n                if os.path.isdir(pretrained_model_name_or_path):\n                    if subfolder is not None:\n                        full_file_name = os.path.join(\n                            pretrained_model_name_or_path, subfolder,\n                            file_name)\n                    else:\n                        full_file_name = os.path.join(\n                            pretrained_model_name_or_path, file_name)\n                    if not os.path.exists(full_file_name):\n                        #logger.info(\"Didn't find file {full_file_name}. We won't load it.\")\n                        full_file_name = None\n\n                vocab_files[file_id] = full_file_name\n\n        # Get files from url, cache, or disk depending on the case\n        resolved_vocab_files = {}\n        unresolved_files = []\n        for file_id, file_path in vocab_files.items():\n            if file_path is None:\n                resolved_vocab_files[file_id] = None\n            else:\n                try:\n                    resolved_vocab_files[file_id] = cached_path(\n                        file_path,\n                        cache_dir=cache_dir, )\n                except EnvironmentError:\n                    logger.error(\n                        \"Model name '{}' was not found in model name list ({}). 
\"\n                        \"We assumed '{}' was a path or url but couldn't find files {} and {} \"\n                        \"at this path or url.\".format(\n                            pretrained_model_name_or_path, ', '.join(\n                                PRETRAINED_VOCAB_ARCHIVE_MAP.keys(\n                                )), pretrained_model_name_or_path, vocab_file,\n                            merges_file))\n                    return None\n\n        if all(full_file_name is None\n               for full_file_name in resolved_vocab_files.values()):\n            raise EnvironmentError(\n                f\"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from \"\n                \"'https://huggingface.co/models', make sure you don't have a local directory with the same name. \"\n                f\"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory \"\n                f\"containing all relevant files for a {cls.__name__} tokenizer.\"\n            )\n\n        for file_id, file_path in vocab_files.items():\n            if file_id not in resolved_vocab_files:\n                continue\n\n        return cls._from_pretrained(\n            resolved_vocab_files,\n            pretrained_model_name_or_path,\n            init_configuration,\n            *init_inputs,\n            use_auth_token=use_auth_token,\n            cache_dir=cache_dir,\n            **kwargs, )\n\n    @classmethod\n    def _from_pretrained(cls,\n                         resolved_vocab_files,\n                         pretrained_model_name_or_path,\n                         init_configuration,\n                         *init_inputs,\n                         use_auth_token=None,\n                         cache_dir=None,\n                         **kwargs):\n        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json\n        # file or if `from_slow` is set to 
True.\n        from_slow = kwargs.get(\"from_slow\", False)\n        has_tokenizer_file = resolved_vocab_files.get(\"tokenizer_file\",\n                                                      None) is not None\n        if (from_slow or not has_tokenizer_file\n            ) and cls.slow_tokenizer_class is not None:\n            slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(\n                copy.deepcopy(resolved_vocab_files),\n                pretrained_model_name_or_path,\n                copy.deepcopy(init_configuration),\n                *init_inputs,\n                **(copy.deepcopy(kwargs)), )\n        else:\n            slow_tokenizer = None\n\n        # Prepare tokenizer initialization kwargs\n        # Did we saved some inputs and kwargs to reload ?\n        tokenizer_config_file = resolved_vocab_files.pop(\n            \"tokenizer_config_file\", None)\n        if tokenizer_config_file is not None:\n            with open(\n                    tokenizer_config_file,\n                    encoding=\"utf-8\") as tokenizer_config_handle:\n                init_kwargs = json.load(tokenizer_config_handle)\n            # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.\n            config_tokenizer_class = init_kwargs.get(\"tokenizer_class\")\n            init_kwargs.pop(\"tokenizer_class\", None)\n            init_kwargs.pop(\"auto_map\", None)\n            saved_init_inputs = init_kwargs.pop(\"init_inputs\", ())\n            if not init_inputs:\n                init_inputs = saved_init_inputs\n        else:\n            config_tokenizer_class = None\n            init_kwargs = init_configuration\n\n        if config_tokenizer_class is None:\n            # Second attempt. 
If we have not yet found tokenizer_class, let's try to use the config.\n            try:\n                config_dict = resolved_vocab_files.pop(\"config_file\", None)\n                config_dict = cls._dict_from_json_file(config_dict)\n                config_tokenizer_class = config_dict[\n                    \"tokenizer_class\"] if \"tokenizer_class\" in config_dict else None\n            except (OSError, ValueError, KeyError):\n                # skip if an error occurred.\n                config_dict = None\n            if config_tokenizer_class is None:\n                # Third attempt. If we have not yet found the original type of the tokenizer,\n                # we are loading we see if we can infer it from the type of the configuration file\n                from ppfleetx.data.tokenizers.tokenization_utils_base import TOKENIZER_MAPPING_NAMES  # tests_ignore\n\n                model_type = config_dict[\n                    \"model_type\"] if \"model_type\" in config_dict else None\n                if model_type is None:\n                    # Fallback: use pattern matching on the string.\n                    model_type = None\n                    for pattern in TOKENIZER_MAPPING_NAMES.keys():\n                        if pattern in str(pretrained_model_name_or_path):\n                            model_type = pattern\n                            break\n\n                if model_type is not None:\n                    config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get(\n                        model_type, (None, None))\n                    if config_tokenizer_class is None:\n                        config_tokenizer_class = config_tokenizer_class_fast\n\n        if config_tokenizer_class is not None:\n            if cls.__name__.replace(\n                    \"Fast\", \"\") != config_tokenizer_class.replace(\"Fast\", \"\"):\n                logger.warning(\n                    \"The tokenizer class you load from this checkpoint is 
not the same type as the class this\"\n                    \" function is called from. It may result in unexpected tokenization. \\nThe tokenizer class you\"\n                    f\" load from this checkpoint is '{config_tokenizer_class}'. \\nThe class this function is called\"\n                    f\" from is '{cls.__name__}'.\")\n\n        # Update with newly provided kwargs\n        init_kwargs.update(kwargs)\n\n        # Convert AddedTokens serialized as dict to class instances\n        def convert_added_tokens(obj):\n            if isinstance(obj, dict) and \"__type\" in obj and obj[\n                    \"__type\"] == \"AddedToken\":\n                obj.pop(\"__type\")\n                return AddedToken(**obj)\n            elif isinstance(obj, (list, tuple)):\n                return list(convert_added_tokens(o) for o in obj)\n            elif isinstance(obj, dict):\n                return {k: convert_added_tokens(v) for k, v in obj.items()}\n            return obj\n\n        init_kwargs = convert_added_tokens(init_kwargs)\n\n        # Set max length if needed\n        if pretrained_model_name_or_path in cls.max_model_input_sizes:\n            # if we're using a pretrained model, ensure the tokenizer\n            # wont index sequences longer than the number of positional embeddings\n\n            model_max_length = cls.max_model_input_sizes[\n                pretrained_model_name_or_path]\n            if model_max_length is not None and isinstance(model_max_length,\n                                                           (int, float)):\n\n                model_max_length = min(\n                    init_kwargs.get(\"model_max_length\", int(1e30)),\n                    model_max_length)\n                # TODO(PVP) - uncomment following line in Transformers v5\n                # init_kwargs[\"model_max_length\"] = model_max_length\n                # TODO(PVP) - remove in Transformers v5\n                # ---\n                init_kwargs[\n                 
   \"model_max_length\"] = cls._eventually_correct_t5_max_length(\n                        pretrained_model_name_or_path, model_max_length,\n                        init_kwargs.get(\"model_max_length\"))\n                # ---\n\n            # Merge resolved_vocab_files arguments in init_kwargs.\n        added_tokens_file = resolved_vocab_files.pop(\"added_tokens_file\", None)\n        for args_name, file_path in resolved_vocab_files.items():\n            if args_name not in init_kwargs:\n                init_kwargs[args_name] = file_path\n\n        if slow_tokenizer is not None:\n            init_kwargs[\"__slow_tokenizer\"] = slow_tokenizer\n\n        init_kwargs[\"name_or_path\"] = pretrained_model_name_or_path\n\n        # Instantiate tokenizer.\n        try:\n            tokenizer = cls(**init_kwargs)\n        except OSError:\n            raise OSError(\n                \"Unable to load vocabulary from file. \"\n                \"Please check that the provided vocabulary is accessible and not corrupted.\"\n            )\n\n        # Save inputs and kwargs for saving and re-loading with ``save_pretrained``\n        # Removed: Now done at the base class level\n        # tokenizer.init_inputs = init_inputs\n        # tokenizer.init_kwargs = init_kwargs\n\n        # If there is a complementary special token map, load it\n        special_tokens_map_file = resolved_vocab_files.pop(\n            \"special_tokens_map_file\", None)\n\n        # Add supplementary tokens.\n        special_tokens = tokenizer.all_special_tokens\n        # Check all our special tokens are registered as \"no split\" token (we don't cut them) and are in the vocab\n        added_tokens = tokenizer.sanitize_special_tokens()\n        if added_tokens:\n            logger.warning_advice(\n                \"Special tokens have been added in the vocabulary, make sure the associated word embeddings are\"\n                \" fine-tuned or trained.\")\n\n        return tokenizer\n\n    def 
_eventual_warn_about_too_long_sequence(self,\n                                               ids,\n                                               max_length,\n                                               verbose: bool):\n        \"\"\"\n        Depending on the input and internal state we might trigger a warning about a sequence that is too long for its\n        corresponding model\n\n        Args:\n            ids (`List[str]`): The ids produced by the tokenization\n            max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set)\n            verbose (`bool`): Whether or not to print more information and warnings.\n\n        \"\"\"\n        if max_length is None and len(ids) > self.model_max_length and verbose:\n            if not self.deprecation_warnings.get(\n                    \"sequence-length-is-longer-than-the-specified-maximum\",\n                    False):\n                logger.warning(\n                    \"Token indices sequence length is longer than the specified maximum sequence length \"\n                    f\"for this model ({len(ids)} > {self.model_max_length}). 
Running this sequence through the model \"\n                    \"will result in indexing errors\")\n            self.deprecation_warnings[\n                \"sequence-length-is-longer-than-the-specified-maximum\"] = True\n\n    def _get_padding_truncation_strategies(self,\n                                           padding=False,\n                                           truncation=False,\n                                           max_length=None,\n                                           pad_to_multiple_of=None,\n                                           verbose=True,\n                                           **kwargs):\n        \"\"\"\n        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy\n        and pad_to_max_length) and behaviors.\n        \"\"\"\n        old_truncation_strategy = kwargs.pop(\"truncation_strategy\",\n                                             \"do_not_truncate\")\n        old_pad_to_max_length = kwargs.pop(\"pad_to_max_length\", False)\n\n        # Backward compatibility for previous behavior, maybe we should deprecate it:\n        # If you only set max_length, it activates truncation for max_length\n        if max_length is not None and padding is False and truncation is False:\n            if verbose:\n                if not self.deprecation_warnings.get(\n                        \"Truncation-not-explicitly-activated\", False):\n                    logger.warning(\n                        \"Truncation was not explicitly activated but `max_length` is provided a specific value, please\"\n                        \" use `truncation=True` to explicitly truncate examples to max length. Defaulting to\"\n                        \" 'longest_first' truncation strategy. 
If you encode pairs of sequences (GLUE-style) with the\"\n                        \" tokenizer you can select this strategy more precisely by providing a specific strategy to\"\n                        \" `truncation`.\")\n                self.deprecation_warnings[\n                    \"Truncation-not-explicitly-activated\"] = True\n            truncation = \"longest_first\"\n\n        # Get padding strategy\n        if padding is False and old_pad_to_max_length:\n            if verbose:\n                warnings.warn(\n                    \"The `pad_to_max_length` argument is deprecated and will be removed in a future version, \"\n                    \"use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or \"\n                    \"use `padding='max_length'` to pad to a max length. In this case, you can give a specific \"\n                    \"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the \"\n                    \"maximal input size of the model (e.g. 512 for Bert).\",\n                    FutureWarning, )\n            if max_length is None:\n                padding_strategy = PaddingStrategy.LONGEST\n            else:\n                padding_strategy = PaddingStrategy.MAX_LENGTH\n        elif padding is not False:\n            if padding is True:\n                if verbose:\n                    if max_length is not None and (\n                            truncation is False or\n                            truncation == \"do_not_truncate\"):\n                        warnings.warn(\n                            \"`max_length` is ignored when `padding`=`True` and there is no truncation strategy. 
\"\n                            \"To pad to max length, use `padding='max_length'`.\")\n                    if old_pad_to_max_length is not False:\n                        warnings.warn(\n                            \"Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.\"\n                        )\n                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch\n            elif not isinstance(padding, PaddingStrategy):\n                padding_strategy = PaddingStrategy(padding)\n            elif isinstance(padding, PaddingStrategy):\n                padding_strategy = padding\n        else:\n            padding_strategy = PaddingStrategy.DO_NOT_PAD\n\n        # Get truncation strategy\n        if truncation is False and old_truncation_strategy != \"do_not_truncate\":\n            if verbose:\n                warnings.warn(\n                    \"The `truncation_strategy` argument is deprecated and will be removed in a future version, use\"\n                    \" `truncation=True` to truncate examples to a max length. You can give a specific length with\"\n                    \" `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input\"\n                    \" size of the model (e.g. 512 for Bert).  
If you have pairs of inputs, you can give a specific\"\n                    \" truncation strategy selected among `truncation='only_first'` (will only truncate the first\"\n                    \" sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the\"\n                    \" pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence\"\n                    \" in the pairs).\",\n                    FutureWarning, )\n            truncation_strategy = TruncationStrategy(old_truncation_strategy)\n        elif truncation is not False:\n            if truncation is True:\n                truncation_strategy = (\n                    TruncationStrategy.LONGEST_FIRST\n                )  # Default to truncate the longest sequences in pairs of inputs\n            elif not isinstance(truncation, TruncationStrategy):\n                truncation_strategy = TruncationStrategy(truncation)\n            elif isinstance(truncation, TruncationStrategy):\n                truncation_strategy = truncation\n        else:\n            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE\n\n        # Set max length if needed\n        if max_length is None:\n            if padding_strategy == PaddingStrategy.MAX_LENGTH:\n                if self.model_max_length > LARGE_INTEGER:\n                    if verbose:\n                        if not self.deprecation_warnings.get(\n                                \"Asking-to-pad-to-max_length\", False):\n                            logger.warning(\n                                \"Asking to pad to max_length but no maximum length is provided and the model has no\"\n                                \" predefined maximum length. 
Default to no padding.\"\n                            )\n                        self.deprecation_warnings[\n                            \"Asking-to-pad-to-max_length\"] = True\n                    padding_strategy = PaddingStrategy.DO_NOT_PAD\n                else:\n                    max_length = self.model_max_length\n\n            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:\n                if self.model_max_length > LARGE_INTEGER:\n                    if verbose:\n                        if not self.deprecation_warnings.get(\n                                \"Asking-to-truncate-to-max_length\", False):\n                            logger.warning(\n                                \"Asking to truncate to max_length but no maximum length is provided and the model has\"\n                                \" no predefined maximum length. Default to no truncation.\"\n                            )\n                        self.deprecation_warnings[\n                            \"Asking-to-truncate-to-max_length\"] = True\n                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE\n                else:\n                    max_length = self.model_max_length\n\n        # Test if we have a padding token\n        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (\n                not self.pad_token or self.pad_token_id < 0):\n            raise ValueError(\n                \"Asking to pad but the tokenizer does not have a padding token. 
\"\n                \"Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` \"\n                \"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.\"\n            )\n\n        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided\n        if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and\n                padding_strategy != PaddingStrategy.DO_NOT_PAD and\n                pad_to_multiple_of is not None and max_length is not None and\n            (max_length % pad_to_multiple_of != 0)):\n            raise ValueError(\n                \"Truncation and padding are both activated but \"\n                f\"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of}).\"\n            )\n\n        return padding_strategy, truncation_strategy, max_length, kwargs\n\n    def _pad(self,\n             encoded_inputs,\n             max_length=None,\n             padding_strategy=PaddingStrategy.DO_NOT_PAD,\n             pad_to_multiple_of=None,\n             return_attention_mask=None):\n        \"\"\"\n        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)\n\n        Args:\n            encoded_inputs:\n                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).\n            max_length: maximum length of the returned list and optionally padding length (see below).\n                Will truncate by taking into account the special tokens.\n            padding_strategy: PaddingStrategy to use for padding.\n\n                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch\n                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)\n                - PaddingStrategy.DO_NOT_PAD: Do not pad\n                The tokenizer padding sides are defined in self.padding_side:\n\n                    - 'left': 
pads on the left of the sequences\n                    - 'right': pads on the right of the sequences\n            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.\n                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability\n                >= 7.5 (Volta).\n            return_attention_mask:\n                (optional) Set to False to avoid returning attention mask (default: set to model specifics)\n        \"\"\"\n        # Load from model defaults\n        if return_attention_mask is None:\n            return_attention_mask = \"attention_mask\" in self.model_input_names\n\n        required_input = encoded_inputs[self.model_input_names[0]]\n\n        if padding_strategy == PaddingStrategy.LONGEST:\n            max_length = len(required_input)\n\n        if max_length is not None and pad_to_multiple_of is not None and (\n                max_length % pad_to_multiple_of != 0):\n            max_length = (\n                (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of\n\n        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(\n            required_input) != max_length\n\n        # Initialize attention mask if not present.\n        if return_attention_mask and \"attention_mask\" not in encoded_inputs:\n            encoded_inputs[\"attention_mask\"] = [1] * len(required_input)\n\n        if needs_to_be_padded:\n            difference = max_length - len(required_input)\n\n            if self.padding_side == \"right\":\n                if return_attention_mask:\n\n                    encoded_inputs[\"attention_mask\"] = encoded_inputs[\n                        \"attention_mask\"] + [0] * difference\n                if \"token_type_ids\" in encoded_inputs:\n                    encoded_inputs[\"token_type_ids\"] = (\n                        encoded_inputs[\"token_type_ids\"] +\n                        
[self.pad_token_type_id] * difference)\n                if \"special_tokens_mask\" in encoded_inputs:\n                    encoded_inputs[\"special_tokens_mask\"] = encoded_inputs[\n                        \"special_tokens_mask\"] + [1] * difference\n                encoded_inputs[self.model_input_names[\n                    0]] = required_input + [self.pad_token_id] * difference\n            elif self.padding_side == \"left\":\n                if return_attention_mask:\n                    encoded_inputs[\"attention_mask\"] = [\n                        0\n                    ] * difference + encoded_inputs[\"attention_mask\"]\n                if \"token_type_ids\" in encoded_inputs:\n                    encoded_inputs[\"token_type_ids\"] = [\n                        self.pad_token_type_id\n                    ] * difference + encoded_inputs[\"token_type_ids\"]\n                if \"special_tokens_mask\" in encoded_inputs:\n                    encoded_inputs[\"special_tokens_mask\"] = [\n                        1\n                    ] * difference + encoded_inputs[\"special_tokens_mask\"]\n                encoded_inputs[self.model_input_names[\n                    0]] = [self.pad_token_id] * difference + required_input\n            else:\n                raise ValueError(\"Invalid padding strategy:\" + str(\n                    self.padding_side))\n\n        return encoded_inputs\n\n    def pad(\n            self,\n            encoded_inputs,\n            padding=True,\n            max_length=None,\n            pad_to_multiple_of=None,\n            return_attention_mask=None,\n            return_tensors=None,\n            verbose=True, ):\n        \"\"\"\n        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length\n        in the batch.\n\n        Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,\n        `self.pad_token_id` and 
`self.pad_token_type_id`)\n\n        <Tip>\n\n        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the\n        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of\n        PyTorch tensors, you will lose the specific device of your tensors however.\n\n        </Tip>\n\n        Args:\n            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):\n                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of\n                tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,\n                List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader\n                collate function.\n\n                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see\n                the note above for the return type.\n            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):\n                 Select a strategy to pad the returned sequences (according to the model's padding side and padding\n                 index) among:\n\n                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single\n                  sequence if provided).\n                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum\n                  acceptable input length for the model if that argument is not provided.\n                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different\n                  lengths).\n            max_length (`int`, *optional*):\n                Maximum length of the returned list and optionally 
padding length (see above).\n            pad_to_multiple_of (`int`, *optional*):\n                If set will pad the sequence to a multiple of the provided value.\n\n                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability\n                >= 7.5 (Volta).\n            return_attention_mask (`bool`, *optional*):\n                Whether to return the attention mask. If left to the default, will return the attention mask according\n                to the specific tokenizer's default, defined by the `return_outputs` attribute.\n\n                [What are attention masks?](../glossary#attention-mask)\n            return_tensors (`str` or [`~utils.TensorType`], *optional*):\n                If set, will return tensors instead of list of python integers. Acceptable values are:\n\n                - `'tf'`: Return TensorFlow `tf.constant` objects.\n                - `'pt'`: Return PyTorch `torch.Tensor` objects.\n                - `'np'`: Return Numpy `np.ndarray` objects.\n            verbose (`bool`, *optional*, defaults to `True`):\n                Whether or not to print more information and warnings.\n        \"\"\"\n        # If we have a list of dicts, let's convert it in a dict of lists\n        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader\n        if isinstance(encoded_inputs, (list, tuple)) and isinstance(\n                encoded_inputs[0], Mapping):\n            encoded_inputs = {\n                key: [example[key] for example in encoded_inputs]\n                for key in encoded_inputs[0].keys()\n            }\n\n        # The model's main input name, usually `input_ids`, has be passed for padding\n        if self.model_input_names[0] not in encoded_inputs:\n            raise ValueError(\n                \"You should supply an encoding or a list of encodings to this method \"\n                f\"that includes {self.model_input_names[0]}, but you provided 
{list(encoded_inputs.keys())}\"\n            )\n\n        required_input = encoded_inputs[self.model_input_names[0]]\n\n        if not required_input:\n            if return_attention_mask:\n                encoded_inputs[\"attention_mask\"] = []\n            return encoded_inputs\n\n        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects\n        # and rebuild them afterwards if no return_tensors is specified\n        # Note that we lose the specific device the tensor may be on for PyTorch\n\n        first_element = required_input[0]\n        if isinstance(first_element, (list, tuple)):\n            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.\n            for item in required_input:\n                if len(item) != 0:\n                    first_element = item[0]\n                    break\n        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.\n        if not isinstance(first_element, (int, list, tuple)):\n            if is_tf_available() and _is_tensorflow(first_element):\n                return_tensors = \"tf\" if return_tensors is None else return_tensors\n            elif is_torch_available() and _is_torch(first_element):\n                return_tensors = \"pt\" if return_tensors is None else return_tensors\n            elif isinstance(first_element, np.ndarray):\n                return_tensors = \"np\" if return_tensors is None else return_tensors\n            else:\n                raise ValueError(\n                    f\"type of {first_element} unknown: {type(first_element)}. 
\"\n                    \"Should be one of a python, numpy, pytorch or tensorflow object.\"\n                )\n\n            for key, value in encoded_inputs.items():\n                encoded_inputs[key] = to_py_obj(value)\n\n        # Convert padding_strategy in PaddingStrategy\n        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(\n            padding=padding, max_length=max_length, verbose=verbose)\n\n        required_input = encoded_inputs[self.model_input_names[0]]\n        if required_input and not isinstance(required_input[0], (list, tuple)):\n            encoded_inputs = self._pad(\n                encoded_inputs,\n                max_length=max_length,\n                padding_strategy=padding_strategy,\n                pad_to_multiple_of=pad_to_multiple_of,\n                return_attention_mask=return_attention_mask, )\n            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)\n\n        batch_size = len(required_input)\n        assert all(\n            len(v) == batch_size for v in encoded_inputs.values()\n        ), \"Some items in the output dictionary have a different batch size than others.\"\n\n        if padding_strategy == PaddingStrategy.LONGEST:\n            max_length = max(len(inputs) for inputs in required_input)\n            padding_strategy = PaddingStrategy.MAX_LENGTH\n\n        batch_outputs = {}\n        for i in range(batch_size):\n            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())\n            outputs = self._pad(\n                inputs,\n                max_length=max_length,\n                padding_strategy=padding_strategy,\n                pad_to_multiple_of=pad_to_multiple_of,\n                return_attention_mask=return_attention_mask, )\n\n            for key, value in outputs.items():\n                if key not in batch_outputs:\n                    batch_outputs[key] = []\n                batch_outputs[key].append(value)\n\n        return 
BatchEncoding(batch_outputs, tensor_type=return_tensors)\n\n    def create_token_type_ids_from_sequences(self,\n                                             token_ids_0,\n                                             token_ids_1=None):\n        \"\"\"\n        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make\n        use of token type ids, therefore a list of zeros is returned.\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n\n        Returns:\n            `List[int]` of zeros.\n        \"\"\"\n        eos = [self.eos_token_id]\n\n        if token_ids_1 is None:\n            return len(token_ids_0 + eos) * [0]\n        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]\n\n    def _add_eos_if_not_present(self, token_ids):\n        \"\"\"Do not add eos again if user already added it.\"\"\"\n        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:\n            warnings.warn(\n                f\"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated\"\n                \" eos tokens being added.\")\n            return token_ids\n        else:\n            return token_ids + [self.eos_token_id]\n\n    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):\n        \"\"\"\n        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and\n        adding special tokens. 
A sequence has the following format:\n\n        - single sequence: `X </s>`\n        - pair of sequences: `A </s> B </s>`\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs to which the special tokens will be added.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n\n        Returns:\n            `List[int]` of [input IDs](../glossary#input-ids) with the appropriate special tokens.\n        \"\"\"\n        token_ids_0 = self._add_eos_if_not_present(token_ids_0)\n        if token_ids_1 is None:\n            return token_ids_0\n        else:\n            token_ids_1 = self._add_eos_if_not_present(token_ids_1)\n            return token_ids_0 + token_ids_1\n\n    def truncate_sequences(self,\n                           ids,\n                           pair_ids=None,\n                           num_tokens_to_remove=0,\n                           truncation_strategy=\"longest_first\",\n                           stride=0):\n        \"\"\"\n        Truncates a sequence pair in-place following the strategy.\n\n        Args:\n            ids (`List[int]`):\n                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and\n                `convert_tokens_to_ids` methods.\n            pair_ids (`List[int]`, *optional*):\n                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`\n                and `convert_tokens_to_ids` methods.\n            num_tokens_to_remove (`int`, *optional*, defaults to 0):\n                Number of tokens to remove using the truncation strategy.\n            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):\n                The strategy to follow for truncation. 
Can be:\n\n                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the\n                  maximum acceptable input length for the model if that argument is not provided. This will truncate\n                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a\n                  batch of pairs) is provided.\n                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the\n                  maximum acceptable input length for the model if that argument is not provided. This will only\n                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.\n                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the\n                  maximum acceptable input length for the model if that argument is not provided. This will only\n                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.\n                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater\n                  than the model maximum admissible input size).\n            stride (`int`, *optional*, defaults to 0):\n                If set to a positive number, the overflowing tokens returned will contain some tokens from the main\n                sequence returned. The value of this argument defines the number of additional tokens.\n\n        Returns:\n            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of\n            overflowing tokens. 
Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair\n            of sequences (or a batch of pairs) is provided.\n        \"\"\"\n        if num_tokens_to_remove <= 0:\n            return ids, pair_ids, []\n\n        if not isinstance(truncation_strategy, TruncationStrategy):\n            truncation_strategy = TruncationStrategy(truncation_strategy)\n\n        overflowing_tokens = []\n        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (\n                truncation_strategy == TruncationStrategy.LONGEST_FIRST and\n                pair_ids is None):\n            if len(ids) > num_tokens_to_remove:\n                window_len = min(len(ids), stride + num_tokens_to_remove)\n                if self.truncation_side == \"left\":\n                    overflowing_tokens = ids[:window_len]\n                    ids = ids[num_tokens_to_remove:]\n                elif self.truncation_side == \"right\":\n                    overflowing_tokens = ids[-window_len:]\n                    ids = ids[:-num_tokens_to_remove]\n                else:\n                    raise ValueError(\n                        f\"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.\"\n                    )\n\n            else:\n                error_msg = (\n                    f\"We need to remove {num_tokens_to_remove} to truncate the input \"\n                    f\"but the first sequence has a length {len(ids)}. 
\")\n                if truncation_strategy == TruncationStrategy.ONLY_FIRST:\n                    error_msg = (\n                        error_msg +\n                        \"Please select another truncation strategy than \"\n                        f\"{truncation_strategy}, for instance 'longest_first' or 'only_second'.\"\n                    )\n                logger.error(error_msg)\n        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:\n            logger.warning(\n                \"Be aware, overflowing tokens are not returned for the setting you have chosen,\"\n                f\" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' \"\n                \"truncation strategy. So the returned list will always be empty even if some \"\n                \"tokens have been removed.\")\n            for _ in range(num_tokens_to_remove):\n                if pair_ids is None or len(ids) > len(pair_ids):\n                    if self.truncation_side == \"right\":\n                        ids = ids[:-1]\n                    elif self.truncation_side == \"left\":\n                        ids = ids[1:]\n                    else:\n                        raise ValueError(\"invalid truncation strategy:\" + str(\n                            self.truncation_side))\n                else:\n                    if self.truncation_side == \"right\":\n                        pair_ids = pair_ids[:-1]\n                    elif self.truncation_side == \"left\":\n                        pair_ids = pair_ids[1:]\n                    else:\n                        raise ValueError(\"invalid truncation strategy:\" + str(\n                            self.truncation_side))\n        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:\n            if len(pair_ids) > num_tokens_to_remove:\n                window_len = min(len(pair_ids), stride + num_tokens_to_remove)\n                if self.truncation_side == \"right\":\n     
               overflowing_tokens = pair_ids[-window_len:]\n                    pair_ids = pair_ids[:-num_tokens_to_remove]\n                elif self.truncation_side == \"left\":\n                    overflowing_tokens = pair_ids[:window_len]\n                    pair_ids = pair_ids[num_tokens_to_remove:]\n                else:\n                    raise ValueError(\"invalid truncation strategy:\" + str(\n                        self.truncation_side))\n            else:\n                logger.error(\n                    f\"We need to remove {num_tokens_to_remove} to truncate the input \"\n                    f\"but the second sequence has a length {len(pair_ids)}. \"\n                    f\"Please select another truncation strategy than {truncation_strategy}, \"\n                    \"for instance 'longest_first' or 'only_first'.\")\n\n        return (ids, pair_ids, overflowing_tokens)\n\n    def prepare_for_model(self,\n                          ids,\n                          pair_ids=None,\n                          add_special_tokens=True,\n                          padding=False,\n                          truncation=False,\n                          max_length=None,\n                          stride=0,\n                          pad_to_multiple_of=None,\n                          return_tensors=None,\n                          return_token_type_ids=None,\n                          return_attention_mask=None,\n                          return_overflowing_tokens=False,\n                          return_special_tokens_mask=False,\n                          return_offsets_mapping=False,\n                          return_length=False,\n                          verbose=True,\n                          prepend_batch_axis=False,\n                          **kwargs):\n        \"\"\"\n        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. 
It\n        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and\n        manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids*\n        different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return\n        overflowing tokens. Such a combination of arguments will raise an error.\n\n        Args:\n            ids (`List[int]`):\n                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and\n                `convert_tokens_to_ids` methods.\n            pair_ids (`List[int]`, *optional*):\n                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`\n                and `convert_tokens_to_ids` methods.\n        \"\"\"\n\n        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\n        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(\n            padding=padding,\n            truncation=truncation,\n            max_length=max_length,\n            pad_to_multiple_of=pad_to_multiple_of,\n            verbose=verbose,\n            **kwargs, )\n\n        pair = bool(pair_ids is not None)\n        len_ids = len(ids)\n        len_pair_ids = len(pair_ids) if pair else 0\n\n        if return_token_type_ids and not add_special_tokens:\n            raise ValueError(\n                \"Asking to return token_type_ids while setting add_special_tokens to False \"\n                \"results in an undefined behavior. 
Please set add_special_tokens to True or \"\n                \"set return_token_type_ids to None.\")\n\n        if (return_overflowing_tokens and\n                truncation_strategy == TruncationStrategy.LONGEST_FIRST and\n                pair_ids is not None):\n            raise ValueError(\n                \"Not possible to return overflowing tokens for pair of sequences with the \"\n                \"`longest_first`. Please select another truncation strategy than `longest_first`, \"\n                \"for instance `only_second` or `only_first`.\")\n\n        # Load from model defaults\n        if return_token_type_ids is None:\n            return_token_type_ids = \"token_type_ids\" in self.model_input_names\n        if return_attention_mask is None:\n            return_attention_mask = \"attention_mask\" in self.model_input_names\n\n        encoded_inputs = {}\n\n        # Compute the total size of the returned encodings\n        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(\n            pair=pair) if add_special_tokens else 0)\n\n        # Truncation: Handle max sequence length\n        overflowing_tokens = []\n        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:\n            ids, pair_ids, overflowing_tokens = self.truncate_sequences(\n                ids,\n                pair_ids=pair_ids,\n                num_tokens_to_remove=total_len - max_length,\n                truncation_strategy=truncation_strategy,\n                stride=stride, )\n\n        if return_overflowing_tokens:\n            encoded_inputs[\"overflowing_tokens\"] = overflowing_tokens\n            encoded_inputs[\"num_truncated_tokens\"] = total_len - max_length\n\n        # Add special tokens\n        if add_special_tokens:\n            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)\n            token_type_ids = self.create_token_type_ids_from_sequences(\n                ids, pair_ids)\n    
    def _batch_prepare_for_model(
            self,
            batch_ids_pairs,
            add_special_tokens=True,
            padding_strategy=PaddingStrategy.DO_NOT_PAD,
            truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,
            max_length=None,
            stride=0,
            pad_to_multiple_of=None,
            return_tensors=None,
            return_token_type_ids=None,
            return_attention_mask=None,
            return_overflowing_tokens=False,
            return_special_tokens_mask=False,
            return_length=False,
            verbose=True, ):
        """
        Prepares a batch of sequences of input ids, or pairs of sequences, so they can
        be used by the model. Adds special tokens, truncates overflowing sequences
        while taking the special tokens into account, and manages a moving window
        (with user defined stride) for overflowing tokens.

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """

        collected = {}
        # Encode each example individually; padding and tensor conversion are
        # deferred so they can be performed once over the whole batch.
        for first_ids, second_ids in batch_ids_pairs:
            encoded = self.prepare_for_model(
                first_ids,
                second_ids,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.
                value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose, )

            for key, value in encoded.items():
                collected.setdefault(key, []).append(value)

        padded = self.pad(
            collected,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask, )

        return BatchEncoding(padded, tensor_type=return_tensors)
max_length=max_length,\n            pad_to_multiple_of=pad_to_multiple_of,\n            return_attention_mask=return_attention_mask, )\n\n        batch_outputs = BatchEncoding(\n            batch_outputs, tensor_type=return_tensors)\n\n        return batch_outputs\n\n    def _get_padding_truncation_strategies(self,\n                                           padding=False,\n                                           truncation=False,\n                                           max_length=None,\n                                           pad_to_multiple_of=None,\n                                           verbose=True,\n                                           **kwargs):\n        \"\"\"\n        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy\n        and pad_to_max_length) and behaviors.\n        \"\"\"\n        old_truncation_strategy = kwargs.pop(\"truncation_strategy\",\n                                             \"do_not_truncate\")\n        old_pad_to_max_length = kwargs.pop(\"pad_to_max_length\", False)\n\n        # Backward compatibility for previous behavior, maybe we should deprecate it:\n        # If you only set max_length, it activates truncation for max_length\n        if max_length is not None and padding is False and truncation is False:\n            if verbose:\n                if not self.deprecation_warnings.get(\n                        \"Truncation-not-explicitly-activated\", False):\n                    logger.warning(\n                        \"Truncation was not explicitly activated but `max_length` is provided a specific value, please\"\n                        \" use `truncation=True` to explicitly truncate examples to max length. Defaulting to\"\n                        \" 'longest_first' truncation strategy. 
If you encode pairs of sequences (GLUE-style) with the\"\n                        \" tokenizer you can select this strategy more precisely by providing a specific strategy to\"\n                        \" `truncation`.\")\n                self.deprecation_warnings[\n                    \"Truncation-not-explicitly-activated\"] = True\n            truncation = \"longest_first\"\n\n        # Get padding strategy\n        if padding is False and old_pad_to_max_length:\n            if verbose:\n                warnings.warn(\n                    \"The `pad_to_max_length` argument is deprecated and will be removed in a future version, \"\n                    \"use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or \"\n                    \"use `padding='max_length'` to pad to a max length. In this case, you can give a specific \"\n                    \"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the \"\n                    \"maximal input size of the model (e.g. 512 for Bert).\",\n                    FutureWarning, )\n            if max_length is None:\n                padding_strategy = PaddingStrategy.LONGEST\n            else:\n                padding_strategy = PaddingStrategy.MAX_LENGTH\n        elif padding is not False:\n            if padding is True:\n                if verbose:\n                    if max_length is not None and (\n                            truncation is False or\n                            truncation == \"do_not_truncate\"):\n                        warnings.warn(\n                            \"`max_length` is ignored when `padding`=`True` and there is no truncation strategy. 
\"\n                            \"To pad to max length, use `padding='max_length'`.\")\n                    if old_pad_to_max_length is not False:\n                        warnings.warn(\n                            \"Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.\"\n                        )\n                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch\n            elif not isinstance(padding, PaddingStrategy):\n                padding_strategy = PaddingStrategy(padding)\n            elif isinstance(padding, PaddingStrategy):\n                padding_strategy = padding\n        else:\n            padding_strategy = PaddingStrategy.DO_NOT_PAD\n\n        # Get truncation strategy\n        if truncation is False and old_truncation_strategy != \"do_not_truncate\":\n            if verbose:\n                warnings.warn(\n                    \"The `truncation_strategy` argument is deprecated and will be removed in a future version, use\"\n                    \" `truncation=True` to truncate examples to a max length. You can give a specific length with\"\n                    \" `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input\"\n                    \" size of the model (e.g. 512 for Bert).  
If you have pairs of inputs, you can give a specific\"\n                    \" truncation strategy selected among `truncation='only_first'` (will only truncate the first\"\n                    \" sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the\"\n                    \" pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence\"\n                    \" in the pairs).\",\n                    FutureWarning, )\n            truncation_strategy = TruncationStrategy(old_truncation_strategy)\n        elif truncation is not False:\n            if truncation is True:\n                truncation_strategy = (\n                    TruncationStrategy.LONGEST_FIRST\n                )  # Default to truncate the longest sequences in pairs of inputs\n            elif not isinstance(truncation, TruncationStrategy):\n                truncation_strategy = TruncationStrategy(truncation)\n            elif isinstance(truncation, TruncationStrategy):\n                truncation_strategy = truncation\n        else:\n            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE\n\n        # Set max length if needed\n        if max_length is None:\n            if padding_strategy == PaddingStrategy.MAX_LENGTH:\n                if self.model_max_length > LARGE_INTEGER:\n                    if verbose:\n                        if not self.deprecation_warnings.get(\n                                \"Asking-to-pad-to-max_length\", False):\n                            logger.warning(\n                                \"Asking to pad to max_length but no maximum length is provided and the model has no\"\n                                \" predefined maximum length. 
Default to no padding.\"\n                            )\n                        self.deprecation_warnings[\n                            \"Asking-to-pad-to-max_length\"] = True\n                    padding_strategy = PaddingStrategy.DO_NOT_PAD\n                else:\n                    max_length = self.model_max_length\n\n            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:\n                if self.model_max_length > LARGE_INTEGER:\n                    if verbose:\n                        if not self.deprecation_warnings.get(\n                                \"Asking-to-truncate-to-max_length\", False):\n                            logger.warning(\n                                \"Asking to truncate to max_length but no maximum length is provided and the model has\"\n                                \" no predefined maximum length. Default to no truncation.\"\n                            )\n                        self.deprecation_warnings[\n                            \"Asking-to-truncate-to-max_length\"] = True\n                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE\n                else:\n                    max_length = self.model_max_length\n\n        # Test if we have a padding token\n        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (\n                not self.pad_token or self.pad_token_id < 0):\n            raise ValueError(\n                \"Asking to pad but the tokenizer does not have a padding token. 
\"\n                \"Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` \"\n                \"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.\"\n            )\n\n        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided\n        if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and\n                padding_strategy != PaddingStrategy.DO_NOT_PAD and\n                pad_to_multiple_of is not None and max_length is not None and\n            (max_length % pad_to_multiple_of != 0)):\n            raise ValueError(\n                \"Truncation and padding are both activated but \"\n                f\"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of}).\"\n            )\n\n        return padding_strategy, truncation_strategy, max_length, kwargs\n\n    def batch_encode_plus(self,\n                          batch_text_or_text_pairs,\n                          add_special_tokens=True,\n                          padding=False,\n                          truncation=False,\n                          max_length=None,\n                          stride=0,\n                          is_split_into_words=False,\n                          pad_to_multiple_of=None,\n                          return_tensors=None,\n                          return_token_type_ids=None,\n                          return_attention_mask=None,\n                          return_overflowing_tokens=False,\n                          return_special_tokens_mask=False,\n                          return_offsets_mapping=False,\n                          return_length=False,\n                          verbose=True,\n                          **kwargs):\n        \"\"\"\n        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.\n\n        <Tip warning={true}>\n\n        This method is deprecated, `__call__` should 
be used instead.\n\n        </Tip>\n\n        Args:\n            batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):\n                Batch of sequences or pair of sequences to be encoded. This can be a list of\n                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see\n                details in `encode_plus`).\n        \"\"\"\n\n        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\n        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(\n            padding=padding,\n            truncation=truncation,\n            max_length=max_length,\n            pad_to_multiple_of=pad_to_multiple_of,\n            verbose=verbose,\n            **kwargs, )\n\n        return self._batch_encode_plus(\n            batch_text_or_text_pairs=batch_text_or_text_pairs,\n            add_special_tokens=add_special_tokens,\n            padding_strategy=padding_strategy,\n            truncation_strategy=truncation_strategy,\n            max_length=max_length,\n            stride=stride,\n            is_split_into_words=is_split_into_words,\n            pad_to_multiple_of=pad_to_multiple_of,\n            return_tensors=return_tensors,\n            return_token_type_ids=return_token_type_ids,\n            return_attention_mask=return_attention_mask,\n            return_overflowing_tokens=return_overflowing_tokens,\n            return_special_tokens_mask=return_special_tokens_mask,\n            return_offsets_mapping=return_offsets_mapping,\n            return_length=return_length,\n            verbose=verbose,\n            **kwargs, )\n\n    def _batch_encode_plus(\n            self,\n            batch_text_or_text_pairs,\n            add_special_tokens=True,\n            
padding_strategy=PaddingStrategy.DO_NOT_PAD,\n            truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,\n            max_length=None,\n            stride=0,\n            is_split_into_words=False,\n            pad_to_multiple_of=None,\n            return_tensors=None,\n            return_token_type_ids=None,\n            return_attention_mask=None,\n            return_overflowing_tokens=False,\n            return_special_tokens_mask=False,\n            return_offsets_mapping=False,\n            return_length=False,\n            verbose=True,\n            **kwargs):\n        def get_input_ids(text):\n            if isinstance(text, str):\n                tokens = self.tokenize(text, **kwargs)\n                return self.convert_tokens_to_ids(tokens)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], str):\n                if is_split_into_words:\n                    tokens = list(\n                        itertools.chain(*(self.tokenize(\n                            t, is_split_into_words=True, **kwargs)\n                                          for t in text)))\n                    return self.convert_tokens_to_ids(tokens)\n                else:\n                    return self.convert_tokens_to_ids(text)\n            elif isinstance(text,\n                            (list, tuple)) and len(text) > 0 and isinstance(\n                                text[0], int):\n                return text\n            else:\n                raise ValueError(\n                    \"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.\"\n                )\n\n        if return_offsets_mapping:\n            raise NotImplementedError(\n                \"return_offset_mapping is not available when using Python tokenizers. 
\"\n                \"To use this feature, change your tokenizer to one deriving from \"\n                \"transformers.PreTrainedTokenizerFast.\")\n\n        input_ids = []\n        for ids_or_pair_ids in batch_text_or_text_pairs:\n            if not isinstance(ids_or_pair_ids, (list, tuple)):\n                ids, pair_ids = ids_or_pair_ids, None\n            elif is_split_into_words and not isinstance(ids_or_pair_ids[0],\n                                                        (list, tuple)):\n                ids, pair_ids = ids_or_pair_ids, None\n            else:\n                ids, pair_ids = ids_or_pair_ids\n\n            first_ids = get_input_ids(ids)\n            second_ids = get_input_ids(\n                pair_ids) if pair_ids is not None else None\n            input_ids.append((first_ids, second_ids))\n\n        batch_outputs = self._batch_prepare_for_model(\n            input_ids,\n            add_special_tokens=add_special_tokens,\n            padding_strategy=padding_strategy,\n            truncation_strategy=truncation_strategy,\n            max_length=max_length,\n            stride=stride,\n            pad_to_multiple_of=pad_to_multiple_of,\n            return_attention_mask=return_attention_mask,\n            return_token_type_ids=return_token_type_ids,\n            return_overflowing_tokens=return_overflowing_tokens,\n            return_special_tokens_mask=return_special_tokens_mask,\n            return_length=return_length,\n            return_tensors=return_tensors,\n            verbose=verbose, )\n\n        return BatchEncoding(batch_outputs)\n\n    def tokenize(self, text, **kwargs):\n        \"\"\"\n        Converts a string in a sequence of tokens, using the tokenizer.\n\n        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies\n        (BPE/SentencePieces/WordPieces). 
Takes care of added tokens.\n\n        Args:\n            text (`str`):\n                The sequence to be encoded.\n            **kwargs (additional keyword arguments):\n                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.\n\n        Returns:\n            `List[str]`: The list of tokens.\n        \"\"\"\n        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors\n        all_special_tokens_extended = dict(\n            (str(t), t) for t in self.all_special_tokens_extended\n            if isinstance(t, AddedToken))\n\n        text, kwargs = self.prepare_for_tokenization(text, **kwargs)\n\n        if kwargs:\n            logger.warning(f\"Keyword arguments {kwargs} not recognized.\")\n\n        # TODO: should this be in the base class?\n        if hasattr(self, \"do_lower_case\") and self.do_lower_case:\n            # convert non-special tokens to lowercase\n            escaped_special_toks = [\n                re.escape(s_tok)\n                for s_tok in (self.unique_no_split_tokens +\n                              self.all_special_tokens)\n            ]\n            pattern = r\"(\" + r\"|\".join(escaped_special_toks) + r\")|\" + r\"(.+?)\"\n            text = re.sub(pattern,\n                          lambda m: m.groups()[0] or m.groups()[1].lower(),\n                          text)\n\n        no_split_token = set(self.unique_no_split_tokens)\n        tokens = self.tokens_trie.split(text)\n        # [\"This is something\", \"<special_token_1>\", \"  else\"]\n        for i, token in enumerate(tokens):\n            if token in no_split_token:\n                tok_extended = all_special_tokens_extended.get(token, None)\n                left = tokens[i - 1] if i > 0 else None\n                right = tokens[i + 1] if i < len(tokens) - 1 else None\n                if isinstance(tok_extended, AddedToken):\n                    if tok_extended.rstrip and right:\n                  
      # A bit counter-intuitive but we strip the left of the string\n                        # since tok_extended.rstrip means the special token is eating all white spaces on its right\n                        tokens[i + 1] = right.lstrip()\n                    # Strip white spaces on the left\n                    if tok_extended.lstrip and left:\n                        tokens[i - 1] = left.rstrip()  # Opposite here\n                else:\n                    # We strip left and right by default\n                    if right:\n                        tokens[i + 1] = right.lstrip()\n                    if left:\n                        tokens[i - 1] = left.rstrip()\n        # [\"This is something\", \"<special_token_1>\", \"else\"]\n        tokenized_text = []\n        for token in tokens:\n            # Need to skip eventual empty (fully stripped) tokens\n            if not token:\n                continue\n            if token in no_split_token:\n                tokenized_text.append(token)\n            else:\n                tokenized_text.extend(self._tokenize(token))\n        # [\"This\", \" is\", \" something\", \"<special_token_1>\", \"else\"]\n        return tokenized_text\n\n    def _tokenize(self, text):\n        \"\"\"Take as input a string and return a list of strings (tokens) for words/sub-words\"\"\"\n        return self.sp_model.encode(text, out_type=str)\n\n    def prepare_for_tokenization(self,\n                                 text,\n                                 is_split_into_words=False,\n                                 **kwargs):\n        \"\"\"\n        Performs any necessary transformations before tokenization.\n\n        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. 
We test the\n        `kwargs` at the end of the encoding process to be sure all the arguments have been used.\n\n        Args:\n            text (`str`):\n                The text to prepare.\n            is_split_into_words (`bool`, *optional*, defaults to `False`):\n                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the\n                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)\n                which it will tokenize. This is useful for NER or token classification.\n            kwargs:\n                Keyword arguments to use for the tokenization.\n\n        Returns:\n            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.\n        \"\"\"\n        return (text, kwargs)\n\n    def convert_tokens_to_ids(self, tokens):\n        \"\"\"\n        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the\n        vocabulary.\n\n        Args:\n            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).\n\n        Returns:\n            `int` or `List[int]`: The token id or list of token ids.\n        \"\"\"\n        if tokens is None:\n            return None\n\n        if isinstance(tokens, str):\n            return self._convert_token_to_id_with_added_voc(tokens)\n\n        ids = []\n        for token in tokens:\n            ids.append(self._convert_token_to_id_with_added_voc(token))\n        return ids\n\n    def _convert_token_to_id_with_added_voc(self, token):\n        if token is None:\n            return None\n\n        if token in self.added_tokens_encoder:\n            return self.added_tokens_encoder[token]\n        return self._convert_token_to_id(token)\n\n    def _convert_token_to_id(self, token):\n        \"\"\"Converts a token (str) in an id using the vocab.\"\"\"\n        if token.startswith(\"<extra_id_\"):\n            match = 
re.match(r\"<extra_id_(\\d+)>\", token)\n            num = int(match.group(1))\n            return self.vocab_size - num - 1\n        return self.sp_model.piece_to_id(token)\n\n    def num_special_tokens_to_add(self, pair=False):\n        \"\"\"\n        Returns the number of added tokens when encoding a sequence with special tokens.\n\n        <Tip>\n\n        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put\n        this inside your training loop.\n\n        </Tip>\n\n        Args:\n            pair (`bool`, *optional*, defaults to `False`):\n                Whether the number of added tokens should be computed in the case of a sequence pair or a single\n                sequence.\n\n        Returns:\n            `int`: Number of special tokens added to sequences.\n        \"\"\"\n        token_ids_0 = []\n        token_ids_1 = []\n        return len(\n            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1\n                                                  if pair else None))\n\n    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):\n        \"\"\"\n        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and\n        adding special tokens. 
A sequence has the following format:\n\n        - single sequence: `X </s>`\n        - pair of sequences: `A </s> B </s>`\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs to which the special tokens will be added.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n\n        Returns:\n            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.\n        \"\"\"\n        token_ids_0 = self._add_eos_if_not_present(token_ids_0)\n        if token_ids_1 is None:\n            return token_ids_0\n        else:\n            token_ids_1 = self._add_eos_if_not_present(token_ids_1)\n            return token_ids_0 + token_ids_1\n\n    @staticmethod\n    def _eventually_correct_t5_max_length(pretrained_model_name_or_path,\n                                          max_model_length,\n                                          init_max_model_length):\n        if pretrained_model_name_or_path in T5Tokenizer.max_model_input_sizes:\n            deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[\n                pretrained_model_name_or_path]\n            if init_max_model_length is not None and init_max_model_length != max_model_length:\n                return init_max_model_length\n            elif init_max_model_length is None:\n                warnings.warn(\n                    \"This tokenizer was incorrectly instantiated with a model max length of\"\n                    f\" {deprecated_max_model_length} which will be corrected in Transformers v5.\\nFor now, this\"\n                    \" behavior is kept to avoid breaking backwards compatibility when padding/encoding with\"\n                    \" `truncation is True`.\\n- Be aware that you SHOULD NOT rely on\"\n                    f\" {pretrained_model_name_or_path} automatically truncating your input to\"\n                    f\" {deprecated_max_model_length} when 
padding/encoding.\\n- If you want to encode/pad to sequences\"\n                    f\" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with\"\n                    \" `model_max_length` or pass `max_length` when encoding/padding.\\n- To avoid this warning, please\"\n                    \" instantiate this tokenizer with `model_max_length` set to your preferred value.\",\n                    FutureWarning, )\n\n        return max_model_length\n\n    @property\n    def vocab_size(self):\n        return self.sp_model.get_piece_size() + self._extra_ids\n\n    def get_vocab(self):\n        vocab = {\n            self.convert_ids_to_tokens(i): i\n            for i in range(self.vocab_size)\n        }\n        vocab.update(self.added_tokens_encoder)\n        return vocab\n\n    def get_special_tokens_mask(self,\n                                token_ids_0,\n                                token_ids_1=None,\n                                already_has_special_tokens=False):\n        \"\"\"\n        Retrieve sequence ids from a token list that has no special tokens added. 
This method is called when adding\n        special tokens using the tokenizer `prepare_for_model` method.\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n            already_has_special_tokens (`bool`, *optional*, defaults to `False`):\n                Whether or not the token list is already formatted with special tokens for the model.\n\n        Returns:\n            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.\n        \"\"\"\n        if already_has_special_tokens:\n            return super().get_special_tokens_mask(\n                token_ids_0=token_ids_0,\n                token_ids_1=token_ids_1,\n                already_has_special_tokens=True)\n\n        # normal case: some special tokens\n        if token_ids_1 is None:\n            return ([0] * len(token_ids_0)) + [1]\n        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]\n\n    def _add_eos_if_not_present(self, token_ids):\n        \"\"\"Do not add eos again if user already added it.\"\"\"\n        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:\n            warnings.warn(\n                f\"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated\"\n                \" eos tokens being added.\")\n            return token_ids\n        else:\n            return token_ids + [self.eos_token_id]\n\n    def create_token_type_ids_from_sequences(self,\n                                             token_ids_0,\n                                             token_ids_1=None):\n        \"\"\"\n        Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
T5 does not make\n        use of token type ids, therefore a list of zeros is returned.\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n\n        Returns:\n            `List[int]` of zeros.\n        \"\"\"\n        eos = [self.eos_token_id]\n\n        if token_ids_1 is None:\n            return len(token_ids_0 + eos) * [0]\n        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]\n\n    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):\n        \"\"\"\n        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and\n        adding special tokens. A sequence has the following format:\n\n        - single sequence: `X </s>`\n        - pair of sequences: `A </s> B </s>`\n\n        Args:\n            token_ids_0 (`List[int]`):\n                List of IDs to which the special tokens will be added.\n            token_ids_1 (`List[int]`, *optional*):\n                Optional second list of IDs for sequence pairs.\n\n        Returns:\n            `List[int]` of [input IDs](../glossary#input-ids) with the appropriate special tokens.\n        \"\"\"\n        token_ids_0 = self._add_eos_if_not_present(token_ids_0)\n        if token_ids_1 is None:\n            return token_ids_0\n        else:\n            token_ids_1 = self._add_eos_if_not_present(token_ids_1)\n            return token_ids_0 + token_ids_1\n\n    def __getstate__(self):\n        state = self.__dict__.copy()\n        state[\"sp_model\"] = None\n        return state\n\n    def __setstate__(self, d):\n        self.__dict__ = d\n\n        # for backward compatibility\n        if not hasattr(self, \"sp_model_kwargs\"):\n            self.sp_model_kwargs = {}\n\n        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)\n        
self.sp_model.Load(self.vocab_file)\n\n    def _tokenize(self, text: str):\n        \"\"\"Take as input a string and return a list of strings (tokens) for words/sub-words\"\"\"\n        return self.sp_model.encode(text, out_type=str)\n\n    def _convert_token_to_id(self, token):\n        \"\"\"Converts a token (str) in an id using the vocab.\"\"\"\n        if token.startswith(\"<extra_id_\"):\n            match = re.match(r\"<extra_id_(\\d+)>\", token)\n            num = int(match.group(1))\n            return self.vocab_size - num - 1\n        return self.sp_model.piece_to_id(token)\n\n    def _convert_id_to_token(self, index):\n        \"\"\"Converts an index (integer) in a token (str) using the vocab.\"\"\"\n        if index < self.sp_model.get_piece_size():\n            token = self.sp_model.IdToPiece(index)\n        else:\n            token = f\"<extra_id_{self.vocab_size - 1 - index}>\"\n        return token\n\n    def convert_tokens_to_string(self, tokens):\n        \"\"\"Converts a sequence of tokens (string) in a single string.\"\"\"\n        current_sub_tokens = []\n        out_string = \"\"\n        for token in tokens:\n            # make sure that special tokens are not decoded using sentencepiece model\n            if token in self.all_special_tokens:\n                out_string += self.sp_model.decode_pieces(\n                    current_sub_tokens) + token + \" \"\n                current_sub_tokens = []\n            else:\n                current_sub_tokens.append(token)\n        out_string += self.sp_model.decode_pieces(current_sub_tokens)\n        return out_string.strip()\n\n    def save_vocabulary(self, save_directory, filename_prefix=None):\n        if not os.path.isdir(save_directory):\n            logger.error(\n                f\"Vocabulary path ({save_directory}) should be a directory\")\n            return\n        out_vocab_file = os.path.join(\n            save_directory,\n            (filename_prefix + \"-\"\n             if 
filename_prefix else \"\") + VOCAB_FILES_NAMES[\"vocab_file\"])\n\n        if os.path.abspath(self.vocab_file) != os.path.abspath(\n                out_vocab_file) and os.path.isfile(self.vocab_file):\n            copyfile(self.vocab_file, out_vocab_file)\n        elif not os.path.isfile(self.vocab_file):\n            with open(out_vocab_file, \"wb\") as fi:\n                content_spiece_model = self.sp_model.serialized_model_proto()\n                fi.write(content_spiece_model)\n\n        return (out_vocab_file, )\n\n    @classmethod\n    def _dict_from_json_file(cls, json_file):\n        with open(json_file, \"r\", encoding=\"utf-8\") as reader:\n            text = reader.read()\n        return json.loads(text)\n"
  },
  {
    "path": "ppfleetx/data/tokenizers/tokenization_utils_base.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2020 The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nBase classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user\nfronting encoding methods) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary\nof output with special method for the Fast tokenizers)\n\"\"\"\n\nimport copy\nimport json\nimport os\nimport re\nimport warnings\nfrom collections import OrderedDict, UserDict\nfrom collections.abc import Mapping\nfrom contextlib import contextmanager\nfrom dataclasses import dataclass, field\nfrom typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union\nimport importlib\n\nimport numpy as np\n\n\ndef is_sentencepiece_available():\n    return importlib.util.find_spec(\"sentencepiece\") is not None\n\n\ndef is_tokenizers_available():\n    return importlib.util.find_spec(\"tokenizers\") is not None\n\n\nif is_tokenizers_available():\n    from tokenizers import AddedToken\nelse:\n\n    @dataclass(frozen=True, eq=True)\n    class AddedToken:\n        \"\"\"\n        AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the\n        way it should behave.\n        \"\"\"\n\n        content: str = field(default_factory=str)\n        single_word: bool = False\n        
lstrip: bool = False\n        rstrip: bool = False\n        normalized: bool = True\n\n        def __getstate__(self):\n            return self.__dict__\n\n\nTOKENIZER_MAPPING_NAMES = OrderedDict([\n    (\n        \"albert\",\n        (\n            \"AlbertTokenizer\" if is_sentencepiece_available() else None,\n            \"AlbertTokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\"bart\", (\"BartTokenizer\", \"BartTokenizerFast\")),\n    (\n        \"barthez\",\n        (\n            \"BarthezTokenizer\" if is_sentencepiece_available() else None,\n            \"BarthezTokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\"bartpho\", (\"BartphoTokenizer\", None)),\n    (\"bert\", (\"BertTokenizer\", \"BertTokenizerFast\"\n              if is_tokenizers_available() else None)),\n    (\"bert-generation\", (\"BertGenerationTokenizer\"\n                         if is_sentencepiece_available() else None, None)),\n    (\"bert-japanese\", (\"BertJapaneseTokenizer\", None)),\n    (\"bertweet\", (\"BertweetTokenizer\", None)),\n    (\n        \"big_bird\",\n        (\n            \"BigBirdTokenizer\" if is_sentencepiece_available() else None,\n            \"BigBirdTokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\"bigbird_pegasus\", (\"PegasusTokenizer\", \"PegasusTokenizerFast\"\n                         if is_tokenizers_available() else None)),\n    (\"blenderbot\", (\"BlenderbotTokenizer\", \"BlenderbotTokenizerFast\")),\n    (\"blenderbot-small\", (\"BlenderbotSmallTokenizer\", None)),\n    (\"bloom\", (None, \"BloomTokenizerFast\"\n               if is_tokenizers_available() else None)),\n    (\"byt5\", (\"ByT5Tokenizer\", None)),\n    (\n        \"camembert\",\n        (\n            \"CamembertTokenizer\" if is_sentencepiece_available() else None,\n            \"CamembertTokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\"canine\", (\"CanineTokenizer\", 
None)),\n    (\n        \"clip\",\n        (\n            \"CLIPTokenizer\",\n            \"CLIPTokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\"convbert\", (\"ConvBertTokenizer\", \"ConvBertTokenizerFast\"\n                  if is_tokenizers_available() else None)),\n    (\n        \"cpm\",\n        (\n            \"CpmTokenizer\" if is_sentencepiece_available() else None,\n            \"CpmTokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\"ctrl\", (\"CTRLTokenizer\", None)),\n    (\"data2vec-text\", (\"RobertaTokenizer\", \"RobertaTokenizerFast\"\n                       if is_tokenizers_available() else None)),\n    (\"deberta\", (\"DebertaTokenizer\", \"DebertaTokenizerFast\"\n                 if is_tokenizers_available() else None)),\n    (\n        \"deberta-v2\",\n        (\n            \"DebertaV2Tokenizer\" if is_sentencepiece_available() else None,\n            \"DebertaV2TokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\"distilbert\", (\"DistilBertTokenizer\", \"DistilBertTokenizerFast\"\n                    if is_tokenizers_available() else None)),\n    (\n        \"dpr\",\n        (\n            \"DPRQuestionEncoderTokenizer\",\n            \"DPRQuestionEncoderTokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\"electra\", (\"ElectraTokenizer\", \"ElectraTokenizerFast\"\n                 if is_tokenizers_available() else None)),\n    (\"flaubert\", (\"FlaubertTokenizer\", None)),\n    (\"fnet\", (\"FNetTokenizer\", \"FNetTokenizerFast\"\n              if is_tokenizers_available() else None)),\n    (\"fsmt\", (\"FSMTTokenizer\", None)),\n    (\"funnel\", (\"FunnelTokenizer\", \"FunnelTokenizerFast\"\n                if is_tokenizers_available() else None)),\n    (\"gpt2\", (\"GPT2Tokenizer\", \"GPT2TokenizerFast\"\n              if is_tokenizers_available() else None)),\n    (\"gpt_neo\", (\"GPT2Tokenizer\", \"GPT2TokenizerFast\"\n                 if 
is_tokenizers_available() else None)),\n    (\"gpt_neox\", (None, \"GPTNeoXTokenizerFast\"\n                  if is_tokenizers_available() else None)),\n    (\"gptj\", (\"GPT2Tokenizer\", \"GPT2TokenizerFast\"\n              if is_tokenizers_available() else None)),\n    (\"herbert\", (\"HerbertTokenizer\", \"HerbertTokenizerFast\"\n                 if is_tokenizers_available() else None)),\n    (\"hubert\", (\"Wav2Vec2CTCTokenizer\", None)),\n    (\"ibert\", (\"RobertaTokenizer\", \"RobertaTokenizerFast\"\n               if is_tokenizers_available() else None)),\n    (\"layoutlm\", (\"LayoutLMTokenizer\", \"LayoutLMTokenizerFast\"\n                  if is_tokenizers_available() else None)),\n    (\"layoutlmv2\", (\"LayoutLMv2Tokenizer\", \"LayoutLMv2TokenizerFast\"\n                    if is_tokenizers_available() else None)),\n    (\"layoutlmv3\", (\"LayoutLMv3Tokenizer\", \"LayoutLMv3TokenizerFast\"\n                    if is_tokenizers_available() else None)),\n    (\"layoutxlm\", (\"LayoutXLMTokenizer\", \"LayoutXLMTokenizerFast\"\n                   if is_tokenizers_available() else None)),\n    (\"led\", (\"LEDTokenizer\", \"LEDTokenizerFast\"\n             if is_tokenizers_available() else None)),\n    (\"longformer\", (\"LongformerTokenizer\", \"LongformerTokenizerFast\"\n                    if is_tokenizers_available() else None)),\n    (\n        \"longt5\",\n        (\n            \"T5Tokenizer\" if is_sentencepiece_available() else None,\n            \"T5TokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\"luke\", (\"LukeTokenizer\", None)),\n    (\"lxmert\", (\"LxmertTokenizer\", \"LxmertTokenizerFast\"\n                if is_tokenizers_available() else None)),\n    (\"m2m_100\", (\"M2M100Tokenizer\"\n                 if is_sentencepiece_available() else None, None)),\n    (\"marian\", (\"MarianTokenizer\"\n                if is_sentencepiece_available() else None, None)),\n    (\n        \"mbart\",\n        (\n            
\"MBartTokenizer\" if is_sentencepiece_available() else None,\n            \"MBartTokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\n        \"mbart50\",\n        (\n            \"MBart50Tokenizer\" if is_sentencepiece_available() else None,\n            \"MBart50TokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\"megatron-bert\", (\"BertTokenizer\", \"BertTokenizerFast\"\n                       if is_tokenizers_available() else None)),\n    (\"mluke\", (\"MLukeTokenizer\"\n               if is_sentencepiece_available() else None, None)),\n    (\"mobilebert\", (\"MobileBertTokenizer\", \"MobileBertTokenizerFast\"\n                    if is_tokenizers_available() else None)),\n    (\"mpnet\", (\"MPNetTokenizer\", \"MPNetTokenizerFast\"\n               if is_tokenizers_available() else None)),\n    (\n        \"mt5\",\n        (\n            \"MT5Tokenizer\" if is_sentencepiece_available() else None,\n            \"MT5TokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\n        \"nystromformer\",\n        (\n            \"AlbertTokenizer\" if is_sentencepiece_available() else None,\n            \"AlbertTokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\"openai-gpt\", (\"OpenAIGPTTokenizer\", \"OpenAIGPTTokenizerFast\"\n                    if is_tokenizers_available() else None)),\n    (\"opt\", (\"GPT2Tokenizer\", None)),\n    (\n        \"pegasus\",\n        (\n            \"PegasusTokenizer\" if is_sentencepiece_available() else None,\n            \"PegasusTokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\n        \"perceiver\",\n        (\n            \"PerceiverTokenizer\",\n            None, ), ),\n    (\"phobert\", (\"PhobertTokenizer\", None)),\n    (\"plbart\", (\"PLBartTokenizer\"\n                if is_sentencepiece_available() else None, None)),\n    (\"prophetnet\", (\"ProphetNetTokenizer\", None)),\n    (\"qdqbert\", (\"BertTokenizer\", 
\"BertTokenizerFast\"\n                 if is_tokenizers_available() else None)),\n    (\"rag\", (\"RagTokenizer\", None)),\n    (\"realm\", (\"RealmTokenizer\", \"RealmTokenizerFast\"\n               if is_tokenizers_available() else None)),\n    (\n        \"reformer\",\n        (\n            \"ReformerTokenizer\" if is_sentencepiece_available() else None,\n            \"ReformerTokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\n        \"rembert\",\n        (\n            \"RemBertTokenizer\" if is_sentencepiece_available() else None,\n            \"RemBertTokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\"retribert\", (\"RetriBertTokenizer\", \"RetriBertTokenizerFast\"\n                   if is_tokenizers_available() else None)),\n    (\"roberta\", (\"RobertaTokenizer\", \"RobertaTokenizerFast\"\n                 if is_tokenizers_available() else None)),\n    (\"roformer\", (\"RoFormerTokenizer\", \"RoFormerTokenizerFast\"\n                  if is_tokenizers_available() else None)),\n    (\"speech_to_text\", (\"Speech2TextTokenizer\"\n                        if is_sentencepiece_available() else None, None)),\n    (\"speech_to_text_2\", (\"Speech2Text2Tokenizer\", None)),\n    (\"splinter\", (\"SplinterTokenizer\", \"SplinterTokenizerFast\")),\n    (\n        \"squeezebert\",\n        (\"SqueezeBertTokenizer\", \"SqueezeBertTokenizerFast\"\n         if is_tokenizers_available() else None), ),\n    (\n        \"t5\",\n        (\n            \"T5Tokenizer\" if is_sentencepiece_available() else None,\n            \"T5TokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\"tapas\", (\"TapasTokenizer\", None)),\n    (\"tapex\", (\"TapexTokenizer\", None)),\n    (\"transfo-xl\", (\"TransfoXLTokenizer\", None)),\n    (\"vilt\", (\"BertTokenizer\", \"BertTokenizerFast\"\n              if is_tokenizers_available() else None)),\n    (\"visual_bert\", (\"BertTokenizer\", \"BertTokenizerFast\"\n  
                   if is_tokenizers_available() else None)),\n    (\"wav2vec2\", (\"Wav2Vec2CTCTokenizer\", None)),\n    (\"wav2vec2-conformer\", (\"Wav2Vec2CTCTokenizer\", None)),\n    (\"wav2vec2_phoneme\", (\"Wav2Vec2PhonemeCTCTokenizer\", None)),\n    (\n        \"xglm\",\n        (\n            \"XGLMTokenizer\" if is_sentencepiece_available() else None,\n            \"XGLMTokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\"xlm\", (\"XLMTokenizer\", None)),\n    (\"xlm-prophetnet\", (\"XLMProphetNetTokenizer\"\n                        if is_sentencepiece_available() else None, None)),\n    (\n        \"xlm-roberta\",\n        (\n            \"XLMRobertaTokenizer\" if is_sentencepiece_available() else None,\n            \"XLMRobertaTokenizerFast\"\n            if is_tokenizers_available() else None, ), ),\n    (\"xlm-roberta-xl\", (\"RobertaTokenizer\", \"RobertaTokenizerFast\"\n                        if is_tokenizers_available() else None)),\n    (\n        \"xlnet\",\n        (\n            \"XLNetTokenizer\" if is_sentencepiece_available() else None,\n            \"XLNetTokenizerFast\" if is_tokenizers_available() else None, ), ),\n    (\n        \"yoso\",\n        (\n            \"AlbertTokenizer\" if is_sentencepiece_available() else None,\n            \"AlbertTokenizerFast\" if is_tokenizers_available() else None, ), ),\n])\n\nSPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([\n    (\"openai-gpt\", \"openai\"),\n    (\"data2vec-audio\", \"data2vec\"),\n    (\"data2vec-text\", \"data2vec\"),\n    (\"data2vec-vision\", \"data2vec\"),\n])\n\n\ndef model_type_to_module_name(key):\n    \"\"\"Converts a config key to the corresponding module.\"\"\"\n    # Special treatment\n    if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:\n        return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]\n\n    return key.replace(\"-\", \"_\")\n\n\nclass _LazyConfigMapping(OrderedDict):\n    \"\"\"\n    A dictionary that lazily load its values when they are requested.\n   
 \"\"\"\n\n    def __init__(self, mapping):\n        self._mapping = mapping\n        self._extra_content = {}\n        self._modules = {}\n\n    def __getitem__(self, key):\n        if key in self._extra_content:\n            return self._extra_content[key]\n        if key not in self._mapping:\n            raise KeyError(key)\n        value = self._mapping[key]\n        module_name = model_type_to_module_name(key)\n        if module_name not in self._modules:\n\n            self._modules[module_name] = importlib.import_module(\n                f\".{module_name}\", \"transformers.models\")\n        if hasattr(self._modules[module_name], value):\n            return getattr(self._modules[module_name], value)\n\n        # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the\n        # object at the top level.\n        transformers_module = importlib.import_module(\"transformers\")\n        return getattr(transformers_module, value)\n\n    def keys(self):\n        return list(self._mapping.keys()) + list(self._extra_content.keys())\n\n    def values(self):\n        return [self[k] for k in self._mapping.keys()] + list(\n            self._extra_content.values())\n\n    def items(self):\n        return [(k, self[k]) for k in self._mapping.keys()] + list(\n            self._extra_content.items())\n\n    def __iter__(self):\n        return iter(\n            list(self._mapping.keys()) + list(self._extra_content.keys()))\n\n    def __contains__(self, item):\n        return item in self._mapping or item in self._extra_content\n\n    def register(self, key, value):\n        \"\"\"\n        Register a new configuration in this mapping.\n        \"\"\"\n        if key in self._mapping.keys():\n            raise ValueError(\n                f\"'{key}' is already used by a Transformers config, pick another name.\"\n            )\n        self._extra_content[key] = value\n\n\nclass Trie:\n    \"\"\"\n    Trie in Python. 
Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass\n    Loose reference https://en.wikipedia.org/wiki/Trie\n    \"\"\"\n\n    def __init__(self):\n        self.data = {}\n\n    def add(self, word: str):\n        \"\"\"\n        Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.\n        The special key `\"\"` is used to represent termination.\n\n        This function is idempotent, adding twice the same word will leave the trie unchanged\n\n        Example:\n\n        ```python\n        >>> trie = Trie()\n        >>> trie.add(\"Hello 友達\")\n        >>> trie.data\n        {\"H\": {\"e\": {\"l\": {\"l\": {\"o\": {\" \": {\"友\": {\"達\": {\"\": 1}}}}}}}}}\n\n        >>> trie.add(\"Hello\")\n        >>> trie.data\n        {\"H\": {\"e\": {\"l\": {\"l\": {\"o\": {\"\": 1, \" \": {\"友\": {\"達\": {\"\": 1}}}}}}}}}\n        ```\n        \"\"\"\n        if not word:\n            # Prevent empty string\n            return\n        ref = self.data\n        for char in word:\n            ref[char] = char in ref and ref[char] or {}\n            ref = ref[char]\n        ref[\"\"] = 1\n\n    def split(self, text: str) -> List[str]:\n        \"\"\"\n        Will look for the words added to the trie within `text`. 
Output is the original string splitted along the\n        boundaries of the words found.\n\n        This trie will match the longest possible word first !\n\n        Example:\n\n        ```python\n        >>> trie = Trie()\n        >>> trie.split(\"[CLS] This is a extra_id_100\")\n        [\"[CLS] This is a extra_id_100\"]\n\n        >>> trie.add(\"[CLS]\")\n        >>> trie.add(\"extra_id_1\")\n        >>> trie.add(\"extra_id_100\")\n        >>> trie.split(\"[CLS] This is a extra_id_100\")\n        [\"[CLS]\", \" This is a \", \"extra_id_100\"]\n        ```\n        \"\"\"\n        # indexes are counted left of the chars index.\n        # \"hello\", index 0, is left of h, index 1 is between h and e.\n        # index 5 is right of the \"o\".\n\n        # States are going to capture every possible start (indexes as above)\n        # as keys, and have as values, a pointer to the position in the trie\n        # where we're at. This is a partial match for now.\n        # This enables to keep track of multiple matches while we're iterating\n        # the string\n        # If the trie contains, \"blowing\", and \"lower\" and we encounter the\n        # string \"blower\", we need to split into [\"b\", \"lower\"].\n        # This is where we need to keep track of multiple possible starts.\n        states = OrderedDict()\n\n        # This will contain every indices where we need\n        # to cut.\n        # We force to cut at offset 0 and len(text) (added later)\n        offsets = [0]\n\n        # This is used by the lookahead which needs to skip over\n        # some text where the full match exceeded the place in the initial\n        # for loop\n        skip = 0\n        # Main loop, Giving this algorithm O(n) complexity\n        for current, current_char in enumerate(text):\n            if skip and current < skip:\n                # Prevents the lookahead for matching twice\n                # like extra_id_100 and id_100\n                continue\n\n            # This 
will track every state\n            # that stop matching, we need to stop tracking them.\n            # If we look at \"lowball\", we're going to match \"l\" (add it to states), \"o\", \"w\", then\n            # fail on \"b\", we need to remove 0 from the valid states.\n            to_remove = set()\n            # Whenever we found a match, we need to drop everything\n            # this is a greedy algorithm, it will match on the first found token\n            reset = False\n\n            # In this case, we already have partial matches (But unfinished)\n            for start, trie_pointer in states.items():\n                if \"\" in trie_pointer:\n                    # This is a final match, we need to reset and\n                    # store the results in `offsets`.\n\n                    # Lookahead to match longest first\n                    # Important in case of extra_id_1 vs extra_id_100\n                    # Here we are also actively looking for other earlier partial\n                    # matches\n                    # \"[CLS]\", \"L\", we need to match CLS even if L is special\n                    for lookstart, looktrie_pointer in states.items():\n                        if lookstart > start:\n                            # This partial match is later, we can stop looking\n                            break\n                        elif lookstart < start:\n                            # This partial match is earlier, the trie pointer\n                            # was already updated, so index is + 1\n                            lookahead_index = current + 1\n                            end = current + 1\n                        else:\n                            # Here lookstart == start and\n                            #      looktrie_pointer == trie_pointer\n                            # It wasn't updated yet so indices are current ones\n                            lookahead_index = current\n                            end = current\n                    
    next_char = text[\n                            lookahead_index] if lookahead_index < len(\n                                text) else None\n                        if \"\" in looktrie_pointer:\n                            start = lookstart\n                            end = lookahead_index\n                            skip = lookahead_index\n\n                        while next_char in looktrie_pointer:\n                            looktrie_pointer = looktrie_pointer[next_char]\n                            lookahead_index += 1\n                            if \"\" in looktrie_pointer:\n                                start = lookstart\n                                end = lookahead_index\n                                skip = lookahead_index\n\n                            if lookahead_index == len(text):\n                                # End of string\n                                break\n                            next_char = text[lookahead_index]\n                        # End lookahead\n\n                        # Storing and resetting\n                    offsets.append(start)\n                    offsets.append(end)\n                    reset = True\n                    break\n                elif current_char in trie_pointer:\n                    # The current character being looked at has a match within the trie\n                    # update the pointer (it will be stored back into states later).\n                    trie_pointer = trie_pointer[current_char]\n\n                    # Storing back the new pointer into the states.\n                    # Partial matches got longer by one.\n                    states[start] = trie_pointer\n                else:\n                    # The new character has not match in the trie, we need\n                    # to stop keeping track of this partial match.\n                    # We can't do it directly within the loop because of how\n                    # python iteration works\n                    
to_remove.add(start)\n\n            # Either clearing the full start (we found a real match)\n            # Or clearing only the partial matches that didn't work.\n            if reset:\n                states = {}\n            else:\n                for start in to_remove:\n                    del states[start]\n\n            # If this character is a starting character within the trie\n            # start keeping track of this partial match.\n            if current >= skip and current_char in self.data:\n                states[current] = self.data[current_char]\n\n        # We have a cut at the end with states.\n        for start, trie_pointer in states.items():\n            if \"\" in trie_pointer:\n                # This is a final match, we need to reset and\n                # store the results in `offsets`.\n                end = len(text)\n                offsets.append(start)\n                offsets.append(end)\n                # Longest cut is always the one with lower start so the first\n                # item so we need to break.\n                break\n\n        return self.cut_text(text, offsets)\n\n    def cut_text(self, text, offsets):\n        # We have all the offsets now, we just need to do the actual splitting.\n        # We need to eventually add the first part of the string and the eventual\n        # last part.\n        offsets.append(len(text))\n        tokens = []\n        start = 0\n        for end in offsets:\n            if start > end:\n                logger.error(\n                    \"There was a bug in Trie algorithm in tokenization. Attempting to recover. 
Please report it\"\n                    \" anyway.\")\n                continue\n            elif start == end:\n                # This might happen if there's a match at index 0\n                # we're also preventing zero-width cuts in case of two\n                # consecutive matches\n                continue\n            tokens.append(text[start:end])\n            start = end\n\n        return tokens\n\n\nfrom enum import Enum\n\n\nclass ExplicitEnum(Enum):\n    \"\"\"\n    Enum with more explicit error message for missing values.\n    \"\"\"\n\n    @classmethod\n    def _missing_(cls, value):\n        raise ValueError(\n            f\"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}\"\n        )\n\n\nclass TensorType(ExplicitEnum):\n    \"\"\"\n    Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for\n    tab-completion in an IDE.\n    \"\"\"\n\n    PADDLE = \"paddle\"\n    PYTORCH = \"pt\"\n    TENSORFLOW = \"tf\"\n    NUMPY = \"np\"\n    JAX = \"jax\"\n\n\nclass BatchEncoding(UserDict):\n    \"\"\"\n    Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`],\n    [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and\n    [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc).\n\n    This class is derived from a python dictionary and can be used as a dictionary. 
In addition, this class exposes\n    utility methods to map from word/character space to token space.\n\n    Args:\n        data (`dict`):\n            Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods\n            ('input_ids', 'attention_mask', etc.).\n        encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*):\n            If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character\n            space to token space the `tokenizers.Encoding` instance or list of instance (for batches) hold this\n            information.\n        tensor_type (`Union[None, str, TensorType]`, *optional*):\n            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at\n            initialization.\n        prepend_batch_axis (`bool`, *optional*, defaults to `False`):\n            Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).\n        n_sequences (`Optional[int]`, *optional*):\n            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at\n            initialization.\n    \"\"\"\n\n    def __init__(\n            self,\n            data=None,\n            encoding=None,\n            tensor_type=None,\n            prepend_batch_axis: bool=False,\n            n_sequences=None, ):\n        super().__init__(data)\n\n        #if isinstance(encoding, EncodingFast):\n        #    encoding = [encoding]\n\n        self._encodings = encoding\n\n        if n_sequences is None and encoding is not None and len(encoding):\n            n_sequences = encoding[0].n_sequences\n\n        self._n_sequences = n_sequences\n\n        self.convert_to_tensors(\n            tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)\n\n    @property\n    def n_sequences(self) -> Optional[int]:\n        \"\"\"\n        
`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this\n        [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of\n        sentences)\n        \"\"\"\n        return self._n_sequences\n\n    @property\n    def is_fast(self) -> bool:\n        \"\"\"\n        `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`]\n        or not.\n        \"\"\"\n        return self._encodings is not None\n\n# def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:\n\n    def __getitem__(self, item):\n        \"\"\"\n        If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',\n        etc.).\n\n        If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.\n        \"\"\"\n        if isinstance(item, str):\n            return self.data[item]\n        elif self._encodings is not None:\n            return self._encodings[item]\n        else:\n            raise KeyError(\n                \"Indexing with integers (to access backend Encoding for a given batch index) \"\n                \"is not available when using Python based tokenizers\")\n\n    def __getattr__(self, item: str):\n        try:\n            return self.data[item]\n        except KeyError:\n            raise AttributeError\n\n    def __getstate__(self):\n        return {\"data\": self.data, \"encodings\": self._encodings}\n\n    def __setstate__(self, state):\n        if \"data\" in state:\n            self.data = state[\"data\"]\n\n        if \"encodings\" in state:\n            self._encodings = state[\"encodings\"]\n\n    def keys(self):\n        return self.data.keys()\n\n    def values(self):\n        return self.data.values()\n\n    def items(self):\n        return self.data.items()\n\n    # After this point:\n    # Extended properties and methods only 
available for fast (Rust-based) tokenizers\n    # provided by HuggingFace tokenizers library.\n\n    @property\n    def encodings(self):\n        \"\"\"\n        `Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns `None` if\n        the input was tokenized through Python (i.e., not a fast) tokenizer.\n        \"\"\"\n        return self._encodings\n\n    def tokens(self, batch_index=0):\n        \"\"\"\n        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to\n        integer indices) at a given batch index (only works for the output of a fast tokenizer).\n\n        Args:\n            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.\n\n        Returns:\n            `List[str]`: The list of tokens at that index.\n        \"\"\"\n        if not self._encodings:\n            raise ValueError(\n                \"tokens() is not available when using Python-based tokenizers\")\n        return self._encodings[batch_index].tokens\n\n    def sequence_ids(self, batch_index=0):\n        \"\"\"\n        Return a list mapping the tokens to the id of their original sentences:\n\n            - `None` for special tokens added around or between sequences,\n            - `0` for tokens corresponding to words in the first sequence,\n            - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly\n              encoded.\n\n        Args:\n            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.\n\n        Returns:\n            `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. 
Special tokens added\n            by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding\n            sequence.\n        \"\"\"\n        if not self._encodings:\n            raise ValueError(\n                \"sequence_ids() is not available when using Python-based tokenizers\"\n            )\n        return self._encodings[batch_index].sequence_ids\n\n    def words(self, batch_index=0):\n        \"\"\"\n        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.\n\n        Args:\n            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.\n\n        Returns:\n            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the\n            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word\n            (several tokens will be mapped to the same word index if they are parts of that word).\n        \"\"\"\n        if not self._encodings:\n            raise ValueError(\n                \"words() is not available when using Python-based tokenizers\")\n        warnings.warn(\n            \"`BatchEncoding.words()` property is deprecated and should be replaced with the identical, \"\n            \"but more self-explanatory `BatchEncoding.word_ids()` property.\",\n            FutureWarning, )\n        return self.word_ids(batch_index)\n\n    def word_ids(self, batch_index: int=0) -> List[Optional[int]]:\n        \"\"\"\n        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.\n\n        Args:\n            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.\n\n        Returns:\n            `List[Optional[int]]`: A list indicating the word corresponding to each token. 
Special tokens added by the\n            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word\n            (several tokens will be mapped to the same word index if they are parts of that word).\n        \"\"\"\n        if not self._encodings:\n            raise ValueError(\n                \"word_ids() is not available when using Python-based tokenizers\"\n            )\n        return self._encodings[batch_index].word_ids\n\n    def token_to_sequence(self, batch_or_token_index, token_index):\n        \"\"\"\n        Get the index of the sequence represented by the given token. In the general use case, this method returns `0`\n        for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair\n\n        Can be called as:\n\n        - `self.token_to_sequence(token_index)` if batch size is 1\n        - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1\n\n        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,\n        words are defined by the user). In this case it allows to easily associate encoded tokens with provided\n        tokenized words.\n\n        Args:\n            batch_or_token_index (`int`):\n                Index of the sequence in the batch. 
If the batch only comprises one sequence, this can be the index of\n                the token in the sequence.\n            token_index (`int`, *optional*):\n                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the\n                sequence.\n\n        Returns:\n            `int`: Index of the word in the input sequence.\n        \"\"\"\n\n        if not self._encodings:\n            raise ValueError(\n                \"token_to_sequence() is not available when using Python based tokenizers\"\n            )\n        if token_index is not None:\n            batch_index = batch_or_token_index\n        else:\n            batch_index = 0\n            token_index = batch_or_token_index\n        if batch_index < 0:\n            batch_index = self._batch_size + batch_index\n        if token_index < 0:\n            token_index = self._seq_len + token_index\n        return self._encodings[batch_index].token_to_sequence(token_index)\n\n    def token_to_word(self, batch_or_token_index, token_index=None):\n        \"\"\"\n        Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.\n\n        Can be called as:\n\n        - `self.token_to_word(token_index)` if batch size is 1\n        - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1\n\n        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,\n        words are defined by the user). In this case it allows to easily associate encoded tokens with provided\n        tokenized words.\n\n        Args:\n            batch_or_token_index (`int`):\n                Index of the sequence in the batch. 
If the batch only comprise one sequence, this can be the index of\n                the token in the sequence.\n            token_index (`int`, *optional*):\n                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the\n                sequence.\n\n        Returns:\n            `int`: Index of the word in the input sequence.\n        \"\"\"\n\n        if not self._encodings:\n            raise ValueError(\n                \"token_to_word() is not available when using Python based tokenizers\"\n            )\n        if token_index is not None:\n            batch_index = batch_or_token_index\n        else:\n            batch_index = 0\n            token_index = batch_or_token_index\n        if batch_index < 0:\n            batch_index = self._batch_size + batch_index\n        if token_index < 0:\n            token_index = self._seq_len + token_index\n        return self._encodings[batch_index].token_to_word(token_index)\n\n    def word_to_tokens(self,\n                       batch_or_word_index,\n                       word_index=None,\n                       sequence_index=0):\n        \"\"\"\n        Get the encoded token span corresponding to a word in a sequence of the batch.\n\n        Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:\n\n        - **start** -- Index of the first token.\n        - **end** -- Index of the token following the last token.\n\n        Can be called as:\n\n        - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1\n        - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to\n          1\n\n        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words\n        are defined by the user). 
In this case it allows to easily associate encoded tokens with provided tokenized\n        words.\n\n        Args:\n            batch_or_word_index (`int`):\n                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of\n                the word in the sequence.\n            word_index (`int`, *optional*):\n                If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the\n                sequence.\n            sequence_index (`int`, *optional*, defaults to 0):\n                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0\n                or 1) the provided word index belongs to.\n\n        Returns:\n            Optional [`~tokenization_utils_base.TokenSpan`] Span of tokens in the encoded sequence. Returns `None` if\n            no tokens correspond to the word.\n        \"\"\"\n\n        if not self._encodings:\n            raise ValueError(\n                \"word_to_tokens() is not available when using Python based tokenizers\"\n            )\n        if word_index is not None:\n            batch_index = batch_or_word_index\n        else:\n            batch_index = 0\n            word_index = batch_or_word_index\n        if batch_index < 0:\n            batch_index = self._batch_size + batch_index\n        if word_index < 0:\n            word_index = self._seq_len + word_index\n        span = self._encodings[batch_index].word_to_tokens(word_index,\n                                                           sequence_index)\n        return TokenSpan(*span) if span is not None else None\n\n    def token_to_chars(self, batch_or_token_index: int, token_index=None):\n        \"\"\"\n        Get the character span corresponding to an encoded token in a sequence of the batch.\n\n        Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with:\n\n        - **start** -- Index of the first 
character in the original string associated to the token.\n        - **end** -- Index of the character following the last character in the original string associated to the\n          token.\n\n        Can be called as:\n\n        - `self.token_to_chars(token_index)` if batch size is 1\n        - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1\n\n        Args:\n            batch_or_token_index (`int`):\n                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of\n                the token in the sequence.\n            token_index (`int`, *optional*):\n                If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in\n                the sequence.\n\n        Returns:\n            [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token\n            (e.g. <s>, </s>) doesn't correspond to any chars in the origin string.\n        \"\"\"\n\n        if not self._encodings:\n            raise ValueError(\n                \"token_to_chars() is not available when using Python based tokenizers\"\n            )\n        if token_index is not None:\n            batch_index = batch_or_token_index\n        else:\n            batch_index = 0\n            token_index = batch_or_token_index\n        span_indices = self._encodings[batch_index].token_to_chars(token_index)\n\n        return CharSpan(*span_indices) if span_indices is not None else None\n\n    def char_to_token(self,\n                      batch_or_char_index: int,\n                      char_index: Optional[int]=None,\n                      sequence_index: int=0) -> int:\n        \"\"\"\n        Get the index of the token in the encoded output comprising a character in the original string for a sequence\n        of the batch.\n\n        Can be called as:\n\n        - `self.char_to_token(char_index)` if batch size is 1\n      
  - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1\n\n        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words\n        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized\n        words.\n\n        Args:\n            batch_or_char_index (`int`):\n                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of\n                the word in the sequence\n            char_index (`int`, *optional*):\n                If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the\n                sequence.\n            sequence_index (`int`, *optional*, defaults to 0):\n                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0\n                or 1) the provided character index belongs to.\n\n\n        Returns:\n            `int`: Index of the token.\n        \"\"\"\n\n        if not self._encodings:\n            raise ValueError(\n                \"char_to_token() is not available when using Python based tokenizers\"\n            )\n        if char_index is not None:\n            batch_index = batch_or_char_index\n        else:\n            batch_index = 0\n            char_index = batch_or_char_index\n        return self._encodings[batch_index].char_to_token(char_index,\n                                                          sequence_index)\n\n    def word_to_chars(self,\n                      batch_or_word_index: int,\n                      word_index: Optional[int]=None,\n                      sequence_index: int=0):\n        \"\"\"\n        Get the character span in the original string corresponding to given word in a sequence of the batch.\n\n        Character spans are returned as a CharSpan NamedTuple with:\n\n        - start: index of the first character in 
the original string\n        - end: index of the character following the last character in the original string\n\n        Can be called as:\n\n        - `self.word_to_chars(word_index)` if batch size is 1\n        - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1\n\n        Args:\n            batch_or_word_index (`int`):\n                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of\n                the word in the sequence\n            word_index (`int`, *optional*):\n                If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the\n                sequence.\n            sequence_index (`int`, *optional*, defaults to 0):\n                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0\n                or 1) the provided word index belongs to.\n\n        Returns:\n            `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. 
CharSpan\n            are NamedTuple with:\n\n                - start: index of the first character associated to the token in the original string\n                - end: index of the character following the last character associated to the token in the original\n                  string\n        \"\"\"\n\n        if not self._encodings:\n            raise ValueError(\n                \"word_to_chars() is not available when using Python based tokenizers\"\n            )\n        if word_index is not None:\n            batch_index = batch_or_word_index\n        else:\n            batch_index = 0\n            word_index = batch_or_word_index\n        return CharSpan(*(self._encodings[batch_index].word_to_chars(\n            word_index, sequence_index)))\n\n    def char_to_word(self,\n                     batch_or_char_index: int,\n                     char_index: Optional[int]=None,\n                     sequence_index: int=0) -> int:\n        \"\"\"\n        Get the word in the original string corresponding to a character in the original string of a sequence of the\n        batch.\n\n        Can be called as:\n\n        - `self.char_to_word(char_index)` if batch size is 1\n        - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1\n\n        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words\n        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized\n        words.\n\n        Args:\n            batch_or_char_index (`int`):\n                Index of the sequence in the batch. 
If the batch only comprise one sequence, this can be the index of\n                the character in the original string.\n            char_index (`int`, *optional*):\n                If a batch index is provided in *batch_or_token_index*, this can be the index of the character in the\n                original string.\n            sequence_index (`int`, *optional*, defaults to 0):\n                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0\n                or 1) the provided character index belongs to.\n\n\n        Returns:\n            `int` or `List[int]`: Index or indices of the associated encoded token(s).\n        \"\"\"\n\n        if not self._encodings:\n            raise ValueError(\n                \"char_to_word() is not available when using Python based tokenizers\"\n            )\n        if char_index is not None:\n            batch_index = batch_or_char_index\n        else:\n            batch_index = 0\n            char_index = batch_or_char_index\n        return self._encodings[batch_index].char_to_word(char_index,\n                                                         sequence_index)\n\n    def convert_to_tensors(self,\n                           tensor_type=None,\n                           prepend_batch_axis: bool=False):\n        \"\"\"\n        Convert the inner content to tensors.\n\n        Args:\n            tensor_type (`str` or [`~utils.TensorType`], *optional*):\n                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. 
If\n                `None`, no modification is done.\n            prepend_batch_axis (`int`, *optional*, defaults to `False`):\n                Whether or not to add the batch dimension during the conversion.\n        \"\"\"\n        if tensor_type is None:\n            return self\n\n        # Get a function reference for the correct framework\n        if tensor_type == 'paddle':\n            import paddle\n\n            as_tensor = paddle.to_tensor\n            is_tensor = paddle.is_tensor\n        else:\n            as_tensor = np.asarray\n            is_tensor = _is_numpy\n        # (mfuntowicz: This code is unreachable)\n        # else:\n        #     raise ImportError(\n        #         f\"Unable to convert output to tensors format {tensor_type}\"\n        #     )\n\n        # Do the tensor conversion in batch\n        for key, value in self.items():\n            try:\n                if prepend_batch_axis:\n                    value = [value]\n\n                if not is_tensor(value):\n                    tensor = as_tensor(value)\n\n                    # Removing this for now in favor of controlling the shape with `prepend_batch_axis`\n                    # # at-least2d\n                    # if tensor.ndim > 2:\n                    #     tensor = tensor.squeeze(0)\n                    # elif tensor.ndim < 2:\n                    #     tensor = tensor[None, :]\n\n                    self[key] = tensor\n            except:  # noqa E722\n                if key == \"overflowing_tokens\":\n                    raise ValueError(\n                        \"Unable to create tensor returning overflowing tokens of different lengths. 
\"\n                        \"Please see if a fast version of this tokenizer is available to have this feature available.\"\n                    )\n                raise ValueError(\n                    \"Unable to create tensor, you should probably activate truncation and/or padding \"\n                    \"with 'padding=True' 'truncation=True' to have batched tensors with the same length.\"\n                )\n\n        return self\n\n\nclass TruncationStrategy(ExplicitEnum):\n    \"\"\"\n    Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in\n    an IDE.\n    \"\"\"\n\n    ONLY_FIRST = \"only_first\"\n    ONLY_SECOND = \"only_second\"\n    LONGEST_FIRST = \"longest_first\"\n    DO_NOT_TRUNCATE = \"do_not_truncate\"\n\n\nclass PaddingStrategy(ExplicitEnum):\n    \"\"\"\n    Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an\n    IDE.\n    \"\"\"\n\n    LONGEST = \"longest\"\n    MAX_LENGTH = \"max_length\"\n    DO_NOT_PAD = \"do_not_pad\"\n\n\nclass SpecialTokensMixin:\n    \"\"\"\n    A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to\n    special tokens. 
In particular, this class hold the attributes which can be used to directly access these special\n    tokens in a model-independent manner and allow to set and update the special tokens.\n\n    Args:\n        bos_token (`str` or `tokenizers.AddedToken`, *optional*):\n            A special token representing the beginning of a sentence.\n        eos_token (`str` or `tokenizers.AddedToken`, *optional*):\n            A special token representing the end of a sentence.\n        unk_token (`str` or `tokenizers.AddedToken`, *optional*):\n            A special token representing an out-of-vocabulary token.\n        sep_token (`str` or `tokenizers.AddedToken`, *optional*):\n            A special token separating two different sentences in the same input (used by BERT for instance).\n        pad_token (`str` or `tokenizers.AddedToken`, *optional*):\n            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by\n            attention mechanisms or loss computation.\n        cls_token (`str` or `tokenizers.AddedToken`, *optional*):\n            A special token representing the class of the input (used by BERT for instance).\n        mask_token (`str` or `tokenizers.AddedToken`, *optional*):\n            A special token representing a masked token (used by masked-language modeling pretraining objectives, like\n            BERT).\n        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):\n            A tuple or a list of additional special tokens.\n    \"\"\"\n\n    SPECIAL_TOKENS_ATTRIBUTES = [\n        \"bos_token\",\n        \"eos_token\",\n        \"unk_token\",\n        \"sep_token\",\n        \"pad_token\",\n        \"cls_token\",\n        \"mask_token\",\n        \"additional_special_tokens\",\n    ]\n\n    def __init__(self, verbose=True, **kwargs):\n        self._bos_token = None\n        self._eos_token = None\n        self._unk_token = None\n        self._sep_token = 
None\n        self._pad_token = None\n        self._cls_token = None\n        self._mask_token = None\n        self._pad_token_type_id = 0\n        self._additional_special_tokens = []\n        self.verbose = verbose\n        self.added_tokens_encoder: Dict[str, int] = {}\n        self.added_tokens_decoder: Dict[int, str] = {}\n        self.unique_no_split_tokens: List[str] = []\n        self.tokens_trie = Trie()\n\n        self._decode_use_source_tokenizer = False\n\n        # We directly set the hidden value to allow initialization with special tokens\n        # which are not yet in the vocabulary. Necessary for serialization/de-serialization\n        # TODO clean this up at some point (probably by switching to fast tokenizers)\n        for key, value in kwargs.items():\n            if value is None:\n                continue\n            if key in self.SPECIAL_TOKENS_ATTRIBUTES:\n                if key == \"additional_special_tokens\":\n                    assert isinstance(value, (\n                        list, tuple)), f\"Value {value} is not a list or tuple\"\n                    assert all(\n                        isinstance(t, (str, AddedToken)) for t in value\n                    ), \"One of the tokens is not a string or an AddedToken\"\n                    setattr(self, key, value)\n                elif isinstance(value, (str, AddedToken)):\n                    setattr(self, key, value)\n                else:\n                    raise TypeError(\n                        f\"special token {key} has to be either str or AddedToken but got: {type(value)}\"\n                    )\n\n    def convert_tokens_to_ids(\n            self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:\n        \"\"\"\n        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the\n        vocabulary.\n\n        Args:\n            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).\n\n        
Returns:\n            `int` or `List[int]`: The token id or list of token ids.\n        \"\"\"\n        if tokens is None:\n            return None\n\n        if isinstance(tokens, str):\n            return self._convert_token_to_id_with_added_voc(tokens)\n\n        ids = []\n        for token in tokens:\n            ids.append(self._convert_token_to_id_with_added_voc(token))\n        return ids\n\n    def _convert_token_to_id_with_added_voc(self, token):\n        if token is None:\n            return None\n\n        if token in self.added_tokens_encoder:\n            return self.added_tokens_encoder[token]\n        return self._convert_token_to_id(token)\n\n    def _convert_token_to_id(self, token):\n        \"\"\"Converts a token (str) in an id using the vocab.\"\"\"\n        if token.startswith(\"<extra_id_\"):\n            match = re.match(r\"<extra_id_(\\d+)>\", token)\n            num = int(match.group(1))\n            return self.vocab_size - num - 1\n        return self.sp_model.piece_to_id(token)\n\n    def sanitize_special_tokens(self) -> int:\n        \"\"\"\n        Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`,\n        `tokenizer.cls_token`, etc.) are in the vocabulary.\n\n        Add the missing ones to the vocabulary if needed.\n\n        Return:\n            `int`: The number of tokens added in the vocabulary during the operation.\n        \"\"\"\n        return self.add_tokens(\n            self.all_special_tokens_extended, special_tokens=True)\n\n    def add_special_tokens(\n            self,\n            special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:\n        \"\"\"\n        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. 
If\n        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the\n        current vocabulary).\n\n        Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding\n        matrix of the model so that its embedding matrix matches the tokenizer.\n\n        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.\n\n        Using `add_special_tokens` will ensure your special tokens can be used in several ways:\n\n        - Special tokens are carefully handled by the tokenizer (they are never split).\n        - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This\n          makes it easy to develop model-agnostic training and fine-tuning scripts.\n\n        When possible, special tokens are already registered for provided pretrained models (for instance\n        [`BertTokenizer`] `cls_token` is already registered to be :obj*'[CLS]'* and XLM's one is also registered to be\n        `'</s>'`).\n\n        Args:\n            special_tokens_dict (dictionary *str* to *str* or `tokenizers.AddedToken`):\n                Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`,\n                `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`].\n\n                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer\n                assign the index of the `unk_token` to them).\n\n        Returns:\n            `int`: Number of tokens added to the vocabulary.\n\n        Examples:\n\n        ```python\n        # Let's see how to add a new classification token to GPT-2\n        tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n        model = GPT2Model.from_pretrained(\"gpt2\")\n\n        special_tokens_dict = {\"cls_token\": \"<CLS>\"}\n\n        num_added_toks = 
tokenizer.add_special_tokens(special_tokens_dict)\n        print(\"We have added\", num_added_toks, \"tokens\")\n        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.\n        model.resize_token_embeddings(len(tokenizer))\n\n        assert tokenizer.cls_token == \"<CLS>\"\n        ```\"\"\"\n        if not special_tokens_dict:\n            return 0\n\n        added_tokens = 0\n        for key, value in special_tokens_dict.items():\n            assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f\"Key {key} is not a special token\"\n\n            if self.verbose:\n                #logger.info(f\"Assigning {value} to the {key} key of the tokenizer\")\n                print(f\"Assigning {value} to the {key} key of the tokenizer\")\n            setattr(self, key, value)\n\n            if key == \"additional_special_tokens\":\n                assert isinstance(value, (list, tuple)) and all(\n                    isinstance(t, (str, AddedToken)) for t in value\n                ), f\"Tokens {value} for key {key} should all be str or AddedToken instances\"\n                added_tokens += self.add_tokens(value, special_tokens=True)\n            else:\n                assert isinstance(\n                    value, (str, AddedToken)\n                ), f\"Token {value} for key {key} should be a str or an AddedToken instance\"\n                added_tokens += self.add_tokens([value], special_tokens=True)\n\n        return added_tokens\n\n    def add_tokens(\n            self,\n            new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]],\n            special_tokens: bool=False) -> int:\n        \"\"\"\n        Add a list of new tokens to the tokenizer class. 
If the new tokens are not in the vocabulary, they are added to\n        it with indices starting from length of the current vocabulary.\n\n        Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding\n        matrix of the model so that its embedding matrix matches the tokenizer.\n\n        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.\n\n        Args:\n            new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`):\n                Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string\n                token to let you personalize its behavior: whether this token should only match against a single word,\n                whether this token should strip all potential whitespaces on the left side, whether this token should\n                strip all potential whitespaces on the right side, etc.\n            special_tokens (`bool`, *optional*, defaults to `False`):\n                Can be used to specify if the token is a special token. 
This mostly change the normalization behavior\n                (special tokens like CLS or [MASK] are usually not lower-cased for instance).\n\n                See details for `tokenizers.AddedToken` in HuggingFace tokenizers library.\n\n        Returns:\n            `int`: Number of tokens added to the vocabulary.\n\n        Examples:\n\n        ```python\n        # Let's see how to increase the vocabulary of Bert model and tokenizer\n        tokenizer = BertTokenizerFast.from_pretrained(\"bert-base-uncased\")\n        model = BertModel.from_pretrained(\"bert-base-uncased\")\n\n        num_added_toks = tokenizer.add_tokens([\"new_tok1\", \"my_new-tok2\"])\n        print(\"We have added\", num_added_toks, \"tokens\")\n        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.\n        model.resize_token_embeddings(len(tokenizer))\n        ```\"\"\"\n        if not new_tokens:\n            return 0\n\n        if not isinstance(new_tokens, (list, tuple)):\n            new_tokens = [new_tokens]\n\n        return self._add_tokens(new_tokens, special_tokens=special_tokens)\n\n    def _add_tokens(self,\n                    new_tokens: Union[List[str], List[AddedToken]],\n                    special_tokens: bool=False) -> int:\n        new_tokens = [str(tok) for tok in new_tokens]\n\n        tokens_to_add = []\n        for token in new_tokens:\n            if not isinstance(token, str):\n                raise TypeError(\n                    f\"Token {token} is not a string but a {type(token)}.\")\n            if not special_tokens and hasattr(\n                    self, \"do_lower_case\") and self.do_lower_case:\n                token = token.lower()\n            if (token != self.unk_token and self.convert_tokens_to_ids(token)\n                    == self.convert_tokens_to_ids(self.unk_token) and\n                    token not in tokens_to_add):\n                tokens_to_add.append(token)\n         
       #if self.verbose:\n            #logger.info(f\"Adding {token} to the vocabulary\")\n            #print(f\"Adding {token} to the vocabulary\")\n\n        added_tok_encoder = dict((tok, len(self) + i)\n                                 for i, tok in enumerate(tokens_to_add))\n        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}\n        self.added_tokens_encoder.update(added_tok_encoder)\n        self.added_tokens_decoder.update(added_tok_decoder)\n\n        # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)\n        if special_tokens:\n            if len(new_tokens) == 1:\n                _insert_one_token_to_ordered_list(self.unique_no_split_tokens,\n                                                  new_tokens[0])\n            else:\n                self.unique_no_split_tokens = sorted(\n                    set(self.unique_no_split_tokens).union(set(new_tokens)))\n        else:\n            # Or on the newly added tokens\n            if len(tokens_to_add) == 1:\n                _insert_one_token_to_ordered_list(self.unique_no_split_tokens,\n                                                  tokens_to_add[0])\n            else:\n                self.unique_no_split_tokens = sorted(\n                    set(self.unique_no_split_tokens).union(\n                        set(tokens_to_add)))\n        self._create_trie(self.unique_no_split_tokens)\n\n        return len(tokens_to_add)\n\n    def _create_trie(self, unique_no_split_tokens):\n        trie = Trie()\n        for token in unique_no_split_tokens:\n            if hasattr(\n                    self, \"do_lower_case\"\n            ) and self.do_lower_case and token not in self.all_special_tokens:\n                trie.add(token.lower())\n            else:\n                trie.add(token)\n        self.tokens_trie = trie\n\n    @property\n    def bos_token(self) -> str:\n        \"\"\"\n        `str`: Beginning of sentence token. 
Log an error if used while not having been set.\n        \"\"\"\n        if self._bos_token is None and self.verbose:\n            print(\"Using bos_token, but it is not set yet.\")\n            #logger.error(\"Using bos_token, but it is not set yet.\")\n            return None\n        return str(self._bos_token)\n\n    @property\n    def eos_token(self) -> str:\n        \"\"\"\n        `str`: End of sentence token. Log an error if used while not having been set.\n        \"\"\"\n        if self._eos_token is None and self.verbose:\n            #logger.error(\"Using eos_token, but it is not set yet.\")\n            print(\"Using eos_token, but it is not set yet.\")\n            return None\n        return str(self._eos_token)\n\n    @property\n    def unk_token(self) -> str:\n        \"\"\"\n        `str`: Unknown token. Log an error if used while not having been set.\n        \"\"\"\n        if self._unk_token is None and self.verbose:\n            print(\"Using unk_token, but it is not set yet.\")\n            #logger.error(\"Using unk_token, but it is not set yet.\")\n            return None\n        return str(self._unk_token)\n\n    @property\n    def sep_token(self) -> str:\n        \"\"\"\n        `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not\n        having been set.\n        \"\"\"\n        if self._sep_token is None and self.verbose:\n            print(\"Using sep_token, but it is not set yet.\")\n            #logger.error(\"Using sep_token, but it is not set yet.\")\n            return None\n        return str(self._sep_token)\n\n    @property\n    def pad_token(self) -> str:\n        \"\"\"\n        `str`: Padding token. 
Log an error if used while not having been set.\n        \"\"\"\n        if self._pad_token is None and self.verbose:\n            #logger.error(\"Using pad_token, but it is not set yet.\")\n            print(\"Using pad_token, but it is not set yet.\")\n            return None\n        return str(self._pad_token)\n\n    @property\n    def cls_token(self) -> str:\n        \"\"\"\n        `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full\n        depth of the model. Log an error if used while not having been set.\n        \"\"\"\n        if self._cls_token is None and self.verbose:\n            #logger.error(\"Using cls_token, but it is not set yet.\")\n            print(\"Using cls_token, but it is not set yet.\")\n            return None\n        return str(self._cls_token)\n\n    @property\n    def mask_token(self) -> str:\n        \"\"\"\n        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not\n        having been set.\n        \"\"\"\n        if self._mask_token is None and self.verbose:\n            #logger.error(\"Using mask_token, but it is not set yet.\")\n            print(\"Using mask_token, but it is not set yet.\")\n            return None\n        return str(self._mask_token)\n\n    @property\n    def additional_special_tokens(self) -> List[str]:\n        \"\"\"\n        `List[str]`: All the additional special tokens you may want to use. 
Log an error if used while not having been\n        set.\n        \"\"\"\n        if self._additional_special_tokens is None and self.verbose:\n            #logger.error(\"Using additional_special_tokens, but it is not set yet.\")\n            print(\"Using additional_special_tokens, but it is not set yet.\")\n            return None\n        return [str(tok) for tok in self._additional_special_tokens]\n\n    @bos_token.setter\n    def bos_token(self, value):\n        self._bos_token = value\n\n    @eos_token.setter\n    def eos_token(self, value):\n        self._eos_token = value\n\n    @unk_token.setter\n    def unk_token(self, value):\n        self._unk_token = value\n\n    @sep_token.setter\n    def sep_token(self, value):\n        self._sep_token = value\n\n    @pad_token.setter\n    def pad_token(self, value):\n        self._pad_token = value\n\n    @cls_token.setter\n    def cls_token(self, value):\n        self._cls_token = value\n\n    @mask_token.setter\n    def mask_token(self, value):\n        self._mask_token = value\n\n    @additional_special_tokens.setter\n    def additional_special_tokens(self, value):\n        self._additional_special_tokens = value\n\n    @property\n    def bos_token_id(self) -> Optional[int]:\n        \"\"\"\n        `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token has not\n        been set.\n        \"\"\"\n        if self._bos_token is None:\n            return None\n        return self.convert_tokens_to_ids(self.bos_token)\n\n    @property\n    def eos_token_id(self) -> Optional[int]:\n        \"\"\"\n        `Optional[int]`: Id of the end of sentence token in the vocabulary. 
Returns `None` if the token has not been\n        set.\n        \"\"\"\n        if self._eos_token is None:\n            return None\n        return self.convert_tokens_to_ids(self.eos_token)\n\n    @property\n    def unk_token_id(self) -> Optional[int]:\n        \"\"\"\n        `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set.\n        \"\"\"\n        if self._unk_token is None:\n            return None\n        return self.convert_tokens_to_ids(self.unk_token)\n\n    @property\n    def sep_token_id(self) -> Optional[int]:\n        \"\"\"\n        `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input\n        sequence. Returns `None` if the token has not been set.\n        \"\"\"\n        if self._sep_token is None:\n            return None\n        return self.convert_tokens_to_ids(self.sep_token)\n\n    @property\n    def pad_token_id(self) -> Optional[int]:\n        \"\"\"\n        `Optional[int]`: Id of the padding token in the vocabulary. 
Returns `None` if the token has not been set.\n        \"\"\"\n        if self._pad_token is None:\n            return None\n        return self.convert_tokens_to_ids(self.pad_token)\n\n    @property\n    def pad_token_type_id(self) -> int:\n        \"\"\"\n        `int`: Id of the padding token type in the vocabulary.\n        \"\"\"\n        return self._pad_token_type_id\n\n    @property\n    def cls_token_id(self) -> Optional[int]:\n        \"\"\"\n        `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence\n        leveraging self-attention along the full depth of the model.\n\n        Returns `None` if the token has not been set.\n        \"\"\"\n        if self._cls_token is None:\n            return None\n        return self.convert_tokens_to_ids(self.cls_token)\n\n    @property\n    def mask_token_id(self) -> Optional[int]:\n        \"\"\"\n        `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language\n        modeling. Returns `None` if the token has not been set.\n        \"\"\"\n        if self._mask_token is None:\n            return None\n        return self.convert_tokens_to_ids(self.mask_token)\n\n    @property\n    def additional_special_tokens_ids(self) -> List[int]:\n        \"\"\"\n        `List[int]`: Ids of all the additional special tokens in the vocabulary. 
Log an error if used while not having\n        been set.\n        \"\"\"\n        return self.convert_tokens_to_ids(self.additional_special_tokens)\n\n    @bos_token_id.setter\n    def bos_token_id(self, value):\n        self._bos_token = self.convert_ids_to_tokens(\n            value) if value is not None else None\n\n    @eos_token_id.setter\n    def eos_token_id(self, value):\n        self._eos_token = self.convert_ids_to_tokens(\n            value) if value is not None else None\n\n    @unk_token_id.setter\n    def unk_token_id(self, value):\n        self._unk_token = self.convert_ids_to_tokens(\n            value) if value is not None else None\n\n    @sep_token_id.setter\n    def sep_token_id(self, value):\n        self._sep_token = self.convert_ids_to_tokens(\n            value) if value is not None else None\n\n    @pad_token_id.setter\n    def pad_token_id(self, value):\n        self._pad_token = self.convert_ids_to_tokens(\n            value) if value is not None else None\n\n    @cls_token_id.setter\n    def cls_token_id(self, value):\n        self._cls_token = self.convert_ids_to_tokens(\n            value) if value is not None else None\n\n    @mask_token_id.setter\n    def mask_token_id(self, value):\n        self._mask_token = self.convert_ids_to_tokens(\n            value) if value is not None else None\n\n    @additional_special_tokens_ids.setter\n    def additional_special_tokens_ids(self, values):\n        self._additional_special_tokens = [\n            self.convert_ids_to_tokens(value) for value in values\n        ]\n\n    @property\n    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:\n        \"\"\"\n        `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`,\n        `unk_token`, etc.) 
to their values (`'<unk>'`, `'<cls>'`, etc.).\n\n        Convert potential tokens of `tokenizers.AddedToken` type to string.\n        \"\"\"\n        set_attr = {}\n        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:\n            attr_value = getattr(self, \"_\" + attr)\n            if attr_value:\n                set_attr[attr] = (type(attr_value)(\n                    str(attr_value_sub) for attr_value_sub in attr_value)\n                                  if isinstance(attr_value, (list, tuple)) else\n                                  str(attr_value))\n        return set_attr\n\n    @property\n    def special_tokens_map_extended(self) -> Dict[str, Union[\n            str, AddedToken, List[Union[str, AddedToken]]]]:\n        \"\"\"\n        `Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary mapping\n        special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).\n\n        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how\n        special tokens are tokenized.\n        \"\"\"\n        set_attr = {}\n        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:\n            attr_value = getattr(self, \"_\" + attr)\n            if attr_value:\n                set_attr[attr] = attr_value\n        return set_attr\n\n    @property\n    def all_special_tokens(self) -> List[str]:\n        \"\"\"\n        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.\n\n        Convert tokens of `tokenizers.AddedToken` type to string.\n        \"\"\"\n        all_toks = [str(s) for s in self.all_special_tokens_extended]\n        return all_toks\n\n    @property\n    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:\n        \"\"\"\n        `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) 
mapped to class\n        attributes.\n\n        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how\n        special tokens are tokenized.\n        \"\"\"\n        all_toks = []\n        set_attr = self.special_tokens_map_extended\n        for attr_value in set_attr.values():\n            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (\n                list, tuple)) else [attr_value])\n        all_toks = list(OrderedDict.fromkeys(all_toks))\n        return all_toks\n\n    @property\n    def all_special_ids(self) -> List[int]:\n        \"\"\"\n        `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.\n        \"\"\"\n        all_toks = self.all_special_tokens\n        all_ids = self.convert_tokens_to_ids(all_toks)\n        return all_ids\n"
  },
  {
    "path": "ppfleetx/data/transforms/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/data/transforms/preprocess.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\n\nfrom functools import partial\nimport math\nimport random\nimport cv2\nimport numpy as np\nfrom PIL import Image\nfrom PIL import ImageFilter\n\nfrom paddle.vision.transforms import functional as F\nfrom paddle.vision.transforms import ColorJitter as PPColorJitter\nfrom paddle.vision.transforms import Grayscale\n\nfrom ppfleetx.utils.log import logger\n\n\nclass OperatorParamError(ValueError):\n    \"\"\" OperatorParamError\n    \"\"\"\n    pass\n\n\nclass DecodeImage(object):\n    \"\"\" decode image \"\"\"\n\n    def __init__(self, to_rgb=True, channel_first=False):\n        self.to_rgb = to_rgb\n        self.channel_first = channel_first\n\n    def __call__(self, img):\n        assert type(img) is bytes and len(\n            img) > 0, \"invalid input 'img' in DecodeImage\"\n        data = np.frombuffer(img, dtype='uint8')\n        img = cv2.imdecode(data, 1)\n        if self.to_rgb:\n            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (\n                img.shape)\n            img = img[:, :, ::-1]\n\n        if self.channel_first:\n            img = img.transpose((2, 0, 1))\n\n        return img\n\n\nclass UnifiedResize(object):\n    def __init__(self, 
interpolation=None, backend=\"cv2\"):\n        _cv2_interp_from_str = {\n            'nearest': cv2.INTER_NEAREST,\n            'bilinear': cv2.INTER_LINEAR,\n            'area': cv2.INTER_AREA,\n            'bicubic': cv2.INTER_CUBIC,\n            'lanczos': cv2.INTER_LANCZOS4\n        }\n        _pil_interp_from_str = {\n            'nearest': Image.NEAREST,\n            'bilinear': Image.BILINEAR,\n            'bicubic': Image.BICUBIC,\n            'box': Image.BOX,\n            'lanczos': Image.LANCZOS,\n            'hamming': Image.HAMMING\n        }\n\n        def _pil_resize(src, size, resample):\n            pil_img = Image.fromarray(src)\n            pil_img = pil_img.resize(size, resample)\n            return np.asarray(pil_img)\n\n        if backend.lower() == \"cv2\":\n            if isinstance(interpolation, str):\n                interpolation = _cv2_interp_from_str[interpolation.lower()]\n            # compatible with opencv < version 4.4.0\n            elif interpolation is None:\n                interpolation = cv2.INTER_LINEAR\n            self.resize_func = partial(cv2.resize, interpolation=interpolation)\n        elif backend.lower() == \"pil\":\n            if isinstance(interpolation, str):\n                interpolation = _pil_interp_from_str[interpolation.lower()]\n            self.resize_func = partial(_pil_resize, resample=interpolation)\n        else:\n            logger.warning(\n                f\"The backend of Resize only support \\\"cv2\\\" or \\\"PIL\\\". \\\"f{backend}\\\" is unavailable. 
Use \\\"cv2\\\" instead.\"\n            )\n            self.resize_func = cv2.resize\n\n    def __call__(self, src, size):\n        return self.resize_func(src, size)\n\n\nclass ResizeImage(object):\n    \"\"\" resize image \"\"\"\n\n    def __init__(self,\n                 size=None,\n                 resize_short=None,\n                 interpolation=None,\n                 backend=\"cv2\"):\n        if resize_short is not None and resize_short > 0:\n            self.resize_short = resize_short\n            self.w = None\n            self.h = None\n        elif size is not None:\n            self.resize_short = None\n            self.w = size if type(size) is int else size[0]\n            self.h = size if type(size) is int else size[1]\n        else:\n            raise OperatorParamError(\"invalid params for ReisizeImage for '\\\n                'both 'size' and 'resize_short' are None\")\n\n        self._resize_func = UnifiedResize(\n            interpolation=interpolation, backend=backend)\n\n    def __call__(self, img):\n        img_h, img_w = img.shape[:2]\n        if self.resize_short is not None:\n            percent = float(self.resize_short) / min(img_w, img_h)\n            w = int(round(img_w * percent))\n            h = int(round(img_h * percent))\n        else:\n            w = self.w\n            h = self.h\n        return self._resize_func(img, (w, h))\n\n\nclass CenterCropImage(object):\n    \"\"\" crop image \"\"\"\n\n    def __init__(self, size):\n        if type(size) is int:\n            self.size = (size, size)\n        else:\n            self.size = size  # (h, w)\n\n    def __call__(self, img):\n        w, h = self.size\n        img_h, img_w = img.shape[:2]\n        w_start = (img_w - w) // 2\n        h_start = (img_h - h) // 2\n\n        w_end = w_start + w\n        h_end = h_start + h\n        return img[h_start:h_end, w_start:w_end, :]\n\n\nclass RandCropImage(object):\n    \"\"\" random crop image \"\"\"\n\n    def __init__(self,\n        
         size,\n                 scale=None,\n                 ratio=None,\n                 interpolation=None,\n                 backend=\"cv2\"):\n        if type(size) is int:\n            self.size = (size, size)  # (h, w)\n        else:\n            self.size = size\n\n        self.scale = [0.08, 1.0] if scale is None else scale\n        self.ratio = [3. / 4., 4. / 3.] if ratio is None else ratio\n\n        self._resize_func = UnifiedResize(\n            interpolation=interpolation, backend=backend)\n\n    def __call__(self, img):\n        size = self.size\n        scale = self.scale\n        ratio = self.ratio\n\n        aspect_ratio = math.sqrt(random.uniform(*ratio))\n        w = 1. * aspect_ratio\n        h = 1. / aspect_ratio\n\n        img_h, img_w = img.shape[:2]\n\n        bound = min((float(img_w) / img_h) / (w**2),\n                    (float(img_h) / img_w) / (h**2))\n        scale_max = min(scale[1], bound)\n        scale_min = min(scale[0], bound)\n\n        target_area = img_w * img_h * random.uniform(scale_min, scale_max)\n        target_size = math.sqrt(target_area)\n        w = int(target_size * w)\n        h = int(target_size * h)\n\n        i = random.randint(0, img_w - w)\n        j = random.randint(0, img_h - h)\n\n        img = img[j:j + h, i:i + w, :]\n\n        return self._resize_func(img, size)\n\n\nclass RandFlipImage(object):\n    \"\"\" random flip image\n        flip_code:\n            1: Flipped Horizontally\n            0: Flipped Vertically\n            -1: Flipped Horizontally & Vertically\n    \"\"\"\n\n    def __init__(self, flip_code=1):\n        assert flip_code in [-1, 0, 1\n                             ], \"flip_code should be a value in [-1, 0, 1]\"\n        self.flip_code = flip_code\n\n    def __call__(self, img):\n        if random.randint(0, 1) == 1:\n            return cv2.flip(img, self.flip_code)\n        else:\n            return img\n\n\nclass NormalizeImage(object):\n    \"\"\" normalize image such as 
substract mean, divide std\n    \"\"\"\n\n    def __init__(self,\n                 scale=None,\n                 mean=None,\n                 std=None,\n                 order='chw',\n                 output_fp16=False,\n                 channel_num=3):\n        if isinstance(scale, str):\n            scale = eval(scale)\n        assert channel_num in [\n            3, 4\n        ], \"channel number of input image should be set to 3 or 4.\"\n        self.channel_num = channel_num\n        self.output_dtype = 'float16' if output_fp16 else 'float32'\n        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)\n        self.order = order\n        mean = mean if mean is not None else [0.485, 0.456, 0.406]\n        std = std if std is not None else [0.229, 0.224, 0.225]\n\n        shape = (3, 1, 1) if self.order == 'chw' else (1, 1, 3)\n        self.mean = np.array(mean).reshape(shape).astype('float32')\n        self.std = np.array(std).reshape(shape).astype('float32')\n\n    def __call__(self, img):\n        if isinstance(img, Image.Image):\n            img = np.array(img)\n\n        assert isinstance(img,\n                          np.ndarray), \"invalid input 'img' in NormalizeImage\"\n\n        img = (img.astype('float32') * self.scale - self.mean) / self.std\n\n        if self.channel_num == 4:\n            img_h = img.shape[1] if self.order == 'chw' else img.shape[0]\n            img_w = img.shape[2] if self.order == 'chw' else img.shape[1]\n            pad_zeros = np.zeros(\n                (1, img_h, img_w)) if self.order == 'chw' else np.zeros(\n                    (img_h, img_w, 1))\n            img = (np.concatenate(\n                (img, pad_zeros), axis=0)\n                   if self.order == 'chw' else np.concatenate(\n                       (img, pad_zeros), axis=2))\n        return img.astype(self.output_dtype)\n\n\nclass ToCHWImage(object):\n    \"\"\" convert hwc image to chw image\n    \"\"\"\n\n    def __init__(self):\n        
pass\n\n    def __call__(self, img):\n        if isinstance(img, Image.Image):\n            img = np.array(img)\n\n        return img.transpose((2, 0, 1))\n\n\nclass ColorJitter(PPColorJitter):\n    \"\"\"ColorJitter.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        self.p = kwargs.pop('p', 1.0)\n        super().__init__(*args, **kwargs)\n\n    def __call__(self, img):\n        if random.random() < self.p:\n            if not isinstance(img, Image.Image):\n                img = np.ascontiguousarray(img)\n                img = Image.fromarray(img)\n            img = super()._apply_image(img)\n            if isinstance(img, Image.Image):\n                img = np.asarray(img)\n        return img\n\n\nclass GaussianBlur(object):\n    \"\"\"Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709\"\"\"\n\n    def __init__(self, sigma=[.1, 2.], p=1.0):\n        self.p = p\n        self.sigma = sigma\n\n    def __call__(self, img):\n        if random.random() < self.p:\n            if not isinstance(img, Image.Image):\n                img = np.ascontiguousarray(img)\n                img = Image.fromarray(img)\n            sigma = random.uniform(self.sigma[0], self.sigma[1])\n            img = img.filter(ImageFilter.GaussianBlur(radius=sigma))\n            if isinstance(img, Image.Image):\n                img = np.asarray(img)\n        return img\n\n\nclass Pixels(object):\n    def __init__(self, mode=\"const\", mean=[0., 0., 0.]):\n        self._mode = mode\n        self._mean = mean\n\n    def __call__(self, h=224, w=224, c=3):\n        if self._mode == \"rand\":\n            return np.random.normal(size=(1, 1, 3))\n        elif self._mode == \"pixel\":\n            return np.random.normal(size=(h, w, c))\n        elif self._mode == \"const\":\n            return self._mean\n        else:\n            raise Exception(\n                \"Invalid mode in RandomErasing, only support \\\"const\\\", \\\"rand\\\", \\\"pixel\\\"\"\n            
)\n\n\nclass RandomErasing(object):\n    \"\"\"RandomErasing.\n    This code is adapted from https://github.com/zhunzhong07/Random-Erasing, and refer to Timm.\n    \"\"\"\n\n    def __init__(self,\n                 EPSILON=0.5,\n                 sl=0.02,\n                 sh=0.4,\n                 r1=0.3,\n                 mean=[0., 0., 0.],\n                 attempt=100,\n                 use_log_aspect=False,\n                 mode='const'):\n        self.EPSILON = eval(EPSILON) if isinstance(EPSILON, str) else EPSILON\n        self.sl = eval(sl) if isinstance(sl, str) else sl\n        self.sh = eval(sh) if isinstance(sh, str) else sh\n        r1 = eval(r1) if isinstance(r1, str) else r1\n        self.r1 = (math.log(r1), math.log(1 / r1)) if use_log_aspect else (\n            r1, 1 / r1)\n        self.use_log_aspect = use_log_aspect\n        self.attempt = attempt\n        self.get_pixels = Pixels(mode, mean)\n\n    def __call__(self, img):\n        if random.random() > self.EPSILON:\n            return img\n\n        for _ in range(self.attempt):\n            area = img.shape[0] * img.shape[1]\n\n            target_area = random.uniform(self.sl, self.sh) * area\n            aspect_ratio = random.uniform(*self.r1)\n            if self.use_log_aspect:\n                aspect_ratio = math.exp(aspect_ratio)\n\n            h = int(round(math.sqrt(target_area * aspect_ratio)))\n            w = int(round(math.sqrt(target_area / aspect_ratio)))\n\n            if w < img.shape[1] and h < img.shape[0]:\n                pixels = self.get_pixels(h, w, img.shape[2])\n                x1 = random.randint(0, img.shape[0] - h)\n                y1 = random.randint(0, img.shape[1] - w)\n                if img.shape[2] == 3:\n                    img[x1:x1 + h, y1:y1 + w, :] = pixels\n                else:\n                    img[x1:x1 + h, y1:y1 + w, 0] = pixels[0]\n                return img\n        return img\n\n\nclass RandomGrayscale(object):\n    \"\"\"Randomly convert image 
to grayscale with a probability of p (default 0.1).\n    Args:\n        p (float): probability that image should be converted to grayscale.\n    Returns:\n        PIL Image: Grayscale version of the input image with probability p and unchanged\n        with probability (1-p).\n        - If input image is 1 channel: grayscale version is 1 channel\n        - If input image is 3 channel: grayscale version is 3 channel with r == g == b\n    \"\"\"\n\n    def __init__(self, p=0.1):\n        self.p = p\n\n    def __call__(self, img):\n        \"\"\"\n        Args:\n            img (PIL Image): Image to be converted to grayscale.\n        Returns:\n            PIL Image: Randomly grayscaled image.\n        \"\"\"\n\n        flag = False\n        if not isinstance(img, Image.Image):\n            img = np.ascontiguousarray(img)\n            img = Image.fromarray(img)\n            flag = True\n\n        num_output_channels = 1 if img.mode == 'L' else 3\n\n        if random.random() < self.p:\n            img = F.to_grayscale(img, num_output_channels=num_output_channels)\n\n        if flag:\n            img = np.asarray(img)\n\n        return img\n"
  },
  {
    "path": "ppfleetx/data/transforms/utils.py",
    "content": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import preprocess\n\n\ndef transform(data, ops=[]):\n    \"\"\" transform \"\"\"\n    for op in ops:\n        data = op(data)\n    return data\n\n\ndef create_preprocess_operators(params):\n    \"\"\"\n    create operators based on the config\n    Args:\n        params(list): a dict list, used to create some operators\n    \"\"\"\n    assert isinstance(params, list), ('operator config should be a list')\n    ops = []\n    for operator in params:\n        assert isinstance(operator,\n                          dict) and len(operator) == 1, \"yaml format error\"\n        op_name = list(operator)[0]\n        param = {} if operator[op_name] is None else operator[op_name]\n        op = getattr(preprocess, op_name)(**param)\n        ops.append(op)\n\n    return ops\n"
  },
  {
    "path": "ppfleetx/data/utils/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .batch_collate_fn import *\n"
  },
  {
    "path": "ppfleetx/data/utils/batch_collate_fn.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\n\nimport os\nimport sys\nimport numbers\nimport numpy as np\nfrom dataclasses import dataclass\n\ntry:\n    from collections.abc import Sequence, Mapping\nexcept:\n    from collections import Sequence, Mapping\n\nfrom ppfleetx.data.sampler import Stack, Tuple\n\n\ndef collate_fn(batch):\n    \"\"\"\n    Default batch collating function for :code:`paddle.io.DataLoader`,\n    get input data as a list of sample datas, each element in list\n    if the data of a sample, and sample data should composed of list,\n    dictionary, string, number, numpy array and paddle.Tensor, this\n    function will parse input data recursively and stack number,\n    numpy array and paddle.Tensor datas as batch datas. e.g. 
for\n    following input data:\n    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},\n     {'image': np.array(shape=[3, 224, 224]), 'label': 3},\n     {'image': np.array(shape=[3, 224, 224]), 'label': 4},\n     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]\n    \n    \n    This default collate function zipped each number and numpy array\n    field together and stack each field as the batch field as follows:\n    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}\n    Args:  \n        batch(list of sample data): batch should be a list of sample data.\n    \n    Returns:\n        Batched data: batched each number, numpy array and paddle.Tensor\n                      in input data.\n    \"\"\"\n    sample = batch[0]\n    if isinstance(sample, np.ndarray):\n        batch = np.stack(batch, axis=0)\n        return batch\n    elif isinstance(sample, paddle.Tensor):\n        return paddle.stack(batch, axis=0)\n    elif isinstance(sample, numbers.Number):\n        batch = np.array(batch)\n        return batch\n    elif isinstance(sample, (str, bytes)):\n        return batch\n    elif isinstance(sample, Mapping):\n        return {key: collate_fn([d[key] for d in batch]) for key in sample}\n    elif isinstance(sample, Sequence):\n        sample_fields_num = len(sample)\n        if not all(len(sample) == sample_fields_num for sample in iter(batch)):\n            raise RuntimeError(\n                \"fileds number not same among samples in a batch\")\n        return [collate_fn(fields) for fields in zip(*batch)]\n\n    raise TypeError(\"batch data con only contains: tensor, numpy.ndarray, \"\n                    \"dict, list, number, but got {}\".format(type(sample)))\n\n\ndef default_collate_fn(batch_transform=None):\n    if batch_transform is not None:\n        # batch_ops = create_preprocess_operators(batch_transform)\n\n        # def inner_collate_fn(batch):\n        #     batch = transform(batch, batch_ops)\n        #     batch = 
collate_fn(batch)\n        #     return batch\n\n        # return inner_collate_fn\n        pass\n    else:\n        return collate_fn\n\n\ndef gpt_collate_fn(batch):\n    return Tuple([Stack() for raw in zip(*batch)])(batch)\n\n\nclass ErnieCollateData():\n    def __init__(self, micro_batch_size=1):\n        self.micro_batch_size = micro_batch_size\n\n    def generate_data(self, data, stack_fn=Stack()):\n        num_fields = len(data[0])\n        out = [None] * num_fields\n        # 0. input_ids,\n        # 1. segment_ids,\n        # 2. input_mask,\n        # 3. masked_lm_positions,\n        # 4. masked_lm_labels,\n        # 5. next_sentence_labels\n        for i in (0, 1, 2, 5):\n            out[i] = stack_fn([x[i] for x in data])\n        out[5] = out[5].reshape([-1, 1])\n        batch_size, seq_length = out[0].shape\n        size = num_mask = sum(len(x[3]) for x in data)\n        # masked_lm_positions\n        # Organize as a 1D tensor for gather or use gather_nd\n        if size % 8 != 0:\n            size += 8 - (size % 8)\n        out[3] = np.full(size, 0, dtype=np.int32)\n\n        # masked_lm_labels\n        out[4] = np.full([size, 1], -1, dtype=np.int64)\n        mask_token_num = 0\n        for i, x in enumerate(data):\n            for j, pos in enumerate(x[3]):\n                out[3][mask_token_num] = i * seq_length + pos\n                out[4][mask_token_num] = x[4][j]\n                mask_token_num += 1\n        return out\n\n    def __call__(self, data):\n        accumulate_steps = len(data) // self.micro_batch_size\n        if accumulate_steps == 1:\n            return self.generate_data(data)\n        else:\n            self.micro_batch_size = len(data) // accumulate_steps\n            all_data = [[] for _ in range(6)]\n            for acc_step in range(accumulate_steps):\n                tmp = self.generate_data(\n                    data[acc_step * self.micro_batch_size:(acc_step + 1) *\n                         self.micro_batch_size])\n        
        for i in range(6):\n                    all_data[i].append(tmp[i])\n            return all_data\n\n\n@dataclass\nclass DataCollatorWithPadding:\n    \"\"\"\n    Data collator that will dynamically pad the inputs to the longest sequence in the batch.\n\n    Args:\n        tokenizer_type (str): The type of tokenizer used for encoding the data.\n    \"\"\"\n\n    def __init__(self,\n                 tokenizer_type,\n                 padding=True,\n                 max_length=None,\n                 pad_to_multiple_of=None,\n                 return_tensors=\"pd\",\n                 return_attention_mask=None):\n        from ppfleetx.data.tokenizers import get_ernie_tokenizer\n        self.tokenizer = get_ernie_tokenizer(tokenizer_type)\n        self.padding = padding\n        self.max_length = max_length\n        self.pad_to_multiple_of = pad_to_multiple_of\n        self.return_tensors = return_tensors\n        self.return_attention_mask = return_attention_mask\n\n    def __call__(self, features):\n        batch = self.tokenizer.pad(\n            features,\n            padding=self.padding,\n            max_length=self.max_length,\n            pad_to_multiple_of=self.pad_to_multiple_of,\n            return_tensors=self.return_tensors,\n            return_attention_mask=self.return_attention_mask)\n        if \"label\" in batch:\n            batch[\"labels\"] = batch[\"label\"]\n            del batch[\"label\"]\n        if \"label_ids\" in batch:\n            batch[\"labels\"] = batch[\"label_ids\"]\n            del batch[\"label_ids\"]\n        return batch\n\n\ndef imagen_collate_fn(samples):\n    \"\"\" collate for imagen base64 \"\"\"\n    tmp = []\n    for i in samples:\n        if i and len(i['image']):\n            tmp.append(i)\n    samples = tmp\n\n    if len(samples) == 0:\n        return None\n\n    pad_idx = 0\n    text_items = [sample['caption'] for sample in samples]\n    image_items = [sample['image'] for sample in samples]\n    text_lengths = 
[len(cap) for cap in text_items]\n\n    bsz = len(text_items)\n    text_input = text_items\n\n    image_input = paddle.stack(image_items, axis=0)\n    _input = {'images': image_input, 'texts': text_input}\n    return _input\n"
  },
  {
    "path": "ppfleetx/distributed/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/distributed/apis/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/distributed/apis/amp.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections import defaultdict\nfrom types import MethodType\nimport numpy as np\n\nimport paddle\nimport paddle.nn as nn\nfrom paddle import _legacy_C_ops\nfrom paddle.fluid.dygraph import to_variable\nfrom paddle.fluid import framework\nfrom paddle.fluid.dygraph import base as imperative_base\nfrom paddle.framework import core\n\nfrom ppfleetx.distributed.apis import env\n\n\nclass MixPrecisionLayer(nn.Layer):\n    def __init__(self, layers, dtype=\"float16\"):\n        super().__init__(layers.full_name() + \"_mix_precision\")\n\n        self._layers = layers\n        self._dtype = dtype\n\n        assert self._dtype in [\"float16\", \"bfloat16\"]\n\n        for param in self._layers.parameters():\n            if not param.stop_gradient and not hasattr(param, \"main_grad\"):\n                setattr(param, \"main_grad\", None)\n                param._register_grad_hook(self._update_main_grad_hook(param))\n\n    def _update_main_grad_hook(self, param):\n        \"\"\"Create the update_main_grad hook for backprop.\"\"\"\n\n        # Hook used for back-prop and grad-merge.\n        @paddle.autograd.no_grad()\n        def param_hook(tmp_grad):\n            assert param.grad is None, \\\n                \"In main_grad node, param.grad should be None, but find param[{}] has grad.\".format(param.name)\n            if param.main_grad 
is None:\n                param.main_grad = core.eager.Tensor(\n                    value=tmp_grad.cast(paddle.float32).value(),\n                    place=tmp_grad.place,\n                    name=\"main_grad@\" + param.name, )\n            else:\n                param.main_grad.add_(tmp_grad.cast(paddle.float32))\n\n            tmp_grad._clear_data()\n            return None\n\n        return param_hook\n\n    def forward(self, *inputs, **kwargs):\n        outputs = self._layers(*inputs, **kwargs)\n\n        return outputs\n\n    def state_dict(\n            self,\n            destination=None,\n            include_sublayers=True,\n            structured_name_prefix=\"\", ):\n\n        return self._layers.state_dict(\n            destination=destination,\n            include_sublayers=include_sublayers,\n            structured_name_prefix=structured_name_prefix, )\n\n    @framework.deprecate_stat_dict\n    def set_state_dict(self, state_dict, use_structured_name=True):\n\n        self._layers.set_state_dict(\n            state_dict, use_structured_name=use_structured_name)\n\n\nclass MixPrecisionOptimizer:\n    def __init__(self, optimizer):\n        self._inner_opt = optimizer\n        self._parameter_list = self._obtain_optimizer_parameters_list()\n\n    def _obtain_optimizer_parameters_list(self):\n        if getattr(self._inner_opt, '_param_groups', None) and isinstance(\n                self._inner_opt._param_groups[0], dict):\n            parameters_list = []\n            for group in self._inner_opt._param_groups:\n                for param in group['params']:\n                    parameters_list.append(param)\n        else:\n            parameters_list = [\n                param for param in self._inner_opt._parameter_list\n            ]\n\n        return parameters_list\n\n    @imperative_base.no_grad\n    @framework.dygraph_only\n    def step(self):\n\n        if not isinstance(self._parameter_list[0], dict):\n            params_grads = []\n            
for param in self._parameter_list:\n                if param.stop_gradient:\n                    continue\n                grad_var = param.main_grad\n                if framework.in_dygraph_mode():\n                    if (hasattr(grad_var, \"is_selected_rows\") and\n                            grad_var.is_selected_rows() and\n                            self._inner_opt.regularization is not None):\n                        raise RuntimeError(\n                            \"AdamW don't support weight_decay with sparse parameters, please set it to None.\"\n                        )\n                else:\n                    if (hasattr(grad_var, \"_is_sparse\") and\n                            grad_var._is_sparse() and\n                            self._inner_opt.regularization is not None):\n                        raise RuntimeError(\n                            \"AdamW don't support weight_decay with sparse parameters, please set it to None.\"\n                        )\n                params_grads.append((param, grad_var))\n\n            optimize_ops = self._inner_opt._apply_optimize(\n                loss=None, startup_program=None, params_grads=params_grads)\n        else:\n            # optimize parameters in groups\n            for param_group in self._inner_opt._param_groups:\n                params_grads = defaultdict(lambda: list())\n                for param in param_group['params']:\n                    if param.stop_gradient:\n                        continue\n                    grad_var = param.main_grad\n                    if framework.in_dygraph_mode():\n                        if (hasattr(grad_var, \"is_selected_rows\") and\n                                grad_var.is_selected_rows() and\n                                self._inner_opt.regularization is not None):\n                            raise RuntimeError(\n                                \"AdamW don't support weight_decay with sparse parameters, please set it to None.\"\n                 
           )\n                    else:\n                        if (hasattr(grad_var, \"_is_sparse\") and\n                                grad_var._is_sparse() and\n                                self._inner_opt.regularization is not None):\n                            raise RuntimeError(\n                                \"AdamW don't support weight_decay with sparse parameters, please set it to None.\"\n                            )\n                    params_grads['params'].append((param, grad_var))\n                params_grads.update(\n                    {k: v\n                     for k, v in param_group.items() if k != 'params'})\n                self._apply_optimize(\n                    loss=None, startup_program=None, params_grads=params_grads)\n\n    @framework.dygraph_only\n    def clear_grad(self, set_to_zero=True):\n\n        param_list = []\n        if self._parameter_list is None or not isinstance(\n                self._parameter_list[0], dict):\n            for p in self._parameter_list:\n                if not p.stop_gradient:\n                    param_list.append(p)\n        else:\n            for param_group in self._param_groups:\n                for p in param_group['params']:\n                    if not p.stop_gradient:\n                        param_list.append(p)\n\n        for p in param_list:\n            if hasattr(p, \"main_grad\") and p.main_grad is not None:\n                if set_to_zero:\n                    p.main_grad.zero_()\n                else:\n                    p.main_grad._clear()\n                    p.main_grad = None\n            elif not hasattr(p, \"main_grad\"):\n                p.clear_gradient(set_to_zero)\n\n    def __getattr__(self, item):\n        return getattr(self._inner_opt, item)\n\n\ndef unscale_method(self, optimizer):\n    if not self._enable:\n        return\n    param_grads = []\n    if getattr(optimizer, '_param_groups', None) and isinstance(\n            optimizer._param_groups[0], dict):\n   
     for group in optimizer._param_groups:\n            for param in group['params']:\n                if param.main_grad is not None:\n                    assert param.main_grad.dtype == core.VarDesc.VarType.FP32\n                    param_grads.append(param.main_grad)\n    else:\n        for param in optimizer._parameter_list:\n            if param.main_grad is not None:\n                assert param.main_grad.dtype == core.VarDesc.VarType.FP32\n                param_grads.append(param.main_grad)\n\n    temp_found_inf = to_variable(np.array([0]).astype(np.bool_))\n    if len(param_grads):\n        _legacy_C_ops.check_finite_and_unscale(\n            param_grads,\n            self._scale,\n            param_grads,\n            temp_found_inf, )\n\n    self._found_inf = 1 if temp_found_inf else 0\n\n    hcg = env.get_hcg()\n    if hcg is not None and hcg.nranks > hcg.get_data_parallel_world_size():\n        is_found_inf = paddle.to_tensor([self._found_inf], dtype=\"int32\")\n        paddle.distributed.all_reduce(\n            is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None)\n        self._found_inf = is_found_inf.numpy()[0]\n\n\nclass MixPrecisionScaler:\n    def __init__(self, scaler):\n        self._inner_scaler = scaler\n        self._inner_scaler._unscale = MethodType(unscale_method, scaler)\n\n    def __getattr__(self, item):\n        return getattr(self._inner_scaler, item)\n"
  },
  {
    "path": "ppfleetx/distributed/apis/comm_groups.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.distributed import fleet\nfrom paddle.distributed.fleet.base.strategy_group import (\n    StrategyGroupBase,\n    DPGroup,\n    MPGroup,\n    PPGroup,\n    ShardingGroup, )\nfrom paddle.distributed.fleet.base.orthogonal_strategy import OrthogonalStrategy\n\n\ndef create_hcg(strategy, hcg_name):\n    if hcg_name == \"HybridCommunicateGroup\":\n        fleet.init(is_collective=True, strategy=strategy)\n        hcg = fleet.get_hybrid_communicate_group()\n    else:\n        dist.init_parallel_env()\n        hcg = eval(\"{}\".format(hcg_name))(strategy)\n\n    return hcg\n\n\nclass MoEGroup(StrategyGroupBase):\n    \"\"\"\n    The communication group strategy for expert parallel.\n    Args:\n        list_of_ranks: A 2D-array, such as `[[0, 1, 2, 3], [4, 5, 6, 7]]`. 
Ranks in sublist represents\n    they are in the same communication group.\n    Returns:\n        The instance of expert parallel strategy group.\n    \"\"\"\n\n    def __init__(self, list_of_ranks):\n        super(MoEGroup, self).__init__(list_of_ranks)\n        assert not isinstance(\n            self.group,\n            list), \"Rank {} belongs to multi moe groups\".format(self._rank)\n\n\nclass Hybrid4DCommGroup(OrthogonalStrategy):\n    def __init__(self, list_of_strategy=None, fused_strategy_dict={}):\n        list_of_strategy = [\n            (\"dp\", 1, DPGroup),\n            (\"mp\", 1, MPGroup),\n            (\"pp\", 1, PPGroup),\n            (\"sharding\", 1, ShardingGroup),\n        ] if list_of_strategy is None else list_of_strategy\n\n        fused_strategy_dict[\"check\"] = [\"mp\", \"pp\"]\n\n        super().__init__(list_of_strategy, fused_strategy_dict)\n\n    # data parallel\n    def get_data_parallel_rank(self):\n        return self.rank_in_strategy(\"dp\")\n\n    def get_data_parallel_world_size(self):\n        return self.strategy_group(\"dp\").world_size\n\n    def get_data_parallel_group(self):\n        return self.strategy_group(\"dp\").group\n\n    def get_data_parallel_group_src_rank(self):\n        return self.strategy_group(\"dp\").group.ranks[0]\n\n    # tensor parallel\n    def get_model_parallel_rank(self):\n        return self.rank_in_strategy(\"mp\")\n\n    def get_model_parallel_world_size(self):\n        return self.strategy_group(\"mp\").world_size\n\n    def get_model_parallel_group(self):\n        return self.strategy_group(\"mp\").group\n\n    def get_model_parallel_group_src_rank(self):\n        return self.strategy_group(\"mp\").group.ranks[0]\n\n    # pipeline parallel\n    def get_stage_id(self):\n        return self.rank_in_strategy(\"pp\")\n\n    def get_pipe_parallel_world_size(self):\n        return self.strategy_group(\"pp\").world_size\n\n    def get_pipe_parallel_group(self):\n        return 
self.strategy_group(\"pp\").group\n\n    def get_p2p_groups(self):\n        return (self.strategy_group(\"pp\").p2p_groups)\n\n    # group sharded parallel\n    def get_sharding_parallel_rank(self):\n        return self.rank_in_strategy(\"sharding\")\n\n    def get_sharding_parallel_world_size(self):\n        return self.strategy_group(\"sharding\").world_size\n\n    def get_sharding_parallel_group(self):\n        return self.strategy_group(\"sharding\")\n\n    def get_sharding_parallel_group_src_rank(self):\n        return self.strategy_group(\"sharding\").ranks[0]\n\n    # check parallel group\n    def get_check_parallel_group(self):\n        return self.strategy_group(\"check\").group\n\n\nclass HybridCommGroupForMoE(Hybrid4DCommGroup):\n    def __init__(self, strategy):\n        self._dp_degree = strategy.hybrid_configs.get(\"dp_degree\", 1)\n        self._mp_degree = strategy.hybrid_configs.get(\"mp_degree\", 1)\n        self._pp_degree = strategy.hybrid_configs.get(\"pp_degree\", 1)\n        self._sharding_degree = strategy.hybrid_configs.get(\"sharding_degree\",\n                                                            1)\n\n        assert self._pp_degree == 1, \"The strategy combination of moe and pp \\\n            has not been supported in ppfleetx right now.\"\n\n        assert self._sharding_degree == 1, \"The strategy combination of moe and sharding \\\n            has not been supported in ppfleetx right now.\"\n\n        list_of_strategy = [\n            (\"dp\", self._dp_degree, DPGroup),\n            (\"mp\", self._mp_degree, MPGroup),\n            (\"pp\", self._pp_degree, PPGroup),\n            (\"sharding\", self._sharding_degree, ShardingGroup),\n        ]\n        fused_strategy_dict = {\"moe\": [\"dp\", \"mp\"]}\n\n        super().__init__(list_of_strategy, fused_strategy_dict)\n\n    def get_expert_parallel_world_size(self):\n        return self.fused_strategy_group(\"moe\").world_size\n\n    def get_expert_parallel_group(self):\n        
return self.fused_strategy_group(\"moe\").group\n"
  },
  {
    "path": "ppfleetx/distributed/apis/env.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport random\nimport numpy as np\n\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.distributed import fleet\nfrom paddle.distributed.fleet.meta_parallel import get_rng_state_tracker\n\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.distributed.apis import comm_groups\n\n__all__ = ['init_dist_env']\n\n_seed = None\n_dp_seed = None\n_hcg = None\n\n\ndef set_seed(seed):\n    # NOTE(shenliang03): For parameter init seed:\n    # seed: dp/mp_undistributed_paramter/sharding is same; others is different\n    # For compute seed(dropout):\n    # global seed: only mp group is same.\n    # local seed: all groups are different\n\n    if dist.get_world_size() > 1:\n        # obtain rank message of hybrid parallel\n        hcg = get_hcg()\n\n        mp_rank = hcg.get_model_parallel_rank()\n        mp_size = hcg.get_model_parallel_world_size()\n\n        pp_rank = hcg.get_stage_id()\n        pp_size = hcg.get_pipe_parallel_world_size()\n\n        dp_rank = hcg.get_data_parallel_rank()\n        dp_size = hcg.get_data_parallel_world_size()\n\n        sharding_rank = hcg.get_sharding_parallel_rank()\n        sharding_size = hcg.get_sharding_parallel_world_size()\n    else:\n        mp_rank, mp_size = 0, 1\n        pp_rank, pp_size = 0, 1\n        dp_rank, dp_size = 0, 1\n        sharding_rank, sharding_size = 0, 1\n\n  
  # NOTE: the commented seeds are set only for precision validation\n    # seed += 100 * pp_rank\n    random.seed(seed + 100 * pp_rank)\n    np.random.seed(seed + 100 * pp_rank)\n\n    # seed = mp_rank + \n    #        pp_rank * (mp_size) + \n    #        dp_rank * (mp_size * pp_size) + \n    #        sharding_rank * (mp_size * pp_size * dp_size)\n    # seed offset is order to avoid conflicts with the parameter initialization seed\n\n    seed_offset = seed + 1024 + paddle.distributed.get_world_size()\n    global_seed = seed_offset + \\\n                  pp_rank * (mp_size) + \\\n                  dp_rank * (mp_size * pp_size) + \\\n                  sharding_rank * (mp_size * pp_size * dp_size)\n\n    seed_offset += paddle.distributed.get_world_size()\n    local_seed = seed_offset + \\\n                 mp_rank + \\\n                 pp_rank * (mp_size) + \\\n                 dp_rank * (mp_size * pp_size) + \\\n                 sharding_rank * (mp_size * pp_size * dp_size)\n\n    tracker = get_rng_state_tracker()\n    tracker.add('global_seed', global_seed)\n    tracker.add('local_seed', local_seed)\n\n    paddle.seed(global_seed)\n\n    logger.info(\"The global seed is set to {} and local seed is set to {}.\".\n                format(global_seed, local_seed))\n\n    global _seed\n    global _dp_seed\n    _seed = seed\n    _dp_seed = global_seed\n\n\ndef set_hcg(hcg):\n    global _hcg\n    _hcg = hcg\n\n\ndef get_hcg():\n    global _hcg\n    return _hcg\n\n\ndef get_seed():\n    global _seed\n    return _seed\n\n\ndef get_dp_seed():\n    global _dp_seed\n    return _dp_seed\n\n\ndef init_dist_env(config):\n    paddle.set_device(config.Global.device)\n\n    strategy = fleet.DistributedStrategy()\n    strategy.hybrid_configs = {\n        \"dp_degree\": config.Distributed.dp_degree,\n        \"mp_degree\": config.Distributed.mp_degree,\n        \"pp_degree\": config.Distributed.pp_degree,\n        \"sharding_degree\": config.Distributed.sharding.sharding_degree,\n    
}\n\n    if config.Distributed.pp_degree > 1:\n        if 'sequence_parallel' in config.Model:\n            if config.Model.sequence_parallel:\n                assert config.Global.enable_partial_send_recv is False, \\\n                    \"if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, \" \\\n                    \"config.Global.enable_partial_send_recv should be set False.\"\n\n    strategy.pipeline_configs = {\n        \"accumulate_steps\":\n        config.Global.local_batch_size // config.Global.micro_batch_size,\n        \"micro_batch_size\": config.Global.micro_batch_size,\n        \"enable_partial_send_recv\": config.Global.enable_partial_send_recv,\n    }\n\n    # set control in tensor parallel\n    seed = config.Global.seed\n    strategy.tensor_parallel_configs = {\"tensor_init_seed\": seed}\n\n    hcg = comm_groups.create_hcg(strategy, hcg_name=config.Distributed.hcg)\n    set_hcg(hcg)\n\n\ndef get_local_rank():\n    return int(os.getenv(\"PADDLE_RANK_IN_NODE\", 0))\n\n\ndef get_data_world_size():\n    if paddle.distributed.get_world_size() == 1:\n        return 1\n\n    hcg = get_hcg()\n    dp_size = hcg.get_data_parallel_world_size()\n    sharding_size = hcg.get_sharding_parallel_world_size()\n\n    return dp_size * sharding_size\n\n\ndef get_data_world_rank():\n    if paddle.distributed.get_world_size() == 1:\n        return 0\n\n    hcg = get_hcg()\n    dp_rank = hcg.get_data_parallel_rank()\n    sharding_rank = hcg.get_sharding_parallel_rank()\n    sharding_size = hcg.get_sharding_parallel_world_size()\n\n    return dp_rank * sharding_size + sharding_rank\n\n\ndef work_at_local_rank0(func):\n    def wrapper(*args, **kwargs):\n        local_rank = 0\n        if paddle.fluid.core.is_compiled_with_dist(\n        ) and paddle.distributed.get_world_size() > 1:\n            local_rank = paddle.distributed.ParallelEnv().dev_id\n        if local_rank == 0:\n            func(*args, **kwargs)\n\n    return wrapper\n"
  },
  {
    "path": "ppfleetx/distributed/apis/io.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport random\nimport numpy as np\n\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.distributed import fleet\nfrom paddle.incubate.distributed.utils.io import save_for_auto_inference\n\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.distributed.apis import env\n\n\ndef save(output_dir, model, optimizer=None, step=0, epoch=0, sharding_stage=2):\n    \"\"\"\n    save the state dicts of model and optimizer into an checkpoint.\n    \"\"\"\n\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        hcg = env.get_hcg()\n\n        dp_rank = hcg.get_data_parallel_rank()\n        mp_rank = hcg.get_model_parallel_rank()\n        pp_rank = hcg.get_stage_id()\n        sharding_rank = hcg.get_sharding_parallel_rank()\n    else:\n        dp_rank = 0\n\n    if dp_rank != 0:\n        logger.info(\"DP_Rank %d doesn't save model\" % dp_rank)\n        return\n\n    if output_dir and isinstance(output_dir, str):\n        output_dir = os.path.join(output_dir,\n                                  \"epoch_%d_step_%d\" % (epoch, step))\n\n        if not os.path.exists(output_dir):\n            os.makedirs(output_dir, exist_ok=True)\n        logger.info(\"Save model to %s\" % output_dir)\n\n        save_dir = \"{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}\".format(\n            output_dir, mp_rank, sharding_rank,\n          
  pp_rank) if nranks > 1 else output_dir\n\n        if sharding_stage == 3:\n            model.get_all_parameters(convert2cpu=False)\n\n        paddle.save(model.state_dict(),\n                    os.path.join(save_dir, \"model.pdparams\"))\n\n        if optimizer is not None:\n            paddle.save(optimizer.state_dict(),\n                        os.path.join(save_dir, \"model_state.pdopt\"))\n\n        meta_dict = {\n            \"epoch\": epoch,\n            \"step\": step,\n            \"cuda_rng_state\": paddle.get_cuda_rng_state()\n        }\n        paddle.save(meta_dict, os.path.join(save_dir, \"meta_state.pdopt\"))\n\n        save_auto_dir = os.path.join(output_dir, \"auto_infer\")\n        save_for_auto_inference(os.path.join(save_auto_dir, \"auto\"), model)\n\n    else:\n        raise TypeError(\"`save` requires a valid value of `output_dir`.\")\n\n\ndef load(ckpt_dir, model, optimizer=None, mode='train', load_recovery=None):\n    nranks = dist.get_world_size()\n    if nranks > 1:\n        hcg = env.get_hcg()\n\n        dp_rank = hcg.get_data_parallel_rank()\n        mp_rank = hcg.get_model_parallel_rank()\n        pp_rank = hcg.get_stage_id()\n        sharding_rank = hcg.get_sharding_parallel_rank()\n    else:\n        dp_rank = 0\n\n    load_recovery = {} if load_recovery is None else load_recovery\n\n    if ckpt_dir and isinstance(ckpt_dir, str):\n        logger.info(\"Try to load checkpoint from %s \" % ckpt_dir)\n\n        if mode == 'quant':\n            load_dir = ckpt_dir\n        else:\n            load_dir = \"{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}\".format(\n                ckpt_dir, mp_rank, sharding_rank,\n                pp_rank) if nranks > 1 else ckpt_dir\n        model_path = os.path.join(load_dir, \"model.pdparams\")\n        opt_path = os.path.join(load_dir, \"model_state.pdopt\")\n        meta_path = os.path.join(load_dir, \"meta_state.pdopt\")\n\n        if os.path.exists(model_path):\n            model_dict = 
paddle.load(model_path)\n            for name, param in model.state_dict().items():\n                assert name in model_dict.keys(\n                ), \"No param named `{}` was found in checkpoint file.\".format(\n                    name)\n\n                if param.dtype != model_dict[name].dtype:\n                    model_dict[name] = model_dict[name].cast(param.dtype)\n\n            model.set_state_dict(model_dict)\n        else:\n            raise ValueError(\"No model checkpoint file found in %s.\" %\n                             model_path)\n\n        if mode == 'train':\n            if os.path.exists(opt_path):\n                opt_dict = paddle.load(opt_path)\n                optimizer.set_state_dict(opt_dict)\n            else:\n                raise ValueError(\"No optimizer checkpoint file found in %s.\" %\n                                 opt_path)\n\n            if os.path.exists(meta_path):\n                meta_dict = paddle.load(meta_path)\n\n                load_recovery.update({\n                    'step': meta_dict['step'],\n                    'epoch': meta_dict['epoch'],\n                    'rng_state': meta_dict['cuda_rng_state']\n                })\n\n            else:\n                raise ValueError(\"No meta checkpoint file found in %s.\" %\n                                 meta_path)\n\n        logger.info(\"successfully load checkpoints\")\n    else:\n        logger.warning(\"`load` requires a valid value of `ckpt_dir`.\")\n        raise TypeError(\"`load` requires a valid value of `ckpt_dir`.\")\n"
  },
  {
    "path": "ppfleetx/distributed/apis/strategy.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.distributed as dist\nimport paddle.distributed.fleet as fleet\n\nfrom paddle.distributed.parallel import sync_params_buffers\nfrom paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients\nfrom paddle.distributed.fleet.meta_parallel import TensorParallel\nfrom paddle.distributed.sharding import group_sharded_parallel\n\nfrom ppfleetx.distributed.apis import env, amp\nfrom ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters\n\n\ndef wrap_with_fleet(dist_config, model, optimizer=None, scaler=None):\n    if dist_config.sharding.sharding_stage in [2, 3]:\n        assert dist_config.pp_degree == 1, \\\n            \"sharding stage2/3 will support pipeline parallel later\"\n        return wrap_sharding_2_3(dist_config, model, optimizer, scaler)\n    else:\n        return wrap_3D_parallel(dist_config, model, optimizer, scaler)\n\n\ndef wrap_sharding_2_3(dist_config, model, optimizer=None, scaler=None):\n    hcg = env.get_hcg()\n    dp_group = hcg.get_data_parallel_group()\n    sharding_group = hcg.get_sharding_parallel_group()\n\n    if dist_config.dp_degree > 1 and dist_config.sharding.sharding_stage == 3:\n        sync_params_buffers(\n            model, comm_group=dp_group, src_rank=dp_group.ranks[0])\n\n    if dist_config.mp_degree > 1:\n        assert 
dist_config.sharding.sharding_stage == 2, \"only support mp + sharding stage2 hybrid parallel now.\"\n        model = TensorParallel(model, hcg, strategy=None)\n\n    level = \"p_g_os\" if dist_config.sharding.sharding_stage == 3 else \"os_g\"\n    origin_model = model\n    model, optimizer, scaler = group_sharded_parallel(\n        model=model,\n        optimizer=optimizer,\n        level=level,\n        scaler=scaler,\n        group=sharding_group,\n        offload=dist_config.sharding.sharding_offload,\n        dp_group=dp_group if dp_group.nranks > 1 else None)\n\n    if dist_config.sharding.reduce_overlap:\n        model._set_reduce_overlap(dist_config.sharding.reduce_overlap)\n\n    if dist_config.sharding.broadcast_overlap:\n        optimizer._set_broadcast_overlap(\n            dist_config.sharding.broadcast_overlap,\n            layers=origin_model,\n            num_groups=2)\n\n    return model, optimizer, scaler\n\n\ndef wrap_3D_parallel(dist_config, model, optimizer=None, scaler=None):\n    hcg = env.get_hcg()\n    dp_group = hcg.get_data_parallel_group()\n\n    if isinstance(model, amp.MixPrecisionLayer):\n        if dist.get_world_size() == dist_config.dp_degree:\n            sync_params_buffers(\n                model, comm_group=dp_group, src_rank=dp_group.ranks[0])\n        elif dist_config.pp_degree > 1:\n            model = fleet.distributed_model(model._layers)\n    else:\n        model = fleet.distributed_model(model)\n\n    optimizer = fleet.distributed_optimizer(\n        optimizer) if optimizer is not None else optimizer\n    scaler = fleet.distributed_scaler(scaler) if scaler is not None else scaler\n\n    return model, optimizer, scaler\n"
  },
  {
    "path": "ppfleetx/distributed/protein_folding/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and \n# limitations under the License.\n\nfrom . scg import scg\n"
  },
  {
    "path": "ppfleetx/distributed/protein_folding/bp.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" Branch Parallel helper function\"\"\"\n\nimport paddle\nfrom paddle.autograd import PyLayer\nfrom . import scg\n\n__all__ = [\n    'get_world_size',\n    'get_rank_in_group',\n    ]\n\ndef get_world_size():\n    nranks = 1\n    if hasattr(scg, \"bp_group\"):\n        nranks = scg.bp_group.nranks\n    return nranks\n\n\ndef get_rank_in_group():\n    rank = 0\n    if hasattr(scg, \"get_rank_in_bp_group\"):\n        rank = scg.get_rank_in_bp_group()\n    return rank\n\n@paddle.no_grad()\ndef broadcast(tensor, src):\n    \"\"\" broadcast tensor from src rank in bp group \"\"\"\n    if get_world_size() == 1:\n        return tensor\n  \n    assert src in [0, 1], \"Branch Parallel is only support bp_degree=2 now!\"\n  \n    group = scg.bp_group\n    task = group.process_group.broadcast(tensor, src)\n    task.wait()\n    return tensor\n\nclass BroadcastGrad(PyLayer):\n    \"\"\" A PyLayer Op broadcast gradient in backward stage \"\"\"\n    @staticmethod\n    def forward(ctx, input, src):\n        \"\"\" return input directly \"\"\" \n        ctx.src = src\n        return input.clone()\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        \"\"\" broadcast grad form src \"\"\" \n        broadcast(grad_output, ctx.src)\n        return grad_output.clone()\n\ndef broadcast_grad_for_backward(input, src):\n    \"\"\" a warpper 
for boradcast gradient in backward stage \"\"\"\n    if get_world_size() == 1:\n        return input\n\n    if not input.stop_gradient:\n        output = BroadcastGrad.apply(input, src)\n    else:\n        output = input.clone()\n    return output\n\n@paddle.no_grad()\ndef all_reduce(tensor):\n    \"\"\" allreduce a tensor in bp group \"\"\"\n    if get_world_size() == 1:\n        return tensor\n\n    group = scg.bp_group\n    paddle.distributed.all_reduce(\n        tensor, sync_op=True, group=group)\n\n    return tensor\n\n\n\nclass SyncEvoformerResults(PyLayer):\n    \"\"\" A PyLayer Op broadcast gradient in backward stage \"\"\"\n    @staticmethod\n    def forward(ctx, outer, msa, pair):\n        broadcast(outer, 0)\n        if get_rank_in_group() == 1:\n            pair += outer\n        broadcast(pair, 1)\n        broadcast(msa, 0)\n        return msa, pair\n\n    @staticmethod\n    def backward(ctx, *grad_output):\n        msa_grad = grad_output[0]\n        pair_grad = grad_output[1]\n\n        if get_rank_in_group() == 0:\n            pair_grad = paddle.zeros_like(pair_grad)\n\n        outer_grad = pair_grad.clone()\n        broadcast(outer_grad, 1)\n        \n        return outer_grad, msa_grad, pair_grad\n\ndef sync_evoformer_results(outer, msa, pair):\n    \"\"\" a warpper for boradcast gradient in backward stage \"\"\"\n    if get_world_size() == 1:\n        return msa, pair\n\n    if outer.stop_gradient and msa.stop_gradient and pair.stop_gradient:\n        return msa, pair\n\n    msa, pair = SyncEvoformerResults.apply(outer, msa, pair)\n        \n    return msa, pair\n\n@paddle.no_grad()\ndef grad_sync(param_groups):\n    \"\"\"\n        sync the gradients of params\n    \"\"\"\n\n    nranks = get_world_size()\n\n    if nranks < 2:\n        return\n\n    comm_group = scg.bp_group\n\n    for group in param_groups:\n        if group.get(\"bp\", False):\n            for p in group['params']:\n                if p.is_distributed:\n                    
continue\n\n                grad = p.grad\n                if grad is None:\n                    continue\n\n                paddle.distributed.all_reduce(\n                    grad, sync_op=True, group=comm_group)\n\n    return None\n"
  },
  {
    "path": "ppfleetx/distributed/protein_folding/dap.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nDynamic Axial Parallelism and Duality Async Operation helper functions\npaper ref: FastFold: Reducing AlphaFold Training Time from 11 Days to 67 Hours, https://arxiv.org/abs/2203.00854\ncode ref: https://github.com/hpcaitech/FastFold.git\n\"\"\"\n\nimport warnings\nimport time\nimport paddle\nfrom paddle import nn\nfrom paddle import distributed as dist\nfrom paddle.autograd import PyLayer\nfrom . 
import scg\n\n__all__ = [\n    'set_dap_sync_op', 'get_dap_sync_op', 'get_world_size',\n    'get_rank_in_group', 'scatter', 'gather', 'all_gather', 'all_gather_opp',\n    'all_to_all', 'all_to_all_opp', 'row_to_col', 'col_to_row'\n]\n\n_sync_op = True\n\n\ndef set_dap_sync_op(sync_op):\n    assert sync_op in [True, False]\n    assert sync_op is True, \"Only support sync mode now!\"\n    global _sync_op\n    _sync_op = sync_op\n\n\ndef get_dap_sync_op():\n    global _sync_op\n    return _sync_op\n\n\ndef get_world_size():\n    nranks = 1\n    if hasattr(scg, \"dap_group\"):\n        nranks = scg.dap_group.nranks\n    return nranks\n\n\ndef get_rank_in_group():\n    rank = 0\n    if hasattr(scg, \"get_rank_in_dap_group\"):\n        rank = scg.get_rank_in_dap_group()\n    return rank\n\n\ndef ensure_divisibility(numerator, denominator):\n    \"\"\"Ensure that numerator is divisible by the denominator.\"\"\"\n    assert numerator % denominator == 0, '{} is not divisible by {}'.format(\n        numerator, denominator)\n\n\ndef divide(numerator, denominator):\n    ensure_divisibility(numerator, denominator)\n    return numerator // denominator\n\n\n@paddle.no_grad()\ndef _all_gather(tensor, axis=-1, sync_op=True):\n    group = scg.dap_group\n    tensor_shape = list(tensor.shape)\n    tensor_shape[0] *= group.nranks\n    out = paddle.zeros(tensor_shape, tensor.dtype)\n    out.stop_gradient = tensor.stop_gradient\n    task = group.process_group.all_gather(tensor, out)\n    task.wait()\n    return out\n\n\n@paddle.no_grad()\ndef _gather(tensor, axis=-1):\n    output = _all_gather(tensor)\n    if axis != 0:\n        output = paddle.concat(\n            paddle.split(\n                output, get_world_size(), axis=0), axis=axis)\n    return output\n\n\n@paddle.no_grad()\ndef _split(tensor, axis=-1):\n    ensure_divisibility(tensor.shape[axis], get_world_size())\n    tensor_list = paddle.split(tensor, get_world_size(), axis=axis)\n\n    output = 
tensor_list[get_rank_in_group()]\n\n    return output\n\n\nclass Scatter(PyLayer):\n    \"\"\" Scatter PyLayer Op\"\"\"\n\n    @staticmethod\n    def forward(ctx, input, axis: -1):\n        ctx.axis = axis\n        return _split(input, axis=axis)\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        return _gather(grad_output, axis=ctx.axis)\n\n\ndef scatter(input, axis=-1):\n    \"\"\" split a tensor according axis by dap size \"\"\"\n    if get_world_size() == 1:\n        return input\n\n    if not input.stop_gradient:\n        output = Scatter.apply(input, axis=axis)\n    else:\n        output = _split(input, axis=axis)\n    return output\n\n\nclass Gather(PyLayer):\n    \"\"\" Gather PyLayer Op \"\"\"\n\n    @staticmethod\n    def forward(ctx, input, axis=-1):\n        ctx.axis = axis\n        return _gather(input, axis=axis)\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        return _split(grad_output, axis=ctx.axis)\n\n\ndef gather(input, axis=-1):\n    \"\"\" gather tensor form all rank in dap group in axis \"\"\"\n    if get_world_size() == 1:\n        return input\n\n    if not input.stop_gradient:\n        output = Gather.apply(input, axis=axis)\n    else:\n        output = _gather(input, axis=axis)\n    return output\n\n\n@paddle.no_grad()\ndef _reduce_scatter(tensor, sync_op=True):\n    group = scg.dap_group\n    tensor_shape = list(tensor.shape)\n    tensor_shape[0] = divide(tensor_shape[0], group.nranks)\n    output = paddle.zeros(tensor_shape, tensor.dtype)\n    output.stop_gradient = tensor.stop_gradient\n    dist.stream.reduce_scatter(\n        output, tensor, op=dist.ReduceOp.SUM, group=group, sync_op=True)\n    return output\n\n\nclass AllGather(PyLayer):\n    \"\"\" AllGather PyLayer Op \"\"\"\n\n    @staticmethod\n    def forward(ctx, input, axis=-1, sync_op=True):\n        ctx.axis = axis\n        ctx.sync_op = sync_op\n        output = _all_gather(input, axis=axis, sync_op=sync_op)\n        return output\n\n    
@staticmethod\n    def backward(ctx, grad_output):\n        if not ctx.sync_op:\n            pass\n            # TODO(GuoxiaWang): implement wait logical\n        return grad_output\n\n\nclass AllGather_Opp(PyLayer):\n    \"\"\" Duality Async Operation for AllGather \"\"\"\n\n    @staticmethod\n    def forward(ctx, input, axis=-1, sync_op=True):\n        ctx.axis = axis\n        ctx.sync_op = sync_op\n        return input\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        output = _reduce_scatter(grad_output, sync_op=ctx.sync_op)\n        return output\n\n\ndef all_gather(input, axis=-1):\n    \"\"\" gather tensors from all rank in dap group and all get the result.\n        if sync_op=None, sync will be assign according init_dap setting.\n\n        when using async communication, sync_op=False, do not use the output as same as input.\n        E.g. do not use `a = all_gather(a, ...)`, recommend to use `b = all_gather(a, ...)`\n    \"\"\"\n    if get_world_size() == 1:\n        return input\n\n    sync_op = get_dap_sync_op()\n\n    if not input.stop_gradient:\n        output = AllGather.apply(input, axis, sync_op=sync_op)\n    else:\n        output = _all_gather(input, axis, sync_op=sync_op)\n    return output\n\n\ndef all_gather_opp(output, axis=-1):\n    \"\"\" Duality Async Operation for all_gather.\n        if sync_op=None, sync will be assign according init_dap setting.\n    \"\"\"\n    nranks = get_world_size()\n    if nranks == 1:\n        return output\n\n    sync_op = get_dap_sync_op()\n\n    if not sync_op:\n        # TODO(GuoxiaWang): implement wait logical\n        pass\n\n    if not output.stop_gradient:\n        output = AllGather_Opp.apply(output, axis, sync_op=sync_op)\n\n    if axis != 0:\n        output = paddle.concat(paddle.split(output, nranks, 0), axis=axis)\n\n    return output\n\n\n@paddle.no_grad()\ndef _all_to_all(tensor, in_axis=-1, out_axis=-1, sync_op=True):\n    group = scg.dap_group\n    tensor_shape = 
list(tensor.shape)\n\n    out = paddle.zeros(tensor_shape, tensor.dtype)\n    out.stop_gradient = tensor.stop_gradient\n    task = group.process_group.alltoall(tensor, out)\n    task.wait()\n\n    return out\n\n\nclass All_to_All(PyLayer):\n    \"\"\" All_to_All PyLayer Op\"\"\"\n\n    @staticmethod\n    def forward(ctx, input, in_axis=-1, out_axis=-1, sync_op=True):\n        ctx.in_axis = in_axis\n        ctx.out_axis = out_axis\n        ctx.sync_op = sync_op\n        return _all_to_all(\n            input, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op)\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        if not ctx.sync_op:\n            # TODO(GuoxiaWang): implement wait logical\n            pass\n        return grad_output\n\n\nclass All_to_All_Opp(PyLayer):\n    \"\"\" Duality Async Operation for All_to_All \"\"\"\n\n    @staticmethod\n    def forward(ctx, output, in_axis=-1, out_axis=-1, sync_op=True):\n        ctx.in_axis = in_axis\n        ctx.out_axis = out_axis\n        ctx.sync_op = sync_op\n        return output\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        return _all_to_all(\n            grad_output,\n            in_axis=ctx.out_axis,\n            out_axis=ctx.in_axis,\n            sync_op=ctx.sync_op)\n\n\ndef all_to_all(input, in_axis, out_axis):\n    \"\"\" all to all according in_axis and out_axis.\n        if sync_op=None, sync will be assign according init_dap setting.\n    \"\"\"\n    if get_world_size() == 1:\n        return input\n\n    sync_op = get_dap_sync_op()\n\n    if in_axis != 0:\n        ensure_divisibility(input.shape[in_axis], get_world_size())\n        input = paddle.concat(\n            paddle.split(\n                input, get_world_size(), axis=in_axis), axis=0)\n\n    if not input.stop_gradient:\n        output = All_to_All.apply(\n            input, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op)\n    else:\n        output = _all_to_all(\n            input, in_axis=in_axis, 
out_axis=out_axis, sync_op=sync_op)\n\n    return output\n\n\ndef all_to_all_opp(output, in_axis, out_axis):\n    \"\"\" Duality Async Operation for all_to_all.\n        if sync_op=None, sync will be assign according init_dap setting.\n    \"\"\"\n    if get_world_size() == 1:\n        return output\n\n    sync_op = get_dap_sync_op()\n\n    if not sync_op:\n        # TODO(GuoxiaWang): implement wait logical\n        pass\n\n    if not output.stop_gradient:\n        output = All_to_All_Opp.apply(\n            output, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op)\n\n    if out_axis != 0:\n        ensure_divisibility(output.shape[0], get_world_size())\n        output = paddle.concat(\n            paddle.split(\n                output, get_world_size(), axis=0), axis=out_axis)\n\n    return output\n\n\nclass All2All(PyLayer):\n    @staticmethod\n    def forward(ctx, input, in_axis=-1, out_axis=-1):\n        ctx.in_axis = in_axis\n        ctx.out_axis = out_axis\n        return _all_to_all(input, in_axis=in_axis, out_axis=out_axis)\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        return _all_to_all(\n            grad_output, in_axis=ctx.out_axis, out_axis=ctx.in_axis)\n\n\ndef row_to_col(input):\n    \"\"\" N, S, R, C => N, R, S, C using sync all_to_all \"\"\"\n    if get_world_size() == 1:\n        return input\n\n    ensure_divisibility(input.shape[2], get_world_size())\n    input = paddle.concat(\n        paddle.split(\n            input, get_world_size(), axis=2), axis=0)\n\n    if not input.stop_gradient:\n        output = All2All.apply(input, in_axis=2, out_axis=1)\n    else:\n        output = _all_to_all(input, in_axis=2, out_axis=1)\n\n    output = paddle.concat(\n        paddle.split(\n            output, get_world_size(), axis=0), axis=1)\n    return output\n\n\ndef col_to_row(input):\n    \"\"\" N, R, S, C => N, S, R, C using sync all_to_all \"\"\"\n    if get_world_size() == 1:\n        return input\n\n    
ensure_divisibility(input.shape[1], get_world_size())\n    input = paddle.concat(\n        paddle.split(\n            input, get_world_size(), axis=1), axis=0)\n\n    if not input.stop_gradient:\n        output = All2All.apply(input, in_axis=1, out_axis=2)\n    else:\n        output = _all_to_all(input, in_axis=1, out_axis=2)\n\n    output = paddle.concat(\n        paddle.split(\n            output, get_world_size(), axis=0), axis=2)\n    return output\n\n\n@paddle.no_grad()\ndef grad_sync(param_groups):\n    \"\"\"\n        sync the gradients of params\n    \"\"\"\n\n    nranks = get_world_size()\n\n    if nranks < 2:\n        return\n\n    comm_group = scg.dap_group\n\n    for group in param_groups:\n        if group.get(\"dap\", False):\n            for p in group['params']:\n                if p.is_distributed:\n                    continue\n\n                grad = p.grad\n                if grad is None:\n                    continue\n\n                paddle.distributed.all_reduce(\n                    grad, sync_op=True, group=comm_group)\n\n    return None\n"
  },
  {
    "path": "ppfleetx/distributed/protein_folding/dp.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nDistributed Data Parallel helper functions\n\"\"\"\n\nimport paddle\nfrom . import scg\n\n__all__ = [\n    'get_world_size',\n    'get_rank_in_group',\n    'grad_sync',\n    'param_sync'\n    ]\n\ndef get_world_size():\n    nranks = 1\n    if hasattr(scg, \"dp_group\"):\n        nranks = scg.dp_group.nranks\n    return nranks\n\n\ndef get_rank_in_group():\n    rank = 0\n    if hasattr(scg, \"get_rank_in_dp_group\"):\n        rank = scg.get_rank_in_dp_group()\n    return rank\n\n@paddle.no_grad()\ndef grad_sync(param_groups, grad_avg=True):\n    \"\"\"\n        sync the gradients of params\n    \"\"\"\n    \n    nranks = get_world_size()\n\n    if nranks < 2:\n        return\n\n    comm_group = scg.dp_group\n\n    for group in param_groups:\n        for p in group['params']:\n            if p.is_distributed:\n                continue\n\n            grad = p.grad\n            if grad is None:\n                continue\n\n            paddle.distributed.all_reduce(\n                grad, sync_op=True, group=comm_group)\n            if grad_avg:\n                grad = p.grad.scale_(1.0 / nranks)\n\n    return None\n\n\n@paddle.no_grad()\ndef param_sync(model, src_rank=0, comm_group=None):\n    \"\"\"\n        broadcast params to other ranks\n    \"\"\"\n\n    nranks = paddle.distributed.get_world_size(\n    ) if comm_group is 
None else comm_group.nranks\n\n    if nranks < 2:\n        return\n\n    for _, param in model._obtain_parameters_buffers().items():\n\n        if param.is_distributed:\n            continue\n\n        if getattr(param, \"no_sync\", False):\n            continue\n\n        paddle.distributed.broadcast(\n            param, src=src_rank, group=comm_group, sync_op=True)\n\n    return None\n\n\n@paddle.no_grad()\ndef all_reduce(tensor, op=paddle.distributed.ReduceOp.SUM):\n    \"\"\" allreduce a tensor in bp group \"\"\"\n    if get_world_size() == 1:\n        return tensor\n\n    group = scg.dp_group\n    paddle.distributed.all_reduce(\n        tensor, sync_op=True, op=op, group=group)\n\n    return tensor\n"
  },
  {
    "path": "ppfleetx/distributed/protein_folding/scg.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nCommunication group manager\n\"\"\"\nimport types\nimport numpy as np\nfrom paddle import distributed as dist\n\n\ndef ensure_divisibility(numerator, denominator):\n    \"\"\"Ensure that numerator is divisible by the denominator.\"\"\"\n    assert numerator % denominator == 0, '{} is not divisible by {}'.format(\n        numerator, denominator)\n\n\nclass SingletonCommunicationGroup(object):\n    \"\"\" A singleton communication group for hybrid parallel. \"\"\"\n\n    def __init__(self):\n        self.initialized = False\n\n    def init_process_group(self,\n                           parallel_degree=[('dp', None)],\n                           custom_parallel_degree=None):\n        \"\"\" init the hybrid parallel process group. In most cases, only one hybrid parallel process group is \n            initialized in a distributed program, so this is a singleton design.\n        \n            args:\n                parallel_degree(list of tuple): Each parallel strategy consists of a tuple.\n                E.g. [('dp', None), ('pp', 2), ('mp', 2)], means that the data parallel degree is obtained by \n                calculation, the pipeline parallel degree is 2, and the model parallel degree is 2. For data \n                parallelism, it is special. 
It is assumed that data parallelism has always been in the outermost \n                dimension. If it is not set, the data parallelism degree will be automatically calculated.\n                \n                When multiple distributed strategies fully overlap, this can be represented by setting multiple \n                parallel names in a tuple. For example, [('dp', None), ('mp', 'bp', 2)]. Default is [('dp', None)]\n                \n                custom_parallel_degree(list of tuple): Higher-order usages can be used when the automatically \n                derived parallel strategy fails to meet user needs. The user can calculate the rank id in the \n                communication group and pass it in through the `custom_parallel_degree` arg. Default is None.\n                E.g. [('dp', [[0, 2, 4, 6], [1, 3, 5, 7]]), ('mp', 'bp', [[0, 1], [2, 3], [4, 5], [6, 7]])]\n                \n            note:\n                `parallel_degree` and `custom_parallel_degree` are mutually exclusive, only one can be set at \n                the same time.\n                \n            example 1:\n                # 8 gpus on single node, dp will be 2\n                # dp_group_ranks = [[0, 4], [1, 5], [2, 6], [3, 7]]\n                # pp_group_ranks = [[0, 2], [1, 3], [4, 6], [5, 7]]\n                # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]]\n                scg = SingletonCommunicationGroup()\n                scg.init_process_group(parallel_degree=[('dp', None), ('pp', 2), ('mp', 2)])\n                print(scg.dp_group)\n                print(scg.get_rank_in_bp_group())\n                print(scg.get_dp_world_size())\n                \n            example 2:\n                # 8 gpus on single node, dp will be 2\n                # dp_group_ranks = [[0, 4], [1, 5], [2, 6], [3, 7]]\n                # pp_group_ranks = [[0, 2], [1, 3], [4, 6], [5, 7]]\n                # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]]\n                scg = 
SingletonCommunicationGroup()\n                scg.init_process_group(parallel_degree=[('pp', 2), ('mp', 2)])\n                \n            example 3:\n                # 8 gpus on single node, dp will be 4, mp and bp share a communication group.\n                # dp_group_ranks = [[0, 2, 4, 6], [1, 3, 5, 7]]\n                # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]]\n                # bp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]]\n                scg = SingletonCommunicationGroup()\n                scg.init_process_group(parallel_degree=[('dp', None), ('mp', 'bp', 2)])\n                \n            example 4:\n                # 8 gpus on single node, dp will be 8, mp will be 8, dp and mp share a communication group.\n                # dp_group_ranks = [[0, 1, 2, 3, 4, 5, 6, 7]]\n                # mp_group_ranks = [[0, 1, 2, 3, 4, 5, 6, 7]]\n                scg = SingletonCommunicationGroup()\n                scg.init_process_group(parallel_degree=[('dp', 'mp', 8)])\n                \n            example 5:\n                # Equal to example 3 but pass config by custom_parallel_degree.\n                # dp_group_ranks = [[0, 2, 4, 6], [1, 3, 5, 7]]\n                # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]]\n                # bp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]]\n                scg = SingletonCommunicationGroup()\n                scg.init_process_group(parallel_degree=None, custom_parallel_degree=[('dp', [[0, 2, 4, 6], [1, 3, 5, 7]]), ('mp', 'bp', [[0, 1], [2, 3], [4, 5], [6, 7]])])\n            \n        \"\"\"\n\n        assert not (parallel_degree is not None and custom_parallel_degree is not None), \\\n            f\"parallel_degree and custom_parallel_degree only can be set one.\"\n\n        assert self.initialized == False, \"Communication group is already initialized!\"\n\n        if dist.is_initialized() is not None:\n            dist.init_parallel_env()\n\n        world_size = dist.get_world_size()\n        rank = 
dist.get_rank()\n\n        # parse parallel_degree\n        if parallel_degree is not None and custom_parallel_degree is None:\n\n            def check_valid(inp):\n                assert isinstance(\n                    inp, list), f\"parallel_degree must be list of tuple\"\n                for item in inp:\n                    num_ele = len(item)\n                    assert num_ele >= 2, f\"each item in parallel_degree must has least two element.\"\n                    assert isinstance(item[-1], (\n                        int, type(None)\n                    )), f\"the last element in each item must be int or None\"\n                    for idx in range(num_ele - 1):\n                        assert isinstance(item[idx], str)\n\n            check_valid(parallel_degree)\n\n            dp_exist = False\n            dp_has_set = False\n            num_ranks = 1\n            for idx, item in enumerate(parallel_degree):\n                degree = item[-1]\n                if 'dp' in item:\n                    assert idx == 0, 'The data parallel dimension must be the outermost dimension.'\n                    dp_exist = True\n\n                    if degree is not None:\n                        dp_has_set = True\n                    else:\n                        degree = 1\n                assert degree is not None, 'All but dp must specify the parallel degree explicitly.'\n                num_ranks *= degree\n\n            # check and update dp\n            if not dp_exist:\n                assert world_size % num_ranks == 0, 'The total number of parallelism products set is not divisible by the total number of cards.'\n                parallel_degree.insert(0, ('dp', world_size // num_ranks))\n            elif dp_exist and not dp_has_set:\n                assert world_size % num_ranks == 0, 'The total number of parallelism products set is not divisible by the total number of cards.'\n                parallel_degree[0] = ('dp', world_size // num_ranks)\n            
else:\n                assert num_ranks == world_size, 'The total number of parallelism products set is not equal to the total number of cards.'\n\n            degrees = tuple([item[-1] for item in parallel_degree])\n            num_parallel = len(parallel_degree)\n            group_arr = np.arange(0, world_size).reshape(degrees)\n\n            custom_parallel_degree = []\n\n            for idx, item in enumerate(parallel_degree):\n                parallel_name = item[0]\n                degree = item[-1]\n                transpose_axes = []\n                for axis in range(num_parallel):\n                    if axis != idx:\n                        transpose_axes.append(axis)\n                transpose_axes.append(idx)\n                arr = group_arr.transpose(transpose_axes).reshape((-1, degree))\n\n                custom_parallel_degree.append([])\n\n                for parallel_name in item[:-1]:\n                    custom_parallel_degree[idx].append(parallel_name)\n                custom_parallel_degree[idx].append([])\n\n                for i in range(world_size // degree):\n                    ranks = arr[i].tolist()\n                    custom_parallel_degree[idx][-1].append(ranks)\n                custom_parallel_degree[idx] = tuple(custom_parallel_degree[\n                    idx])\n        else:\n            print(\n                \"We do not check the validity of user-defined custom_parallel_degree.\"\n            )\n\n        # new group and set attr\n        for item in custom_parallel_degree:\n            ranks_list = item[-1]\n            for i in range(len(ranks_list)):\n                ranks = ranks_list[i]\n                for parallel_name in item[:-1]:\n                    group = dist.new_group(ranks)\n                    print(f'> {parallel_name} ranks: {ranks}')\n                    if rank in ranks:\n                        setattr(self, f'{parallel_name}_group', group)\n\n                        def get_rank_in_group(parallel_name):\n 
                           def func():\n                                if not self.initialized:\n                                    return -1\n                                group = getattr(self, f'{parallel_name}_group')\n                                return group.get_group_rank(dist.get_rank())\n                            return func\n\n                        setattr(self, f'get_rank_in_{parallel_name}_group',\n                                get_rank_in_group(parallel_name))\n\n                        def get_group_world_size(parallel_name):\n                            def func():\n                                if not self.initialized:\n                                    return -1\n                                group = getattr(self, f'{parallel_name}_group')\n                                return group.nranks\n                            return func\n\n                        setattr(self, f'get_{parallel_name}_world_size',\n                                get_group_world_size(parallel_name))\n\n        self.initialized = True\n\nscg = SingletonCommunicationGroup()\n"
  },
  {
    "path": "ppfleetx/models/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport copy\n\nfrom ppfleetx.core.module.basic_module import BasicModule\nfrom ppfleetx.models.language_model.language_module import GPTModule, GPTGenerationModule, GPTEvalModule, GPTFinetuneModule\nfrom ppfleetx.models.language_model.gpt.auto.auto_module import GPTModuleAuto, GPTGenerationModuleAuto\nfrom ppfleetx.models.vision_model.general_classification_module import GeneralClsModule, GeneralClsModuleAuto\nfrom ppfleetx.models.vision_model.moco_module import MOCOModule, MOCOClsModule\nfrom ppfleetx.models.multimodal_model.multimodal_module import ImagenModule\nfrom ppfleetx.models.language_model.ernie import ErnieModule, ErnieSeqClsModule, ErnieModuleAuto, ErnieSeqClsModuleAuto\nfrom ppfleetx.models.language_model.language_module import MoEModule\n\nfrom ppfleetx.models.multimodal_model.multimodal_module import ImagenModule\n\n\ndef build_module(config):\n    module_name = config.Model.get(\"module\", \"BasicModule\")\n    module = eval(module_name)(config)\n\n    return module\n"
  },
  {
    "path": "ppfleetx/models/language_model/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/language_model/auto_utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport numpy as np\nimport paddle.distributed as dist\nimport paddle.distributed.auto_parallel as auto\n\nfrom functools import reduce\n\n\ndef process_mesh_config(config):\n    class Mesh:\n        def __init__(self, config):\n            self.dp_dim = None\n            self.mp_dim = None\n            self.process_mesh = None\n            self.config = config\n\n            topology = list(\n                filter(lambda x: x > 1, [\n                    self.config['pp_degree'], self.config['dp_degree'],\n                    self.config['mp_degree']\n                ]))\n            num_proc = 1 if not topology else reduce(lambda x, y: x * y,\n                                                     topology)\n            processes = [i for i in range(num_proc)]\n\n            if self.config['pp_degree'] > 1:\n                if len(topology) > 1:\n                    # dpmppp, dppp, mppp\n                    if len(topology) > 2:\n                        # dpmppp\n                        self.process_mesh = auto.ProcessMesh(\n                            np.array(processes).reshape(topology),\n                            dim_names=['pp', 'dp', 'mp'])\n                        self.dp_dim = 'dp'\n                        self.mp_dim = 'mp'\n                    elif self.config['dp_degree'] > 1:\n                  
      # dppp\n                        self.process_mesh = auto.ProcessMesh(\n                            np.array(processes).reshape(topology),\n                            dim_names=['pp', 'dp'])\n                        self.dp_dim = 'dp'\n                    elif self.config['mp_degree'] > 1:\n                        # mppp\n                        self.process_mesh = auto.ProcessMesh(\n                            np.array(processes).reshape(topology),\n                            dim_names=['pp', 'mp'])\n                        self.mp_dim = 'mp'\n                elif len(topology) == 1:\n                    # pp\n                    self.process_mesh = auto.ProcessMesh(\n                        processes, dim_names=['pp'])\n            else:\n                if len(topology) > 1:\n                    # dpmp\n                    self.process_mesh = auto.ProcessMesh(\n                        np.array(processes).reshape(topology),\n                        dim_names=['dp', 'mp'])\n                    self.dp_dim = 'dp'\n                    self.mp_dim = 'mp'\n                elif self.config['dp_degree'] > 1:\n                    # dp\n                    self.process_mesh = auto.ProcessMesh(\n                        processes, dim_names=['dp'])\n                    self.dp_dim = 'dp'\n                elif self.config['mp_degree'] > 1:\n                    # mp\n                    self.process_mesh = auto.ProcessMesh(\n                        processes, dim_names=['mp'])\n                    self.mp_dim = 'mp'\n                else:\n                    # serial\n                    self.process_mesh = auto.ProcessMesh(processes)\n\n        def __getitem__(self, idx):\n\n            if 'pp' in self.process_mesh.dim_names:\n                return self.process_mesh[idx]\n\n            return self.process_mesh\n\n        def stages(self, num_layers):\n            layer_per_stage = num_layers // self.config['pp_degree']\n            return [i // layer_per_stage for i 
in range(num_layers)]\n\n        @property\n        def dp(self):\n            return self.dp_dim\n\n        @property\n        def mp(self):\n            return self.mp_dim\n\n    return Mesh(config)\n\n\ndef process_model_configs(config):\n    \"\"\"\n    process model configs for auto parallel\n    \"\"\"\n    cfg_model = config['Model']\n    mesh = process_mesh_config(config['Distributed'])\n    cfg_model.update({'mesh': mesh})\n    if cfg_model['ffn_hidden_size'] is None:\n        cfg_model['ffn_hidden_size'] = 4 * cfg_model['hidden_size']\n\n    if cfg_model['use_recompute']:\n        if not cfg_model.get('recompute_granularity', None):\n            cfg_model['recompute_granularity'] = 'full'\n\n\ndef process_data_configs(config):\n    \"\"\"\n    process data configs for auto parallel\n    \"\"\"\n    cfg_global = config['Global']\n    cfg_data = config['Data']\n\n    mode_to_num_samples = {\n        \"Train\":\n        cfg_global['global_batch_size'] * config['Engine']['max_steps'],\n        \"Eval\": cfg_global['global_batch_size'] *\n        (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) *\n        config['Engine']['eval_iters'],\n        \"Test\":\n        cfg_global['global_batch_size'] * config['Engine']['test_iters'],\n    }\n\n    for mode in (\"Train\", \"Eval\", \"Test\"):\n        if mode in cfg_data.keys():\n            cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[\n                mode]\n            cfg_data[mode]['dataset']['mode'] = mode\n            cfg_data[mode]['dataset']['seed'] = cfg_global['seed']\n\n\ndef process_configs(config):\n\n    process_model_configs(config)\n    process_data_configs(config)\n\n    return config\n"
  },
  {
    "path": "ppfleetx/models/language_model/debertav2/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .modeling import (get_debertav2_model, debertav2_encode_text,\n                       get_debertav2_encoded_dim)\nfrom ppfleetx.models.language_model.t5 import normal_, constant_init\n"
  },
  {
    "path": "ppfleetx/models/language_model/debertav2/modeling.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\" Paddle DeBERTa-v2 model.\"\"\"\n\nfrom collections.abc import Sequence\nfrom typing import Optional, Tuple, Union\nimport json\n\nimport paddle\nfrom paddle import nn\nfrom paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss\n\nfrom ppfleetx.models.language_model.t5 import (finfo, ACT2FN, ModelOutput,\n                                               normal_, constant_init)\nfrom ppfleetx.data.tokenizers.debertav2_tokenizer import debertav2_tokenize\n\nfrom dataclasses import dataclass\n\n\nclass BaseModelOutput(ModelOutput):\n    \"\"\"\n    Base class for model's outputs, with potential hidden states and attentions.\n\n    Args:\n        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):\n            Sequence of hidden-states at the output of the last layer of the model.\n        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n 
       attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n    \"\"\"\n\n    last_hidden_state = None\n    hidden_states = None\n    attentions = None\n\n\n# Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2\nclass XSoftmax(paddle.autograd.PyLayer):\n    \"\"\"\n    Masked Softmax which is optimized for saving memory\n\n    Args:\n        input (`paddle.tensor`): The input tensor that will apply softmax.\n        mask (`paddle.IntTensor`):\n            The mask matrix where 0 indicate that element will be ignored in the softmax calculation.\n        dim (int): The dimension that will apply softmax\n\n    Example:\n\n    ```python\n    >>> import paddle \n    >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax\n\n    >>> # Make a tensor\n    >>> x = paddle.randn([4, 20, 100])\n\n    >>> # Create a mask\n    >>> mask = (x > 0).int()\n\n    >>> # Specify the dimension to apply softmax\n    >>> dim = -1\n\n    >>> y = XSoftmax.apply(x, mask, dim)\n    ```\"\"\"\n\n    @staticmethod\n    def forward(self, input, mask, dim):\n        self.dim = dim\n        #rmask = ~(mask.cast('bool'))\n        #output = input.masked_fill(rmask, paddle.to_tensor(finfo(input.dtype).min))\n        mask = mask.cast('bool')\n        output = paddle.where(mask == 0,\n                              paddle.to_tensor(finfo(input.dtype).min), input)\n        output = paddle.nn.functional.softmax(\n            output, axis=self.dim, dtype=paddle.float32)\n        output = paddle.where(mask == 0, paddle.to_tensor(0.), output)\n        return output\n\n\n# Copied 
from transformers.models.deberta.modeling_deberta.DropoutContext\nclass DropoutContext(object):\n    def __init__(self):\n        self.dropout = 0\n        self.mask = None\n        self.scale = 1\n        self.reuse_mask = True\n\n\n# Copied from transformers.models.deberta.modeling_deberta.get_mask\ndef get_mask(input, local_context):\n    if not isinstance(local_context, DropoutContext):\n        dropout = local_context\n        mask = None\n    else:\n        dropout = local_context.dropout\n        dropout *= local_context.scale\n        mask = local_context.mask if local_context.reuse_mask else None\n\n    if dropout > 0 and mask is None:\n        mask = (1 - paddle.bernoulli(\n            paddle.full(\n                shape=input.shape, fill_value=1 - dropout))).cast(bool)\n\n    if isinstance(local_context, DropoutContext):\n        if local_context.mask is None:\n            local_context.mask = mask\n\n    return mask, dropout\n\n\n# Copied from transformers.models.deberta.modeling_deberta.XDropout\nclass XDropout(paddle.autograd.PyLayer):\n    \"\"\"Optimized dropout function to save computation and memory by using mask operation instead of multiplication.\"\"\"\n\n    @staticmethod\n    def forward(ctx, input, local_ctx):\n        mask, dropout = get_mask(input, local_ctx)\n        ctx.scale = 1.0 / (1 - dropout)\n        if dropout > 0:\n            output = paddle.where(mask == 1, 0, input)\n            return output * ctx.scale\n        else:\n            return input\n\n\n# Copied from transformers.models.deberta.modeling_deberta.StableDropout\nclass StableDropout(nn.Layer):\n    \"\"\"\n    Optimized dropout module for stabilizing the training\n\n    Args:\n        drop_prob (float): the dropout probabilities\n    \"\"\"\n\n    def __init__(self, drop_prob):\n        super().__init__()\n        self.drop_prob = drop_prob\n        self.count = 0\n        self.context_stack = None\n\n    def forward(self, x):\n        \"\"\"\n        Call the 
module\n\n        Args:\n            x (`paddle.to_tensor`): The input tensor to apply dropout\n        \"\"\"\n        if self.training and self.drop_prob > 0:\n            return XDropout.apply(x, self.get_context())\n        return x\n\n    def clear_context(self):\n        self.count = 0\n        self.context_stack = None\n\n    def init_context(self, reuse_mask=True, scale=1):\n        if self.context_stack is None:\n            self.context_stack = []\n        self.count = 0\n        for c in self.context_stack:\n            c.reuse_mask = reuse_mask\n            c.scale = scale\n\n    def get_context(self):\n        if self.context_stack is not None:\n            if self.count >= len(self.context_stack):\n                self.context_stack.append(DropoutContext())\n            ctx = self.context_stack[self.count]\n            ctx.dropout = self.drop_prob\n            self.count += 1\n            return ctx\n        else:\n            return self.drop_prob\n\n\n# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm\nclass DebertaV2SelfOutput(nn.Layer):\n    def __init__(self,\n                 hidden_size=1536,\n                 layer_norm_eps=1e-7,\n                 hidden_dropout_prob=0.1):\n        super().__init__()\n        self.dense = nn.Linear(hidden_size, hidden_size)\n        self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps)\n        self.dropout = StableDropout(hidden_dropout_prob)\n\n    def forward(self, hidden_states, input_tensor):\n        hidden_states = self.dense(hidden_states)\n        hidden_states = self.dropout(hidden_states)\n        hidden_states = self.LayerNorm(hidden_states + input_tensor)\n        return hidden_states\n\n\n# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2\nclass DebertaV2Attention(nn.Layer):\n    def __init__(\n            self,\n            hidden_size=512,\n            num_attention_heads=24,\n          
  attention_head_size=64,\n            share_att_key=True,\n            pos_att_type=None,\n            relative_attention=True,\n            position_buckets=-1,\n            max_relative_positions=-1,\n            max_position_embeddings=512,\n            layer_norm_eps=1e-7,\n            hidden_dropout_prob=0.1,\n            attention_probs_dropout_prob=0.1, ):\n        super().__init__()\n        self.self = DisentangledSelfAttention(\n            hidden_size=hidden_size,\n            num_attention_heads=num_attention_heads,\n            attention_head_size=attention_head_size,\n            share_att_key=share_att_key,\n            pos_att_type=pos_att_type,\n            relative_attention=relative_attention,\n            position_buckets=position_buckets,\n            max_relative_positions=max_relative_positions,\n            max_position_embeddings=max_position_embeddings,\n            hidden_dropout_prob=hidden_dropout_prob,\n            attention_probs_dropout_prob=attention_probs_dropout_prob, )\n        self.output = DebertaV2SelfOutput(\n            hidden_size=hidden_size,\n            layer_norm_eps=layer_norm_eps,\n            hidden_dropout_prob=hidden_dropout_prob)\n\n    def forward(\n            self,\n            hidden_states,\n            attention_mask,\n            output_attentions=False,\n            query_states=None,\n            relative_pos=None,\n            rel_embeddings=None, ):\n        self_output = self.self(\n            hidden_states,\n            attention_mask,\n            output_attentions,\n            query_states=query_states,\n            relative_pos=relative_pos,\n            rel_embeddings=rel_embeddings, )\n        if output_attentions:\n            self_output, att_matrix = self_output\n        if query_states is None:\n            query_states = hidden_states\n        attention_output = self.output(self_output, query_states)\n\n        if output_attentions:\n            return (attention_output, att_matrix)\n     
   else:\n            return attention_output\n\n\n# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2\nclass DebertaV2Intermediate(nn.Layer):\n    def __init__(\n            self,\n            hidden_size=1536,\n            hidden_act='gelu',\n            intermediate_size=6144, ):\n        super().__init__()\n        self.dense = nn.Linear(hidden_size, intermediate_size)\n        if isinstance(hidden_act, str):\n            self.intermediate_act_fn = ACT2FN[hidden_act]\n        else:\n            self.intermediate_act_fn = hidden_act\n\n    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:\n        hidden_states = self.dense(hidden_states)\n        hidden_states = self.intermediate_act_fn(hidden_states)\n        return hidden_states\n\n\n# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm\nclass DebertaV2Output(nn.Layer):\n    def __init__(\n            self,\n            hidden_size=512,\n            intermediate_size=6144,\n            layer_norm_eps=1e-7,\n            hidden_dropout_prob=0.1, ):\n        super().__init__()\n        self.dense = nn.Linear(intermediate_size, hidden_size)\n        self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps)\n        self.dropout = StableDropout(hidden_dropout_prob)\n\n    def forward(self, hidden_states, input_tensor):\n        hidden_states = self.dense(hidden_states)\n        hidden_states = self.dropout(hidden_states)\n        hidden_states = self.LayerNorm(hidden_states + input_tensor)\n        return hidden_states\n\n\n# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2\nclass DebertaV2Layer(nn.Layer):\n    def __init__(\n            self,\n            hidden_size=512,\n            hidden_act='gelu',\n            intermediate_size=6144,\n            num_attention_heads=24,\n            attention_head_size=64,\n            share_att_key=True,\n            
pos_att_type=None,\n            relative_attention=True,\n            position_buckets=256,\n            max_relative_positions=-1,\n            max_position_embeddings=512,\n            layer_norm_eps=1e-7,\n            hidden_dropout_prob=0.1,\n            attention_probs_dropout_prob=0.1, ):\n        super().__init__()\n        self.attention = DebertaV2Attention(\n            hidden_size=hidden_size,\n            num_attention_heads=num_attention_heads,\n            attention_head_size=attention_head_size,\n            share_att_key=share_att_key,\n            pos_att_type=pos_att_type,\n            relative_attention=relative_attention,\n            position_buckets=position_buckets,\n            max_relative_positions=max_relative_positions,\n            max_position_embeddings=max_position_embeddings,\n            layer_norm_eps=layer_norm_eps,\n            hidden_dropout_prob=hidden_dropout_prob,\n            attention_probs_dropout_prob=attention_probs_dropout_prob, )\n        self.intermediate = DebertaV2Intermediate(\n            hidden_size=hidden_size,\n            hidden_act=hidden_act,\n            intermediate_size=intermediate_size, )\n        self.output = DebertaV2Output(\n            hidden_size=hidden_size,\n            intermediate_size=intermediate_size,\n            layer_norm_eps=layer_norm_eps,\n            hidden_dropout_prob=hidden_dropout_prob, )\n\n    def forward(\n            self,\n            hidden_states,\n            attention_mask,\n            query_states=None,\n            relative_pos=None,\n            rel_embeddings=None,\n            output_attentions=False, ):\n        attention_output = self.attention(\n            hidden_states,\n            attention_mask,\n            output_attentions=output_attentions,\n            query_states=query_states,\n            relative_pos=relative_pos,\n            rel_embeddings=rel_embeddings, )\n        if output_attentions:\n            attention_output, att_matrix = 
attention_output\n        intermediate_output = self.intermediate(attention_output)\n        layer_output = self.output(intermediate_output, attention_output)\n        if output_attentions:\n            return (layer_output, att_matrix)\n        else:\n            return layer_output\n\n\nclass ConvLayer(nn.Layer):\n    def __init__(\n            self,\n            hidden_size=512,\n            conv_kernel_size=3,\n            conv_groups=1,\n            conv_act=\"tanh\",\n            layer_norm_eps=1e-7,\n            hidden_dropout_prob=0., ):\n        super().__init__()\n        kernel_size = conv_kernel_size\n        groups = conv_groups\n        self.conv_act = conv_act\n        self.conv = nn.Conv1D(\n            hidden_size,\n            hidden_size,\n            kernel_size,\n            padding=(kernel_size - 1) // 2,\n            groups=groups)\n        self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps)\n        self.dropout = StableDropout(hidden_dropout_prob)\n\n    def forward(self, hidden_states, residual_states, input_mask):\n        out = self.conv(hidden_states.transpose([0, 2, 1])).transpose(\n            [0, 2, 1])\n        out = paddle.where(\n            input_mask.cast('bool').unsqueeze(-1).expand(out.shape) == 0,\n            paddle.to_tensor(0.), out)\n        out = ACT2FN[self.conv_act](self.dropout(out))\n\n        layer_norm_input = residual_states + out\n        output = self.LayerNorm(layer_norm_input).cast(layer_norm_input.dtype)\n\n        if input_mask is None:\n            output_states = output\n        else:\n            if input_mask.dim() != layer_norm_input.dim():\n                if input_mask.dim() == 4:\n                    input_mask = input_mask.squeeze(1).squeeze(1)\n                input_mask = input_mask.unsqueeze(2)\n\n            input_mask = input_mask.cast(output.dtype)\n            output_states = output * input_mask\n\n        return output_states\n\n\nclass DebertaV2Encoder(nn.Layer):\n    \"\"\"Modified 
BertEncoder with relative position bias support\"\"\"\n\n    def __init__(\n            self,\n            num_hidden_layers=48,\n            num_attention_heads=24,\n            attention_head_size=64,\n            relative_attention=False,\n            max_relative_positions=-1,\n            max_position_embeddings=512,\n            position_buckets=256,\n            hidden_size=1536,\n            hidden_act='gelu',\n            conv_act='gelu',\n            intermediate_size=6144,\n            share_att_key=True,\n            pos_att_type=None,\n            norm_rel_ebd=None,\n            conv_kernel_size=0,\n            layer_norm_eps=1e-7,\n            hidden_dropout_prob=0.1,\n            attention_probs_dropout_prob=0.1, ):\n        super().__init__()\n\n        self.layer = nn.LayerList([\n            DebertaV2Layer(\n                hidden_size=hidden_size,\n                hidden_act=hidden_act,\n                intermediate_size=intermediate_size,\n                num_attention_heads=num_attention_heads,\n                attention_head_size=attention_head_size,\n                share_att_key=share_att_key,\n                pos_att_type=pos_att_type,\n                relative_attention=relative_attention,\n                position_buckets=position_buckets,\n                max_relative_positions=max_relative_positions,\n                max_position_embeddings=max_position_embeddings,\n                layer_norm_eps=layer_norm_eps,\n                hidden_dropout_prob=hidden_dropout_prob,\n                attention_probs_dropout_prob=attention_probs_dropout_prob)\n            for _ in range(num_hidden_layers)\n        ])\n        self.relative_attention = relative_attention\n\n        if self.relative_attention:\n            self.max_relative_positions = max_relative_positions\n            if self.max_relative_positions < 1:\n                self.max_relative_positions = max_position_embeddings\n\n            self.position_buckets = position_buckets\n      
      pos_ebd_size = self.max_relative_positions * 2\n\n            if self.position_buckets > 0:\n                pos_ebd_size = self.position_buckets * 2\n\n            self.rel_embeddings = nn.Embedding(pos_ebd_size, hidden_size)\n\n        self.norm_rel_ebd = [\n            x.strip() for x in norm_rel_ebd.lower().split(\"|\")\n        ]\n\n        if \"layer_norm\" in self.norm_rel_ebd:\n            self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps)\n\n        self.conv = ConvLayer(\n            hidden_size=hidden_size,\n            conv_kernel_size=conv_kernel_size,\n            conv_act=conv_act,\n            layer_norm_eps=layer_norm_eps,\n            hidden_dropout_prob=hidden_dropout_prob,\n        ) if conv_kernel_size > 0 else None\n        self.gradient_checkpointing = False\n\n    def get_rel_embedding(self):\n        rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None\n        if rel_embeddings is not None and (\"layer_norm\" in self.norm_rel_ebd):\n            rel_embeddings = self.LayerNorm(rel_embeddings)\n        return rel_embeddings\n\n    def get_attention_mask(self, attention_mask):\n        if attention_mask.dim() <= 2:\n            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)\n            attention_mask = extended_attention_mask * extended_attention_mask.squeeze(\n                -2).unsqueeze(-1)\n            attention_mask = attention_mask.cast(paddle.uint8)\n        elif attention_mask.dim() == 3:\n            attention_mask = attention_mask.unsqueeze(1)\n\n        return attention_mask\n\n    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):\n        if self.relative_attention and relative_pos is None:\n            q = query_states.shape[\n                -2] if query_states is not None else hidden_states.shape[-2]\n            relative_pos = build_relative_position(\n                q,\n                hidden_states.shape[-2],\n                
bucket_size=self.position_buckets,\n                max_position=self.max_relative_positions)\n        return relative_pos\n\n    def forward(\n            self,\n            hidden_states,\n            attention_mask,\n            output_hidden_states=True,\n            output_attentions=False,\n            query_states=None,\n            relative_pos=None,\n            return_dict=True, ):\n        if attention_mask.dim() <= 2:\n            input_mask = attention_mask\n        else:\n            input_mask = (attention_mask.sum(-2) > 0).cast(paddle.uint8)\n        attention_mask = self.get_attention_mask(attention_mask)\n        relative_pos = self.get_rel_pos(hidden_states, query_states,\n                                        relative_pos)\n\n        all_hidden_states = () if output_hidden_states else None\n        all_attentions = () if output_attentions else None\n\n        if isinstance(hidden_states, Sequence):\n            next_kv = hidden_states[0]\n        else:\n            next_kv = hidden_states\n        rel_embeddings = self.get_rel_embedding()\n        output_states = next_kv\n        for i, layer_module in enumerate(self.layer):\n\n            if output_hidden_states:\n                all_hidden_states = all_hidden_states + (output_states, )\n\n            if self.gradient_checkpointing and self.training:\n\n                def create_custom_forward(module):\n                    def custom_forward(*inputs):\n                        return module(*inputs, output_attentions)\n\n                    return custom_forward\n\n                output_states = paddle.utils.checkpoint.checkpoint(\n                    create_custom_forward(layer_module),\n                    next_kv,\n                    attention_mask,\n                    query_states,\n                    relative_pos,\n                    rel_embeddings, )\n            else:\n                output_states = layer_module(\n                    next_kv,\n                    
attention_mask,\n                    query_states=query_states,\n                    relative_pos=relative_pos,\n                    rel_embeddings=rel_embeddings,\n                    output_attentions=output_attentions, )\n\n            if output_attentions:\n                output_states, att_m = output_states\n\n            if i == 0 and self.conv is not None:\n                output_states = self.conv(hidden_states, output_states,\n                                          input_mask)\n\n            if query_states is not None:\n                query_states = output_states\n                if isinstance(hidden_states, Sequence):\n                    next_kv = hidden_states[i + 1] if i + 1 < len(\n                        self.layer) else None\n            else:\n                next_kv = output_states\n\n            if output_attentions:\n                all_attentions = all_attentions + (att_m, )\n\n        if output_hidden_states:\n            all_hidden_states = all_hidden_states + (output_states, )\n\n        if not return_dict:\n            return tuple(\n                v for v in [output_states, all_hidden_states, all_attentions]\n                if v is not None)\n        return BaseModelOutput(\n            last_hidden_state=output_states,\n            hidden_states=all_hidden_states,\n            attentions=all_attentions)\n\n\ndef make_log_bucket_position(relative_pos, bucket_size, max_position):\n    sign = paddle.sign(relative_pos.cast('float32'))\n    mid = bucket_size // 2\n    abs_pos = paddle.where(\n        (relative_pos < mid) & (relative_pos > -mid),\n        paddle.to_tensor(mid - 1).astype(relative_pos.dtype),\n        paddle.abs(relative_pos), )\n    log_pos = (paddle.ceil(\n        paddle.log(abs_pos / mid) /\n        paddle.log(paddle.to_tensor((max_position - 1) / mid)) *\n        (mid - 1)) + mid)\n    bucket_pos = paddle.where(abs_pos <= mid,\n                              relative_pos.cast(log_pos.dtype), log_pos * sign)\n    return 
bucket_pos\n\n\ndef build_relative_position(query_size,\n                            key_size,\n                            bucket_size=-1,\n                            max_position=-1):\n    \"\"\"\n    Build relative position according to the query and key\n\n    We assume the absolute position of query \\\\(P_q\\\\) is range from (0, query_size) and the absolute position of key\n    \\\\(P_k\\\\) is range from (0, key_size), The relative positions from query to key is \\\\(R_{q \\\\rightarrow k} = P_q -\n    P_k\\\\)\n\n    Args:\n        query_size (int): the length of query\n        key_size (int): the length of key\n        bucket_size (int): the size of position bucket\n        max_position (int): the maximum allowed absolute position\n\n    Return:\n        `paddle.LongTensor`: A tensor with shape [1, query_size, key_size]\n\n    \"\"\"\n    q_ids = paddle.arange(0, query_size)\n    k_ids = paddle.arange(0, key_size)\n    rel_pos_ids = q_ids[:, None] - k_ids[None, :]\n    if bucket_size > 0 and max_position > 0:\n        rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size,\n                                               max_position)\n    rel_pos_ids = rel_pos_ids.cast(paddle.int64)\n    rel_pos_ids = rel_pos_ids[:query_size, :]\n    rel_pos_ids = rel_pos_ids.unsqueeze(0)\n    return rel_pos_ids\n\n\n# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand\ndef c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):\n    return c2p_pos.expand([\n        query_layer.shape[1], query_layer.shape[1], query_layer.shape[2],\n        relative_pos.shape[-1]\n    ])\n\n\n# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand\ndef p2c_dynamic_expand(c2p_pos, query_layer, key_layer):\n    return c2p_pos.expand([\n        query_layer.shape[0], query_layer.shape[1], key_layer.shape[-2],\n        key_layer.shape[-2]\n    ])\n\n\n# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand\ndef 
pos_dynamic_expand(pos_index, p2c_att, key_layer):\n    return pos_index.expand(\n        tuple(p2c_att.shape[:2]) + (pos_index.shape[-2], key_layer.shape[-2])\n    )\n\n\nclass DisentangledSelfAttention(nn.Layer):\n    \"\"\"\n    Disentangled self-attention module\n\n    Parameters:\n\n    \"\"\"\n\n    def __init__(\n            self,\n            hidden_size=1536,\n            num_attention_heads=24,\n            attention_head_size=None,\n            share_att_key=False,\n            pos_att_type=None,\n            relative_attention=False,\n            position_buckets=-1,\n            max_relative_positions=-1,\n            max_position_embeddings=512,\n            hidden_dropout_prob=0.,\n            attention_probs_dropout_prob=0., ):\n        super().__init__()\n        if hidden_size % num_attention_heads != 0:\n            raise ValueError(\n                f\"The hidden size ({hidden_size}) is not a multiple of the number of attention \"\n                f\"heads ({num_attention_heads})\")\n        self.num_attention_heads = num_attention_heads\n        _attention_head_size = hidden_size // num_attention_heads\n        self.attention_head_size = attention_head_size if attention_head_size is not None else _attention_head_size\n        self.all_head_size = self.num_attention_heads * self.attention_head_size\n        self.query_proj = nn.Linear(hidden_size, self.all_head_size)\n        self.key_proj = nn.Linear(hidden_size, self.all_head_size)\n        self.value_proj = nn.Linear(hidden_size, self.all_head_size)\n\n        self.share_att_key = share_att_key\n        self.pos_att_type = pos_att_type if pos_att_type is not None else []\n        self.relative_attention = relative_attention\n\n        if self.relative_attention:\n            self.position_buckets = position_buckets\n            self.max_relative_positions = max_relative_positions\n            if self.max_relative_positions < 1:\n                self.max_relative_positions = 
max_position_embeddings\n            self.pos_ebd_size = self.max_relative_positions\n            if self.position_buckets > 0:\n                self.pos_ebd_size = self.position_buckets\n\n            self.pos_dropout = StableDropout(hidden_dropout_prob)\n\n            if not self.share_att_key:\n                if \"c2p\" in self.pos_att_type:\n                    self.pos_key_proj = nn.Linear(\n                        hidden_size, self.all_head_size, bias=True)\n                if \"p2c\" in self.pos_att_type:\n                    self.pos_query_proj = nn.Linear(hidden_size,\n                                                    self.all_head_size)\n\n        self.dropout = StableDropout(attention_probs_dropout_prob)\n\n    def transpose_for_scores(self, x, attention_heads):\n        new_x_shape = tuple(x.shape[:-1]) + (attention_heads, -1)\n        x = x.reshape(new_x_shape)\n        return x.transpose([0, 2, 1, 3]).reshape([-1, x.shape[1], x.shape[-1]])\n\n    def forward(\n            self,\n            hidden_states,\n            attention_mask,\n            output_attentions=False,\n            query_states=None,\n            relative_pos=None,\n            rel_embeddings=None, ):\n        \"\"\"\n        Call the module\n\n        Args:\n            hidden_states (`paddle.FloatTensor`):\n                Input states to the module usually the output from previous layer, it will be the Q,K and V in\n                *Attention(Q,K,V)*\n\n            attention_mask (`paddle.uint8`):\n                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum\n                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*\n                th token.\n\n            output_attentions (`bool`, optional):\n                Whether return the attention matrix.\n\n            query_states (`paddle.FloatTensor`, optional):\n                The *Q* state in *Attention(Q,K,V)*.\n\n 
           relative_pos (`paddle.LongTensor`):\n                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with\n                values ranging in [*-max_relative_positions*, *max_relative_positions*].\n\n            rel_embeddings (`paddle.FloatTensor`):\n                The embedding of relative distances. It's a tensor of shape [\\\\(2 \\\\times\n                \\\\text{max_relative_positions}\\\\), *hidden_size*].\n\n\n        \"\"\"\n        if query_states is None:\n            query_states = hidden_states\n        query_layer = self.transpose_for_scores(\n            self.query_proj(query_states), self.num_attention_heads)\n        key_layer = self.transpose_for_scores(\n            self.key_proj(hidden_states), self.num_attention_heads)\n        value_layer = self.transpose_for_scores(\n            self.value_proj(hidden_states), self.num_attention_heads)\n\n        rel_att = None\n        # Take the dot product between \"query\" and \"key\" to get the raw attention scores.\n        scale_factor = 1\n        if \"c2p\" in self.pos_att_type:\n            scale_factor += 1\n        if \"p2c\" in self.pos_att_type:\n            scale_factor += 1\n        scale = paddle.sqrt(\n            paddle.to_tensor(\n                query_layer.shape[-1], dtype='float32') * scale_factor)\n        attention_scores = paddle.bmm(\n            query_layer, key_layer.transpose(\n                [0, 2, 1])) / scale.cast(query_layer.dtype)\n        if self.relative_attention:\n            rel_embeddings = self.pos_dropout(rel_embeddings)\n            rel_att = self.disentangled_attention_bias(\n                query_layer, key_layer, relative_pos, rel_embeddings,\n                scale_factor)\n\n        if rel_att is not None:\n            attention_scores = attention_scores + rel_att\n        attention_scores = attention_scores\n        attention_scores = attention_scores.reshape([\n            -1, 
self.num_attention_heads, attention_scores.shape[-2],\n            attention_scores.shape[-1]\n        ])\n\n        # bsz x height x length x dimension\n        attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)\n        attention_probs = self.dropout(attention_probs)\n        context_layer = paddle.bmm(\n            attention_probs.reshape(\n                [-1, attention_probs.shape[-2], attention_probs.shape[-1]]),\n            value_layer)\n        context_layer = (context_layer.reshape([\n            -1, self.num_attention_heads, context_layer.shape[-2],\n            context_layer.shape[-1]\n        ]).transpose([0, 2, 1, 3]))\n        new_context_layer_shape = tuple(context_layer.shape[:-2]) + (-1, )\n        context_layer = context_layer.reshape(new_context_layer_shape)\n        if output_attentions:\n            return (context_layer, attention_probs)\n        else:\n            return context_layer\n\n    def disentangled_attention_bias(self, query_layer, key_layer, relative_pos,\n                                    rel_embeddings, scale_factor):\n        if relative_pos is None:\n            q = query_layer.shape[-2]\n            relative_pos = build_relative_position(\n                q,\n                key_layer.shape[-2],\n                bucket_size=self.position_buckets,\n                max_position=self.max_relative_positions)\n        if relative_pos.dim() == 2:\n            relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)\n        elif relative_pos.dim() == 3:\n            relative_pos = relative_pos.unsqueeze(1)\n        # bsz x height x query x key\n        elif relative_pos.dim() != 4:\n            raise ValueError(\n                f\"Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}\"\n            )\n\n        att_span = self.pos_ebd_size\n        relative_pos = relative_pos.cast(paddle.int64)\n\n        rel_embeddings = rel_embeddings[0:att_span * 2, :].unsqueeze(0)\n        if self.share_att_key:\n            pos_query_layer = paddle.tile(\n                self.transpose_for_scores(\n                    self.query_proj(rel_embeddings), self.num_attention_heads),\n                repeat_times=[\n                    query_layer.shape[0] // self.num_attention_heads, 1, 1\n                ])\n            pos_key_layer = paddle.tile(\n                self.transpose_for_scores(\n                    self.key_proj(rel_embeddings), self.num_attention_heads),\n                repeat_times=[\n                    query_layer.shape[0] // self.num_attention_heads, 1, 1\n                ])\n        else:\n            if \"c2p\" in self.pos_att_type:\n                pos_key_layer = paddle.tile(\n                    self.transpose_for_scores(\n                        self.pos_key_proj(rel_embeddings),\n                        self.num_attention_heads),\n                    repeat_times=[\n                        query_layer.shape[0] // self.num_attention_heads, 1, 1\n                    ])  # .split(self.all_head_size, dim=-1)\n            if \"p2c\" in self.pos_att_type:\n                pos_query_layer = paddle.tile(\n                    self.transpose_for_scores(\n                        self.pos_query_proj(rel_embeddings),\n                        self.num_attention_heads),\n                    repeat_times=[\n                        query_layer.shape[0] // self.num_attention_heads, 1, 1\n                    ])  # .split(self.all_head_size, dim=-1)\n\n        score = 0\n        # content->position\n        if \"c2p\" in self.pos_att_type:\n            scale = paddle.sqrt(\n                paddle.to_tensor(\n                    pos_key_layer.shape[-1], dtype='float32') * scale_factor)\n            c2p_att = paddle.bmm(query_layer,\n  
                               pos_key_layer.transpose([0, 2, 1]))\n            c2p_pos = paddle.clip(relative_pos + att_span, 0, att_span * 2 - 1)\n            c2p_att = paddle.take_along_axis(\n                c2p_att,\n                axis=-1,\n                indices=c2p_pos.squeeze(0).expand([\n                    query_layer.shape[0], query_layer.shape[1],\n                    relative_pos.shape[-1]\n                ]), )\n            score += c2p_att / scale.cast(dtype=c2p_att.dtype)\n\n        # position->content\n        if \"p2c\" in self.pos_att_type:\n            scale = paddle.sqrt(\n                paddle.to_tensor(\n                    pos_query_layer.shape[-1], dtype='float32') * scale_factor)\n            if key_layer.shape[-2] != query_layer.shape[-2]:\n                r_pos = build_relative_position(\n                    key_layer.shape[-2],\n                    key_layer.shape[-2],\n                    bucket_size=self.position_buckets,\n                    max_position=self.max_relative_positions, )\n                r_pos = r_pos.unsqueeze(0)\n            else:\n                r_pos = relative_pos\n\n            p2c_pos = paddle.clip(-r_pos + att_span, 0, att_span * 2 - 1)\n            p2c_att = paddle.bmm(key_layer,\n                                 pos_query_layer.transpose([0, 2, 1]))\n            p2c_att = paddle.take_along_axis(\n                p2c_att,\n                axis=-1,\n                indices=p2c_pos.squeeze(0).expand([\n                    query_layer.shape[0], key_layer.shape[-2],\n                    key_layer.shape[-2]\n                ]), ).transpose([0, 2, 1])\n            score += p2c_att / scale.cast(dtype=p2c_att.dtype)\n\n        return score\n\n\n# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm\nclass DebertaV2Embeddings(nn.Layer):\n    \"\"\"Construct the embeddings from word, position and token_type embeddings.\"\"\"\n\n    def __init__(\n            
self,\n            max_position_embeddings=512,\n            position_biased_input=False,\n            pad_token_id=0,\n            hidden_size=1536,\n            hidden_dropout_prob=0.1,\n            embedding_size=None,\n            vocab_size=128100,\n            type_vocab_size=0,\n            layer_norm_eps=1e-7, ):\n        super().__init__()\n        self.embedding_size = hidden_size if embedding_size is None else embedding_size\n        self.word_embeddings = nn.Embedding(\n            vocab_size, self.embedding_size, padding_idx=pad_token_id)\n        self.type_vocab_size = type_vocab_size\n        self.hidden_size = hidden_size\n\n        self.position_biased_input = position_biased_input\n        if not self.position_biased_input:\n            self.position_embeddings = None\n        else:\n            self.position_embeddings = nn.Embedding(max_position_embeddings,\n                                                    self.embedding_size)\n\n        if type_vocab_size > 0:\n            self.token_type_embeddings = nn.Embedding(type_vocab_size,\n                                                      self.embedding_size)\n\n        if self.embedding_size != hidden_size:\n            self.embed_proj = nn.Linear(self.embedding_size, hidden_size)\n        self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps)\n        self.dropout = StableDropout(hidden_dropout_prob)\n\n        # position_ids (1, len position emb) is contiguous in memory and exported when serialized\n        self.register_buffer(\"position_ids\",\n                             paddle.arange(max_position_embeddings).expand(\n                                 (1, -1)))\n\n    def forward(self,\n                input_ids=None,\n                token_type_ids=None,\n                position_ids=None,\n                mask=None,\n                inputs_embeds=None):\n        if input_ids is not None:\n            input_shape = input_ids.shape\n        else:\n            input_shape = 
inputs_embeds.shape[:-1]\n\n        seq_length = input_shape[1]\n\n        if position_ids is None:\n            position_ids = self.position_ids[:, :seq_length]\n\n        if token_type_ids is None:\n            token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64)\n\n        if inputs_embeds is None:\n            inputs_embeds = self.word_embeddings(input_ids)\n\n        if self.position_embeddings is not None:\n            position_embeddings = self.position_embeddings(\n                position_ids.cast(paddle.int64))\n        else:\n            position_embeddings = paddle.zeros_like(inputs_embeds)\n\n        embeddings = inputs_embeds\n        if self.position_biased_input:\n            embeddings += position_embeddings\n        if self.type_vocab_size > 0:\n            token_type_embeddings = self.token_type_embeddings(token_type_ids)\n            embeddings += token_type_embeddings\n\n        if self.embedding_size != self.hidden_size:\n            embeddings = self.embed_proj(embeddings)\n\n        embeddings = self.LayerNorm(embeddings)\n\n        if mask is not None:\n            if mask.dim() != embeddings.dim():\n                if mask.dim() == 4:\n                    mask = mask.squeeze(1).squeeze(1)\n                mask = mask.unsqueeze(2)\n            mask = mask.cast('float32')\n\n            embeddings = embeddings * mask\n\n        embeddings = self.dropout(embeddings)\n        return embeddings\n\n\n# Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2\nclass DebertaV2PreTrainedModel(nn.Layer):\n    \"\"\"\n    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained\n    models.\n    \"\"\"\n\n    base_model_prefix = \"deberta\"\n    _keys_to_ignore_on_load_missing = [\"position_ids\"]\n    _keys_to_ignore_on_load_unexpected = [\"position_embeddings\"]\n    supports_gradient_checkpointing = True\n\n    def _init_weights(self, 
module):\n        \"\"\"Initialize the weights.\"\"\"\n        if isinstance(module, nn.Linear):\n            # Slightly different from the TF version which uses truncated_normal for initialization\n            # cf https://github.com/pytorch/pytorch/pull/5617\n            normal_(module.weight, mean=0.0, std=0.02)\n            if module.bias is not None:\n                constant_init(module.bias, 0.)\n        elif isinstance(module, nn.Embedding):\n            normal_(module.weight, mean=0.0, std=0.02)\n            if module.padding_idx is not None:\n                constant_init(module.weight.data[module.padding_idx], 0.)\n\n    def _set_gradient_checkpointing(self, module, value=False):\n        if isinstance(module, DebertaV2Encoder):\n            module.gradient_checkpointing = value\n\n\nDEBERTA_START_DOCSTRING = r\"\"\"\n    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled\n    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build\n    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. 
With those two\n    improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB pretraining data.\n\n    This model is also a Paddle [paddle.nn.Layer](https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/nn/Layer_en.html) subclass.\n    Use it as a regular Paddle Layer and refer to the Paddle documentation for all matters related to general usage\n    and behavior.\n\n\n    Parameters:\n\"\"\"\n\n\n# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2\nclass DebertaV2Model(DebertaV2PreTrainedModel):\n    def __init__(self,\n                 _name_or_path=\"cache/deberta-v-xxlarge\",\n                 attention_head_size=64,\n                 attention_probs_dropout_prob=0.1,\n                 conv_act=\"gelu\",\n                 conv_kernel_size=3,\n                 hidden_act=\"gelu\",\n                 hidden_dropout_prob=0.1,\n                 hidden_size=1536,\n                 initializer_range=0.02,\n                 intermediate_size=6144,\n                 layer_norm_eps=1e-07,\n                 max_position_embeddings=512,\n                 max_relative_positions=-1,\n                 model_type=\"deberta-v2\",\n                 norm_rel_ebd=\"layer_norm\",\n                 num_attention_heads=24,\n                 num_hidden_layers=48,\n                 pad_token_id=0,\n                 pooler_dropout=0,\n                 pooler_hidden_act=\"gelu\",\n                 pooler_hidden_size=1536,\n                 pos_att_type=[\"p2c\", \"c2p\"],\n                 position_biased_input=False,\n                 position_buckets=256,\n                 relative_attention=True,\n                 share_att_key=True,\n                 type_vocab_size=0,\n                 vocab_size=128100,\n                 output_attentions=False,\n                 output_hidden_states=False,\n                 use_return_dict=True):\n        super().__init__()\n\n        self.embeddings = DebertaV2Embeddings(\n            
max_position_embeddings=max_position_embeddings,\n            position_biased_input=position_biased_input,\n            pad_token_id=pad_token_id,\n            hidden_size=hidden_size,\n            hidden_dropout_prob=hidden_dropout_prob,\n            vocab_size=vocab_size,\n            type_vocab_size=type_vocab_size,\n            layer_norm_eps=layer_norm_eps)\n        self.encoder = DebertaV2Encoder(\n            num_hidden_layers=num_hidden_layers,\n            num_attention_heads=num_attention_heads,\n            attention_head_size=attention_head_size,\n            relative_attention=relative_attention,\n            max_relative_positions=max_relative_positions,\n            max_position_embeddings=max_position_embeddings,\n            position_buckets=position_buckets,\n            hidden_size=hidden_size,\n            norm_rel_ebd=norm_rel_ebd,\n            conv_kernel_size=conv_kernel_size,\n            hidden_act=hidden_act,\n            conv_act=conv_act,\n            intermediate_size=intermediate_size,\n            share_att_key=share_att_key,\n            pos_att_type=pos_att_type,\n            layer_norm_eps=layer_norm_eps,\n            hidden_dropout_prob=hidden_dropout_prob,\n            attention_probs_dropout_prob=attention_probs_dropout_prob, )\n        self.z_steps = 0\n        self.output_attentions = output_attentions\n        self.output_hidden_states = output_hidden_states\n        self.use_return_dict = use_return_dict\n\n    def get_input_embeddings(self):\n        return self.embeddings.word_embeddings\n\n    def set_input_embeddings(self, new_embeddings):\n        self.embeddings.word_embeddings = new_embeddings\n\n    def _prune_heads(self, heads_to_prune):\n        \"\"\"\n        Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base\n        class PreTrainedModel\n        \"\"\"\n        raise NotImplementedError(\n            \"The prune function is not implemented in DeBERTa model.\")\n\n    def forward(\n            self,\n            input_ids: Optional[paddle.Tensor]=None,\n            attention_mask: Optional[paddle.Tensor]=None,\n            token_type_ids: Optional[paddle.Tensor]=None,\n            position_ids: Optional[paddle.Tensor]=None,\n            inputs_embeds: Optional[paddle.Tensor]=None,\n            output_attentions: Optional[bool]=None,\n            output_hidden_states: Optional[bool]=None,\n            return_dict: Optional[bool]=None, ) -> Union[Tuple,\n                                                         BaseModelOutput]:\n        output_attentions = output_attentions if output_attentions is not None else self.output_attentions\n        output_hidden_states = (output_hidden_states\n                                if output_hidden_states is not None else\n                                self.output_hidden_states)\n        return_dict = return_dict if return_dict is not None else self.use_return_dict\n\n        if input_ids is not None and inputs_embeds is not None:\n            raise ValueError(\n                \"You cannot specify both input_ids and inputs_embeds at the same time\"\n            )\n        elif input_ids is not None:\n            input_shape = input_ids.shape\n        elif inputs_embeds is not None:\n            input_shape = inputs_embeds.shape[:-1]\n        else:\n            raise ValueError(\n                \"You have to specify either input_ids or inputs_embeds\")\n\n        if attention_mask is None:\n            attention_mask = paddle.ones(input_shape)\n        if token_type_ids is None:\n            token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64)\n\n        embedding_output = self.embeddings(\n            input_ids=input_ids,\n            
token_type_ids=token_type_ids,\n            position_ids=position_ids,\n            mask=attention_mask,\n            inputs_embeds=inputs_embeds, )\n\n        encoder_outputs = self.encoder(\n            embedding_output,\n            attention_mask,\n            output_hidden_states=True,\n            output_attentions=output_attentions,\n            return_dict=return_dict, )\n        encoded_layers = encoder_outputs[1]\n\n        if self.z_steps > 1:\n            hidden_states = encoded_layers[-2]\n            layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]\n            query_states = encoded_layers[-1]\n            rel_embeddings = self.encoder.get_rel_embedding()\n            attention_mask = self.encoder.get_attention_mask(attention_mask)\n            rel_pos = self.encoder.get_rel_pos(embedding_output)\n            for layer in layers[1:]:\n                query_states = layer(\n                    hidden_states,\n                    attention_mask,\n                    output_attentions=False,\n                    query_states=query_states,\n                    relative_pos=rel_pos,\n                    rel_embeddings=rel_embeddings, )\n                encoded_layers.append(query_states)\n\n        sequence_output = encoded_layers[-1]\n\n        if not return_dict:\n            return (sequence_output,\n                    ) + encoder_outputs[(1 if output_hidden_states else 2):]\n\n        return BaseModelOutput(\n            last_hidden_state=sequence_output,\n            hidden_states=encoder_outputs.hidden_states\n            if output_hidden_states else None,\n            attentions=encoder_outputs.attentions, )\n\n\ndef get_debertav2_model(name, pretrained=True):\n    if name is None:\n        return None\n    model = DebertaV2Model(\n        _name_or_path=name,\n        attention_head_size=64,\n        attention_probs_dropout_prob=0.1,\n        conv_act=\"gelu\",\n        conv_kernel_size=3,\n        hidden_act=\"gelu\",\n        
hidden_dropout_prob=0.1,\n        hidden_size=1536,\n        initializer_range=0.02,\n        intermediate_size=6144,\n        layer_norm_eps=1e-07,\n        max_position_embeddings=512,\n        max_relative_positions=-1,\n        model_type=\"deberta-v2\",\n        norm_rel_ebd=\"layer_norm\",\n        num_attention_heads=24,\n        num_hidden_layers=48,\n        pad_token_id=0,\n        pooler_dropout=0,\n        pooler_hidden_act=\"gelu\",\n        pooler_hidden_size=1536,\n        pos_att_type=[\"p2c\", \"c2p\"],\n        position_biased_input=False,\n        position_buckets=256,\n        relative_attention=True,\n        share_att_key=True,\n        type_vocab_size=0,\n        vocab_size=128100,\n        output_attentions=False,\n        output_hidden_states=False,\n        use_return_dict=True, )\n    if pretrained:\n        checkpoint = paddle.load(name + '/debertav2.pd', return_numpy=True)\n        model.set_state_dict(checkpoint['model'])\n    model.eval()\n    for p in model.parameters():\n        p.stop_gradient = True\n\n    return model\n\n\ndef dict_from_json_file(name):\n    with open(name + '/config.json', \"r\", encoding=\"utf-8\") as reader:\n        text = reader.read()\n        config_dict = json.loads(text)\n        return config_dict\n\n\ndef debertav2_encode_text(debertav2, texts, tokenizer, return_attn_mask=False):\n    token_ids, attn_mask = debertav2_tokenize(texts, tokenizer)\n    debertav2.eval()\n    with paddle.no_grad():\n        output = debertav2(input_ids=token_ids, attention_mask=attn_mask)\n        encoded_text = output.last_hidden_state.detach()\n    attn_mask = attn_mask.cast(bool)\n    encoded_text = paddle.where(attn_mask[:, :, None] == 0,\n                                paddle.to_tensor(0.), encoded_text)\n\n    if return_attn_mask:\n        return encoded_text, attn_mask\n\n    return encoded_text\n\n\ndef get_debertav2_encoded_dim(name):\n    return dict_from_json_file(name)['hidden_size']\n\n\nif __name__ == 
'__main__':\n    model = get_debertav2_model(\n        name='/dbq/codes/CL/paddle-imagen/cache/deberta-v-xxlarge',\n        pretrained=False)\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .ernie_module import ErnieModule, ErnieSeqClsModule\nfrom .auto.auto_module import ErnieModuleAuto, ErnieSeqClsModuleAuto\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/auto/__init__.py",
    "content": ""
  },
  {
    "path": "ppfleetx/models/language_model/ernie/auto/auto_model.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport io\nimport copy\nimport logging\nimport json\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.distributed.auto_parallel as auto\n\nfrom paddle.nn import functional as F\nfrom paddle.nn.initializer.lazy_init import _lazy_init_helper\nfrom dataclasses import dataclass, field\n\nfrom ..layers.model_outputs import (\n    BaseModelOutputWithPoolingAndCrossAttentions,\n    ModelOutput,\n    ErnieForPreTrainingOutput,\n    SequenceClassifierOutput, )\nfrom .auto_transformer import TransformerEncoderLayer, TransformerEncoder\n\n\nclass Embedding(nn.Layer):\n    def __init__(\n            self,\n            num_embeddings,\n            embedding_dim,\n            padding_idx=None,\n            sparse=False,\n            weight_attr=None,\n            name=None, ):\n        super().__init__()\n        self._num_embeddings = num_embeddings\n        self._embedding_dim = embedding_dim\n        self._sparse = sparse\n        self._is_distributed = False\n        self._padding_idx = padding_idx\n\n        if self._num_embeddings <= 0:\n            raise ValueError(\"num_embeddings must be gather than 0\")\n\n        if self._embedding_dim <= 0:\n            raise ValueError(\"embedding_dim must be gather than 0\")\n\n        padding_idx = (-1 if padding_idx is None else padding_idx\n                       if padding_idx 
>= 0 else (num_embeddings + padding_idx))\n\n        if padding_idx >= num_embeddings or padding_idx < -num_embeddings:\n            raise ValueError(\"padding_idx must be within [-{}, {})\".format(\n                num_embeddings, num_embeddings))\n\n        self._dtype = self._helper.get_default_dtype()\n        self._size = [self._num_embeddings, self._embedding_dim]\n\n        self._weight_attr = weight_attr\n        self._remote_prefetch = False\n        self._name = name\n        self.weight = self.create_parameter(\n            attr=self._weight_attr,\n            shape=self._size,\n            dtype=self._dtype,\n            is_bias=False, )\n\n        if paddle.in_dynamic_mode(\n        ) and padding_idx != -1 and not _lazy_init_helper.state:\n            with paddle.no_grad():\n                self.weight[padding_idx] = 0.0\n\n    def forward(self, x):\n        return F.embedding(\n            x,\n            weight=self.weight,\n            padding_idx=self._padding_idx,\n            sparse=self._sparse,\n            name=self._name, )\n\n    def extra_repr(self):\n        main_str = '{_num_embeddings}, {_embedding_dim}'\n        if self._padding_idx is not None:\n            main_str += ', padding_idx={_padding_idx}'\n        main_str += ', sparse={_sparse}'\n        if self._name is not None:\n            main_str += ', name={_name}'\n        return main_str.format(**self.__dict__)\n\n\nclass ErnieEmbeddings(nn.Layer):\n    r\"\"\"\n    Include embeddings from word, position and token_type embeddings.\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 hidden_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=2,\n                 pad_token_id=0,\n                 weight_attr=None,\n                 task_type_vocab_size=3,\n                 task_id=0,\n                 use_task_id=False,\n                 mesh=None):\n        
super(ErnieEmbeddings, self).__init__()\n        self.mesh = mesh\n\n        self.word_embeddings = Embedding(\n            vocab_size,\n            hidden_size,\n            padding_idx=pad_token_id,\n            weight_attr=weight_attr)\n        self.position_embeddings = nn.Embedding(\n            max_position_embeddings, hidden_size, weight_attr=weight_attr)\n        self.type_vocab_size = type_vocab_size\n        if self.type_vocab_size > 0:\n            self.token_type_embeddings = nn.Embedding(\n                type_vocab_size, hidden_size, weight_attr=weight_attr)\n        self.use_task_id = use_task_id\n        self.task_id = task_id\n        if self.use_task_id:\n            self.task_type_embeddings = nn.Embedding(\n                task_type_vocab_size, hidden_size, weight_attr=weight_attr)\n        self.layer_norm = nn.LayerNorm(hidden_size)\n        self.dropout = nn.Dropout(hidden_dropout_prob)\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                task_type_ids=None,\n                inputs_embeds=None,\n                past_key_values_length=None):\n        if input_ids is not None:\n            auto.shard_tensor(self.word_embeddings.weight, self.mesh[0],\n                              [self.mesh.mp, None])\n            input_shape = paddle.shape(input_ids)\n            input_embeddings = self.word_embeddings(input_ids)\n\n        else:\n            input_shape = paddle.shape(inputs_embeds)[:-1]\n            input_embeddings = inputs_embeds\n\n        if position_ids is None:\n            # maybe need use shape op to unify static graph and dynamic graph\n            #seq_length = input_ids.shape[1]\n            ones = paddle.ones(input_shape, dtype=\"int64\")\n            seq_length = paddle.cumsum(ones, axis=1)\n            position_ids = seq_length - ones\n            if past_key_values_length is not None:\n                position_ids += 
past_key_values_length\n            position_ids.stop_gradient = True\n\n        position_embeddings = self.position_embeddings(position_ids)\n        embeddings = input_embeddings + position_embeddings\n\n        if self.type_vocab_size > 0:\n            if token_type_ids is None:\n                token_type_ids = paddle.zeros(input_shape, dtype=\"int64\")\n            token_type_embeddings = self.token_type_embeddings(token_type_ids)\n\n            embeddings = embeddings + token_type_embeddings\n\n        if self.use_task_id:\n            if task_type_ids is None:\n                task_type_ids = paddle.ones(\n                    input_shape, dtype=\"int64\") * self.task_id\n            task_type_embeddings = self.task_type_embeddings(task_type_ids)\n            embeddings = embeddings + task_type_embeddings\n        embeddings = self.layer_norm(embeddings)\n        embeddings = self.dropout(embeddings)\n        return embeddings\n\n\nclass ErniePooler(nn.Layer):\n    def __init__(self, hidden_size, weight_attr=None):\n        super(ErniePooler, self).__init__()\n        self.dense = nn.Linear(\n            hidden_size, hidden_size, weight_attr=weight_attr)\n        self.activation = nn.Tanh()\n\n    def forward(self, hidden_states):\n        # We \"pool\" the model by simply taking the hidden state corresponding\n        # to the first token.\n        first_token_tensor = hidden_states[:, 0]\n        pooled_output = self.dense(first_token_tensor)\n        pooled_output = self.activation(pooled_output)\n        return pooled_output\n\n\nclass ErnieModelAuto(nn.Layer):\n    r\"\"\"\n    The bare ERNIE Model transformer outputting raw hidden-states.\n\n    This model is a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation\n    /docs/en/api/paddle/fluid/dygraph/layers/Layer_en.html>`__ subclass. 
Use it as a regular Paddle Layer\n    and refer to the Paddle documentation for all matter related to general usage and behavior.\n\n    Args:\n        vocab_size (int):\n            Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix.\n            Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`.\n        hidden_size (int, optional):\n            Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`.\n        num_hidden_layers (int, optional):\n            Number of hidden layers in the Transformer encoder. Defaults to `12`.\n        num_attention_heads (int, optional):\n            Number of attention heads for each attention layer in the Transformer encoder.\n            Defaults to `12`.\n        intermediate_size (int, optional):\n            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors\n            to ff layers are firstly projected from `hidden_size` to `intermediate_size`,\n            and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`.\n            Defaults to `3072`.\n        hidden_act (str, optional):\n            The non-linear activation function in the feed-forward layer.\n            ``\"gelu\"``, ``\"relu\"`` and any other paddle supported activation functions\n            are supported. 
Defaults to `\"gelu\"`.\n        hidden_dropout_prob (float, optional):\n            The dropout probability for all fully connected layers in the embeddings and encoder.\n            Defaults to `0.1`.\n        attention_probs_dropout_prob (float, optional):\n            The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target.\n            Defaults to `0.1`.\n        max_position_embeddings (int, optional):\n            The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input\n            sequence. Defaults to `512`.\n        type_vocab_size (int, optional):\n            The vocabulary size of the `token_type_ids`.\n            Defaults to `2`.\n        initializer_range (float, optional):\n            The standard deviation of the normal initializer for initializing all weight matrices.\n            Defaults to `0.02`.\n            \n            .. note::\n                A normal_initializer initializes weight matrices as normal distributions.\n                See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`.\n\n        pad_token_id(int, optional):\n            The index of padding token in the token vocabulary.\n            Defaults to `0`.\n\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 num_hidden_layers=12,\n                 num_attention_heads=12,\n                 intermediate_size=3072,\n                 hidden_act=\"gelu\",\n                 hidden_dropout_prob=0.1,\n                 attention_probs_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=2,\n                 initializer_range=0.02,\n                 pad_token_id=0,\n                 task_type_vocab_size=3,\n                 task_id=0,\n                 use_task_id=False,\n                 use_recompute=False,\n          
       mesh=None):\n        super(ErnieModelAuto, self).__init__()\n        self.pad_token_id = pad_token_id\n        self.initializer_range = initializer_range\n\n        self.hidden_size = hidden_size\n        self.vocab_size = vocab_size\n        self.hidden_act = hidden_act\n        self.hidden_dropout_prob = hidden_dropout_prob\n\n        weight_attr = paddle.ParamAttr(\n            initializer=nn.initializer.TruncatedNormal(\n                mean=0.0, std=self.initializer_range))\n        self.embeddings = ErnieEmbeddings(\n            vocab_size, hidden_size, hidden_dropout_prob,\n            max_position_embeddings, type_vocab_size, pad_token_id,\n            weight_attr, task_type_vocab_size, task_id, use_task_id, mesh)\n\n        encoder_layer = TransformerEncoderLayer(\n            hidden_size,\n            num_attention_heads,\n            intermediate_size,\n            dropout=hidden_dropout_prob,\n            activation=hidden_act,\n            attn_dropout=attention_probs_dropout_prob,\n            act_dropout=0,\n            weight_attr=weight_attr,\n            normalize_before=False,\n            mesh=mesh,\n            mesh_idx=0)\n        self.encoder = TransformerEncoder(\n            encoder_layer,\n            num_hidden_layers,\n            enable_recompute=use_recompute,\n            mesh=mesh)\n\n        self.pooler = ErniePooler(hidden_size, weight_attr)\n        self.apply(self.init_weights)\n\n    def get_input_embeddings(self):\n        return self.embeddings.word_embeddings\n\n    def set_input_embeddings(self, value):\n        self.embeddings.word_embeddings = value\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                attention_mask=None,\n                task_type_ids=None,\n                past_key_values=None,\n                inputs_embeds=None,\n                use_cache=None,\n                output_hidden_states=False,\n                
output_attentions=False,\n                return_dict=False):\n        r\"\"\"\n        Args:\n            input_ids (Tensor):\n                Indices of input sequence tokens in the vocabulary. They are\n                numerical representations of tokens that build the input sequence.\n                It's data type should be `int64` and has a shape of [batch_size, sequence_length].\n            token_type_ids (Tensor, optional):\n                Segment token indices to indicate different portions of the inputs.\n                Selected in the range ``[0, type_vocab_size - 1]``.\n                If `type_vocab_size` is 2, which means the inputs have two portions.\n                Indices can either be 0 or 1:\n\n                - 0 corresponds to a *sentence A* token,\n                - 1 corresponds to a *sentence B* token.\n\n                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].\n                Defaults to `None`, which means we don't add segment embeddings.\n            position_ids (Tensor, optional):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,\n                max_position_embeddings - 1]``.\n                Shape as `[batch_size, num_tokens]` and dtype as int64. 
Defaults to `None`.\n            attention_mask (Tensor, optional):\n                Mask used in multi-head attention to avoid performing attention on to some unwanted positions,\n                usually the paddings or the subsequent positions.\n                Its data type can be int, float and bool.\n                When the data type is bool, the `masked` tokens have `False` values and the others have `True` values.\n                When the data type is int, the `masked` tokens have `0` values and the others have `1` values.\n                When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values.\n                It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`.\n                For example, its shape can be  [batch_size, sequence_length], [batch_size, sequence_length, sequence_length],\n                [batch_size, num_attention_heads, sequence_length, sequence_length].\n                We use whole-word-mask in ERNIE, so the whole word will have the same value. 
For example, \"使用\" as a word,\n                \"使\" and \"用\" will have the same value.\n                Defaults to `None`, which means nothing needed to be prevented attention to.\n             inputs_embeds (Tensor, optional):\n                If you want to control how to convert `inputs_ids` indices into associated vectors, you can\n                pass an embedded representation directly instead of passing `inputs_ids`.\n            past_key_values (tuple(tuple(Tensor)), optional):\n                The length of tuple equals to the number of layers, and each inner\n                tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`)\n                which contains precomputed key and value hidden states of the attention blocks.\n                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that\n                don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all\n                `input_ids` of shape `(batch_size, sequence_length)`.\n            use_cache (`bool`, optional):\n                If set to `True`, `past_key_values` key value states are returned.\n                Defaults to `None`.\n            output_hidden_states (bool, optional):\n                Whether to return the hidden states of all layers.\n                Defaults to `False`.\n            output_attentions (bool, optional):\n                Whether to return the attentions tensors of all attention layers.\n                Defaults to `False`.\n            return_dict (bool, optional):\n                Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ModelOutput` object. \n                If `False`, the output will be a tuple of tensors. 
Defaults to `False`.\n\n        Returns:\n            An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if\n            `return_dict=True`. Otherwise it returns a tuple of tensors corresponding\n            to ordered and not None (depending on the input arguments) fields of\n            :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`.\n\n        \"\"\"\n        if input_ids is not None and inputs_embeds is not None:\n            raise ValueError(\n                \"You cannot specify both input_ids and inputs_embeds at the same time.\"\n            )\n        elif input_ids is not None:\n            input_shape = paddle.shape(input_ids)\n        elif inputs_embeds is not None:\n            input_shape = paddle.shape(inputs_embeds)[:-1]\n        else:\n            raise ValueError(\n                \"You have to specify either input_ids or inputs_embeds\")\n\n        past_key_values_length = None\n        if past_key_values is not None:\n            past_key_values_length = past_key_values[0][0].shape[2]\n\n        if attention_mask is None:\n            attention_mask = paddle.unsqueeze(\n                (input_ids == self.pad_token_id\n                 ).astype(self.pooler.dense.weight.dtype) * -1e4,\n                axis=[1, 2])\n            if past_key_values is not None:\n                batch_size = past_key_values[0][0].shape[0]\n                past_mask = paddle.zeros(\n                    [batch_size, 1, 1, past_key_values_length],\n                    dtype=attention_mask.dtype)\n                attention_mask = paddle.concat(\n                    [past_mask, attention_mask], axis=-1)\n\n        # For 2D attention_mask from tokenizer\n        elif attention_mask.ndim == 2:\n            attention_mask = paddle.unsqueeze(\n                attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())\n            
attention_mask = (1.0 - attention_mask) * -1e4\n        attention_mask.stop_gradient = True\n\n        embedding_output = self.embeddings(\n            input_ids=input_ids,\n            position_ids=position_ids,\n            token_type_ids=token_type_ids,\n            task_type_ids=task_type_ids,\n            inputs_embeds=inputs_embeds,\n            past_key_values_length=past_key_values_length)\n\n        self.encoder._use_cache = use_cache  # To be consistent with HF\n        encoder_outputs = self.encoder(\n            embedding_output,\n            src_mask=attention_mask,\n            cache=past_key_values,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict)\n        if isinstance(encoder_outputs, type(embedding_output)):\n            sequence_output = encoder_outputs\n            pooled_output = self.pooler(sequence_output)\n            return (sequence_output, pooled_output)\n        else:\n            sequence_output = encoder_outputs[0]\n            pooled_output = self.pooler(sequence_output)\n            if not return_dict:\n                return (sequence_output, pooled_output) + encoder_outputs[1:]\n            return BaseModelOutputWithPoolingAndCrossAttentions(\n                last_hidden_state=sequence_output,\n                pooler_output=pooled_output,\n                past_key_values=encoder_outputs.past_key_values,\n                hidden_states=encoder_outputs.hidden_states,\n                attentions=encoder_outputs.attentions)\n\n    def init_weights(self, layer):\n        \"\"\" Initialization hook \"\"\"\n        if isinstance(layer, (nn.Linear, nn.Embedding)):\n            # only support dygraph, use truncated_normal and make it inplace\n            # and configurable later\n            if isinstance(layer.weight, paddle.Tensor):\n                layer.weight.set_value(\n                    paddle.tensor.normal(\n                        
mean=0.0,\n                        std=self.initializer_range\n                        if hasattr(self, \"initializer_range\") else\n                        self.ernie.initializer_range,\n                        shape=layer.weight.shape))\n        elif isinstance(layer, nn.LayerNorm):\n            layer._epsilon = 1e-12\n\n\nclass ErnieLMPredictionHead(nn.Layer):\n    r\"\"\"\n    Ernie Model with a `language modeling` head on top.\n    \"\"\"\n\n    def __init__(\n            self,\n            hidden_size,\n            vocab_size,\n            activation,\n            embedding_weights=None,\n            weight_attr=None, ):\n        super(ErnieLMPredictionHead, self).__init__()\n\n        self.transform = nn.Linear(\n            hidden_size, hidden_size, weight_attr=weight_attr)\n        self.activation = getattr(nn.functional, activation)\n        self.layer_norm = nn.LayerNorm(hidden_size)\n        self.decoder_weight = self.create_parameter(\n            shape=[vocab_size, hidden_size],\n            dtype=self.transform.weight.dtype,\n            attr=weight_attr,\n            is_bias=False)\n        # if embedding_weights is None else embedding_weights\n        self.decoder_bias = self.create_parameter(\n            shape=[self.decoder_weight.shape[0]],\n            dtype=self.decoder_weight.dtype,\n            is_bias=True)\n\n    def forward(self, hidden_states, masked_positions=None):\n        if masked_positions is not None:\n            hidden_states = paddle.reshape(hidden_states,\n                                           [-1, hidden_states.shape[-1]])\n            hidden_states = paddle.tensor.gather(hidden_states,\n                                                 masked_positions)\n        # gather masked tokens might be more quick\n        hidden_states = self.transform(hidden_states)\n        hidden_states = self.activation(hidden_states)\n        hidden_states = self.layer_norm(hidden_states)\n        # hidden_states = 
parallel_matmul(hidden_states, self.decoder_weight, True) + self.decoder_bias\n\n        hidden_states = paddle.matmul(\n            hidden_states, self.decoder_weight,\n            transpose_y=True) + self.decoder_bias\n\n        return hidden_states\n\n\nclass ErniePretrainingHeads(nn.Layer):\n    def __init__(\n            self,\n            hidden_size,\n            vocab_size,\n            activation,\n            embedding_weights=None,\n            weight_attr=None, ):\n        super(ErniePretrainingHeads, self).__init__()\n        self.predictions = ErnieLMPredictionHead(hidden_size, vocab_size,\n                                                 activation, embedding_weights,\n                                                 weight_attr)\n        self.seq_relationship = nn.Linear(\n            hidden_size, 2, weight_attr=weight_attr)\n\n    def forward(self, sequence_output, pooled_output, masked_positions=None):\n        prediction_scores = self.predictions(sequence_output, masked_positions)\n        seq_relationship_score = self.seq_relationship(pooled_output)\n        return prediction_scores, seq_relationship_score\n\n\nclass ErnieForPretrainingAuto(nn.Layer):\n    r\"\"\"\n    Ernie Model with a `masked language modeling` head and a `sentence order prediction` head\n    on top.\n\n    \"\"\"\n\n    def __init__(self, ernie):\n        super(ErnieForPretrainingAuto, self).__init__()\n        self.ernie = ernie\n        weight_attr = paddle.ParamAttr(\n            initializer=nn.initializer.TruncatedNormal(\n                mean=0.0, std=self.ernie.initializer_range))\n        self.cls = ErniePretrainingHeads(\n            self.ernie.hidden_size,\n            self.ernie.vocab_size,\n            self.ernie.hidden_act,\n            embedding_weights=self.ernie.embeddings.word_embeddings.weight,\n            weight_attr=weight_attr, )\n\n        self.apply(self.init_weights)\n\n    def forward(self,\n                input_ids,\n                
token_type_ids=None,\n                attention_mask=None,\n                masked_positions=None,\n                position_ids=None,\n                inputs_embeds=None,\n                labels=None,\n                next_sentence_label=None,\n                output_hidden_states=False,\n                output_attentions=False,\n                return_dict=False):\n        r\"\"\"\n        Args:\n            input_ids (Tensor):\n                See :class:`ErnieModel`.\n            token_type_ids (Tensor, optional):\n                See :class:`ErnieModel`.\n            position_ids (Tensor, optional):\n                See :class:`ErnieModel`.\n            attention_mask (Tensor, optional):\n                See :class:`ErnieModel`.\n            inputs_embeds(Tensor, optional):\n                See :class:`ErnieModel`.\n            labels (Tensor of shape `(batch_size, sequence_length)`, optional):\n                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,\n                vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),\n                the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`.\n            next_sentence_label (Tensor of shape `(batch_size,)`, optional):\n                Labels for computing the next sequence prediction (classification) loss. 
Input should be a sequence\n                pair (see `input_ids` docstring) Indices should be in `[0, 1]`:\n\n                - 0 indicates sequence B is a continuation of sequence A,\n                - 1 indicates sequence B is a random sequence.\n            output_hidden_states (bool, optional):\n                Whether to return the hidden states of all layers.\n                Defaults to `False`.\n            output_attentions (bool, optional):\n                Whether to return the attentions tensors of all attention layers.\n                Defaults to `False`.\n            return_dict (bool, optional):\n                Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` object. If\n                `False`, the output will be a tuple of tensors. Defaults to `False`.\n\n        Returns:\n            An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` if `return_dict=True`.\n            Otherwise it returns a tuple of tensors corresponding to ordered and\n            not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput`.\n\n        \"\"\"\n        # with paddle.static.amp.fp16_guard():\n        outputs = self.ernie(\n            input_ids,\n            token_type_ids=token_type_ids,\n            position_ids=position_ids,\n            attention_mask=attention_mask,\n            inputs_embeds=inputs_embeds,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict)\n        sequence_output, pooled_output = outputs[:2]\n        prediction_scores, seq_relationship_score = self.cls(\n            sequence_output, pooled_output, masked_positions)\n\n        total_loss = None\n        if labels is not None and next_sentence_label is not None:\n            loss_fct = 
paddle.nn.CrossEntropyLoss()\n            masked_lm_loss = loss_fct(\n                prediction_scores.reshape(\n                    (-1, paddle.shape(prediction_scores)[-1])),\n                labels.reshape((-1, )))\n            next_sentence_loss = loss_fct(\n                seq_relationship_score.reshape((-1, 2)),\n                next_sentence_label.reshape((-1, )))\n            total_loss = masked_lm_loss + next_sentence_loss\n\n        if not return_dict:\n            output = (prediction_scores, seq_relationship_score) + outputs[2:]\n            return (\n                (total_loss, ) + output) if total_loss is not None else output\n\n        return ErnieForPreTrainingOutput(\n            loss=total_loss,\n            prediction_logits=prediction_scores,\n            seq_relationship_logits=seq_relationship_score,\n            hidden_states=outputs.hidden_states,\n            attentions=outputs.attentions, )\n\n    def init_weights(self, layer):\n        \"\"\" Initialization hook \"\"\"\n        if isinstance(layer, (nn.Linear, nn.Embedding)):\n            # only support dygraph, use truncated_normal and make it inplace\n            # and configurable later\n            if isinstance(layer.weight, paddle.Tensor):\n                layer.weight.set_value(\n                    paddle.tensor.normal(\n                        mean=0.0,\n                        std=self.initializer_range\n                        if hasattr(self, \"initializer_range\") else\n                        self.ernie.initializer_range,\n                        shape=layer.weight.shape))\n        elif isinstance(layer, nn.LayerNorm):\n            layer._epsilon = 1e-12\n\n\nclass ErniePretrainingCriterionAuto(paddle.nn.Layer):\n    r\"\"\"\n    The loss output of Ernie Model during the pretraining:\n    a `masked language modeling` head and a `next sentence prediction (classification)` head.\n\n    \"\"\"\n\n    def __init__(self, with_nsp_loss=True):\n        
super(ErniePretrainingCriterionAuto, self).__init__()\n        self.with_nsp_loss = with_nsp_loss\n\n    def forward(self,\n                prediction_scores,\n                seq_relationship_score,\n                masked_lm_labels,\n                next_sentence_labels=None):\n        \"\"\"\n        Args:\n            prediction_scores(Tensor):\n                The scores of masked token prediction. Its data type should be float32.\n                If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size].\n                Otherwise, its shape is [batch_size, mask_token_num, vocab_size]\n            seq_relationship_score(Tensor):\n                The scores of next sentence prediction. Its data type should be float32 and\n                its shape is [batch_size, 2]\n            masked_lm_labels(Tensor):\n                The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`.\n                Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1].\n                Otherwise, its shape is [batch_size, mask_token_num, 1]\n            next_sentence_labels(Tensor):\n                The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels`\n                is equal to `seq_relation_labels`. 
Its data type should be int64 and\n                its shape is [batch_size, 1]\n\n        Returns:\n            Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`.\n            Its data type should be float32 and its shape is [1].\n\n        \"\"\"\n\n        # with paddle.static.amp.fp16_guard():\n        masked_lm_loss = F.cross_entropy(\n            prediction_scores,\n            masked_lm_labels,\n            ignore_index=-1,\n            reduction='none')\n\n        if not self.with_nsp_loss:\n            return paddle.mean(masked_lm_loss)\n\n        next_sentence_loss = F.cross_entropy(\n            seq_relationship_score, next_sentence_labels, reduction='none')\n        loss = paddle.mean(masked_lm_loss) + paddle.mean(next_sentence_loss)\n        return loss\n\n\nclass ErnieForSequenceClassificationAuto(nn.Layer):\n    \"\"\"\n    Ernie Model with a linear layer on top of the output layer,\n    designed for sequence classification/regression tasks like GLUE tasks.\n\n    Args:\n        ernie (:class:`ErnieModel`):\n            An instance of ErnieModel.\n        num_classes (int, optional):\n            The number of classes. Defaults to `2`.\n        dropout (float, optional):\n            The dropout probability for output of ERNIE.\n            If None, use the same value as `hidden_dropout_prob` of `ErnieModel`\n            instance `ernie`. 
Defaults to None.\n    \"\"\"\n\n    def __init__(self, ernie, num_classes=2, dropout=None):\n        super(ErnieForSequenceClassificationAuto, self).__init__()\n        self.num_classes = num_classes\n        self.ernie = ernie  # allow ernie to be config\n        self.dropout = nn.Dropout(dropout if dropout is not None else\n                                  self.ernie.hidden_dropout_prob)\n        self.classifier = nn.Linear(self.ernie.hidden_size, num_classes)\n        self.apply(self.init_weights)\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                attention_mask=None,\n                labels=None,\n                output_hidden_states=False,\n                output_attentions=False,\n                return_dict=False):\n        r\"\"\"\n        The ErnieForSequenceClassification forward method, overrides the __call__() special method.\n\n        Args:\n            input_ids (Tensor):\n                See :class:`ErnieModelAuto`.\n            token_type_ids (Tensor, optional):\n                See :class:`ErnieModelAuto`.\n            position_ids(Tensor, optional):\n                See :class:`ErnieModelAuto`.\n            attention_mask (Tensor, optional):\n                See :class:`ErnieModelAuto`.\n            labels (Tensor of shape `(batch_size,)`, optional):\n                Labels for computing the sequence classification/regression loss.\n                Indices should be in `[0, ..., num_classes - 1]`. 
If `num_classes == 1`\n                a regression loss is computed (Mean-Square loss), If `num_classes > 1`\n                a classification loss is computed (Cross-Entropy).\n            output_hidden_states (bool, optional):\n                Whether to return the hidden states of all layers.\n                Defaults to `False`.\n            output_attentions (bool, optional):\n                Whether to return the attentions tensors of all attention layers.\n                Defaults to `False`.\n            return_dict (bool, optional):\n                Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` object. If\n                `False`, the output will be a tuple of tensors. Defaults to `False`.\n\n        Returns:\n            An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` if `return_dict=True`.\n            Otherwise it returns a tuple of tensors corresponding to ordered and\n            not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput`.\n\n        \"\"\"\n\n        outputs = self.ernie(\n            input_ids,\n            token_type_ids=token_type_ids,\n            position_ids=position_ids,\n            attention_mask=attention_mask,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict)\n        pooled_output = outputs[1]\n\n        pooled_output = self.dropout(pooled_output)\n        logits = self.classifier(pooled_output)\n\n        loss = None\n        if labels is not None:\n            if self.num_classes == 1:\n                loss_fct = paddle.nn.MSELoss()\n                loss = loss_fct(logits, labels)\n            elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32:\n                loss_fct = paddle.nn.CrossEntropyLoss()\n 
               loss = loss_fct(\n                    logits.reshape((-1, self.num_classes)),\n                    labels.reshape((-1, )))\n            else:\n                loss_fct = paddle.nn.BCEWithLogitsLoss()\n                loss = loss_fct(logits, labels)\n\n        if not return_dict:\n            output = (logits, ) + outputs[2:]\n            return ((loss, ) + output) if loss is not None else (\n                output[0] if len(output) == 1 else output)\n\n        return SequenceClassifierOutput(\n            loss=loss,\n            logits=logits,\n            hidden_states=outputs.hidden_states,\n            attentions=outputs.attentions, )\n\n    def init_weights(self, layer):\n        \"\"\" Initialization hook \"\"\"\n        if isinstance(layer, (nn.Linear, nn.Embedding)):\n            if isinstance(layer.weight, paddle.Tensor):\n                layer.weight.set_value(\n                    paddle.tensor.normal(\n                        mean=0.0,\n                        std=self.initializer_range\n                        if hasattr(self, \"initializer_range\") else\n                        self.ernie.initializer_range,\n                        shape=layer.weight.shape))\n        elif isinstance(layer, nn.LayerNorm):\n            layer._epsilon = 1e-12\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/auto/auto_module.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport copy\n\nimport paddle\nfrom paddle import LazyGuard\nfrom ppfleetx.core.module.basic_module import BasicModule\nfrom ppfleetx.utils.log import logger\n\nfrom .auto_model import (\n    ErnieModelAuto,\n    ErnieForPretrainingAuto,\n    ErniePretrainingCriterionAuto,\n    ErnieForSequenceClassificationAuto, )\n\nfrom ppfleetx.models.language_model.auto_utils import process_configs, process_mesh_config\n\nimport numpy as np\n\n\ndef process_data_configs(config):\n    \"\"\"\n    process data configs for hybrid parallel\n    \"\"\"\n    cfg_global = config['Global']\n    cfg_data = config['Data']\n\n    mode_to_num_samples = {\n        \"Train\":\n        cfg_global['global_batch_size'] * config['Engine']['max_steps'],\n        \"Eval\": cfg_global['global_batch_size'] *\n        (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) *\n        config['Engine']['eval_iters'],\n        \"Test\":\n        cfg_global['global_batch_size'] * config['Engine']['test_iters'],\n    }\n\n    for mode in (\"Train\", \"Eval\", \"Test\"):\n        if mode in cfg_data.keys():\n            cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[\n                mode]\n            cfg_data[mode]['dataset']['mode'] = mode\n            cfg_data[mode]['dataset']['seed'] = cfg_global['seed']\n            
cfg_data[mode]['dataset'].setdefault('binary_head',\n                                                 cfg_global['binary_head'])\n            cfg_data[mode]['collate_fn'].setdefault(\n                'micro_batch_size', cfg_global['micro_batch_size'])\n\n\ndef process_model_configs(config):\n    mesh = process_mesh_config(config['Distributed'])\n    cfg_model = config['Model']\n    hidden_size = cfg_model['hidden_size']\n    cfg_model.update({'mesh': mesh})\n    cfg_model.setdefault(\"intermediate_size\", hidden_size * 4)\n\n\nclass ErnieModuleAuto(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        super(ErnieModuleAuto, self).__init__(configs)\n        self.nranks = paddle.distributed.get_world_size()\n        self.binary_head = self.configs['Global']['binary_head']\n\n        self.loss_fn = ErniePretrainingCriterionAuto(self.binary_head)\n\n    def process_configs(self, configs):\n        process_data_configs(configs)\n        process_model_configs(configs)\n        return configs\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        model_setting.pop(\"module\")\n        model_setting.pop(\"name\")\n\n        with LazyGuard():\n            model = ErnieForPretrainingAuto(ErnieModelAuto(**model_setting))\n\n        return model\n\n    def input_spec(self):\n        inputs_spec = [\n            paddle.static.InputSpec(\n                shape=[None, None], name=\"input_ids\", dtype=\"int64\"),\n            paddle.static.InputSpec(\n                shape=[None, None], name=\"token_type_ids\", dtype=\"int64\"),\n            paddle.static.InputSpec(\n                shape=[None, None], name=\"position_ids\", dtype=\"int64\"),\n        ]\n\n        return inputs_spec\n\n\nclass ErnieSeqClsModuleAuto(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        super(ErnieSeqClsModuleAuto, 
self).__init__(configs)\n\n    def process_configs(self, configs):\n        process_model_configs(configs)\n\n        cfg_global = configs['Global']\n        cfg_data = configs['Data']\n\n        for mode in (\"Train\", \"Eval\", \"Test\"):\n            if mode in cfg_data.keys():\n                cfg_data[mode]['dataset']['mode'] = mode\n                cfg_data[mode]['collate_fn'].setdefault(\n                    'tokenizer_type',\n                    cfg_data[mode]['dataset']['tokenizer_type'])\n\n        return configs\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        model_setting.pop(\"module\")\n        model_setting.pop(\"name\")\n\n        with LazyGuard():\n            model = ErnieForSequenceClassificationAuto(\n                ErnieModelAuto(**model_setting))\n\n        return model\n\n    def input_spec(self):\n        input_spec = [\n            paddle.static.InputSpec(\n                shape=[None, None], dtype=\"int64\", name='input_ids'),\n            paddle.static.InputSpec(\n                shape=[None, None], dtype=\"int64\", name='token_type_ids')\n        ]\n        return input_spec\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/auto/auto_transformer.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# TODO: define the classes of Transformer neural network\n\nimport copy\nimport collections\nimport numpy as np\n\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\nimport paddle.distributed.auto_parallel as auto\n\nfrom paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer\nimport paddle.tensor as tensor\nfrom paddle.fluid import layers\nfrom paddle import ParamAttr\nfrom paddle.fluid.data_feeder import convert_dtype\nfrom ..layers.model_outputs import BaseModelOutputWithPastAndCrossAttentions\n\n__all__ = []\n\n\ndef _convert_param_attr_to_list(param_attr, n):\n    \"\"\"\n    If `param_attr` is a list or tuple, convert every element in it to a\n    ParamAttr instance. 
Otherwise, repeat `param_attr` `n` times to\n    construct a list, and rename every one by appending a increasing index\n    suffix to avoid having same names when `param_attr` contains a name.\n\n    Parameters:\n        param_attr (list|tuple|ParamAttr): A list, tuple or something can be\n            converted to a ParamAttr instance by `ParamAttr._to_attr`.\n        n (int): The times to repeat to construct a list when `param_attr`\n            is not a list or tuple.\n\n    Returns:\n        list: A list composed of each including cell's `param_attr`.\n    \"\"\"\n    if isinstance(param_attr, (list, tuple)):\n        assert len(param_attr) == n, (\n            \"length of param_attr should be %d when it is a list/tuple\" % n)\n        param_attrs = []\n        for attr in param_attr:\n            if isinstance(attr, bool):\n                if attr:\n                    param_attrs.append(ParamAttr._to_attr(None))\n                else:\n                    param_attrs.append(False)\n            else:\n                param_attrs.append(ParamAttr._to_attr(attr))\n        # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]\n    elif isinstance(param_attr, bool):\n        param_attrs = []\n        if param_attr:\n            param_attrs = [ParamAttr._to_attr(None) for i in range(n)]\n        else:\n            param_attrs = [False] * n\n    else:\n        param_attrs = []\n        attr = ParamAttr._to_attr(param_attr)\n        for i in range(n):\n            attr_i = copy.deepcopy(attr)\n            if attr.name:\n                attr_i.name = attr_i.name + \"_\" + str(i)\n            param_attrs.append(attr_i)\n    return param_attrs\n\n\ndef _convert_attention_mask(attn_mask, dtype):\n    \"\"\"\n    Convert the attention mask to the target dtype we expect.\n\n    Parameters:\n        attn_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                
paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n        dtype (VarType): The target type of `attn_mask` we expect.\n\n    Returns:\n        Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`.\n    \"\"\"\n    if attn_mask is not None and attn_mask.dtype != dtype:\n        attn_mask_dtype = convert_dtype(attn_mask.dtype)\n        if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype:\n            attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9\n        else:\n            attn_mask = paddle.cast(attn_mask, dtype)\n    return attn_mask\n\n\nclass MultiHeadAttention(Layer):\n    \"\"\"\n    Attention mapps queries and a set of key-value pairs to outputs, and\n    Multi-Head Attention performs multiple parallel attention to jointly attending\n    to information from different representation subspaces.\n\n    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_\n    for more details.\n\n    Parameters:\n        embed_dim (int): The expected feature size in the input and output.\n        num_heads (int): The number of heads in multi-head attention.\n        dropout (float, optional): The dropout probability used on attention\n            weights to drop some attention targets. 0 for no dropout. Default 0\n        kdim (int, optional): The feature size in key. If None, assumed equal to\n            `embed_dim`. 
Default None.\n        vdim (int, optional): The feature size in value. If None, assumed equal to\n            `embed_dim`. Default None.\n        need_weights (bool, optional): Indicate whether to return the attention\n            weights. Default False.\n        weight_attr(ParamAttr, optional):  To specify the weight parameter property.\n            Default: None, which means the default weight parameter property is used.\n            See usage for details in :code:`ParamAttr` .\n        bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.\n            Default: None, which means the default bias parameter property is used.\n            If it is set to False, this layer will not have trainable bias parameter.\n            See usage for details in :code:`ParamAttr` .\n\n    Examples:\n\n        .. code-block:: python\n\n            import paddle\n\n            # encoder input: [batch_size, sequence_length, d_model]\n            query = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, num_heads, query_len, query_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)\n            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    Cache = collections.namedtuple(\"Cache\", [\"k\", \"v\"])\n    StaticCache = collections.namedtuple(\"StaticCache\", [\"k\", \"v\"])\n\n    def __init__(self,\n                 embed_dim,\n                 num_heads,\n                 dropout=0.,\n                 kdim=None,\n                 vdim=None,\n                 need_weights=False,\n                 weight_attr=None,\n                 bias_attr=None,\n                 mesh=None,\n                 mesh_idx=None):\n        super(MultiHeadAttention, self).__init__()\n\n        assert embed_dim > 0, (\"Expected embed_dim to be greater than 0, \"\n                               \"but received {}\".format(embed_dim))\n      
  assert num_heads > 0, (\"Expected num_heads to be greater than 0, \"\n                               \"but received {}\".format(num_heads))\n\n        self.embed_dim = embed_dim\n        self.kdim = kdim if kdim is not None else embed_dim\n        self.vdim = vdim if vdim is not None else embed_dim\n        self.num_heads = num_heads\n        self.dropout = dropout\n        self.need_weights = need_weights\n        self.mesh = mesh\n        self.mesh_idx = mesh_idx\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        self.q_proj = Linear(\n            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)\n        self.k_proj = Linear(\n            self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)\n        self.v_proj = Linear(\n            self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)\n        self.out_proj = Linear(\n            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)\n\n    def _prepare_qkv(self, query, key, value, cache=None):\n        r\"\"\"\n        Prapares linear projected queries, keys and values for usage of subsequnt\n        multiple parallel attention. If `cache` is not None, using cached results\n        to reduce redundant calculations.\n\n        Parameters:\n            query (Tensor): The queries for multi-head attention. It is a\n                tensor with shape `[batch_size, query_length, embed_dim]`. The\n                data type should be float32 or float64.\n            key (Tensor): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If None, use `query` as\n                `key`.\n            value (Tensor): The values for multi-head attention. 
It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. If None, use `query` as\n                `value`.\n            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):\n                It is a namedtuple with `k` and `v` as fields, and stores tensors\n                shaped `[batch_size, num_heads, length, embed_dim]` which are results\n                of linear projection, reshape and transpose calculations in\n                MultiHeadAttention. If is an instance of `Cache`, `k` and `v`\n                fields reserve intermediate results of previous positions, which\n                mostly used for decoder self attention. If it is an instance of\n                `StaticCache`, `key` and `value` args would be ignored, `k` and\n                `v` fields would be used as calculated results on `key` and\n                `value`, which mostly used for decoder-encoder cross attention.\n                It is only used for inference and should be None for training.\n                Default None.\n\n        Returns:\n            tuple: A tuple including linear projected keys and values. 
These two \\\n                tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \\\n                and `[batch_size, n_head, sequence_length, d_value]` separately, \\\n                and their data types are same as inputs.\n        \"\"\"\n        auto.shard_tensor(self.q_proj.weight, self.mesh[self.mesh_idx],\n                          [None, self.mesh.mp])\n\n        q = self.q_proj(query)\n        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])\n        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])\n\n        if isinstance(cache, self.StaticCache):\n            # for encoder-decoder attention in inference and has cached\n            k, v = cache.k, cache.v\n        else:\n            k, v = self.compute_kv(key, value)\n\n        if isinstance(cache, self.Cache):\n            # for decoder self-attention in inference\n            k = tensor.concat([cache.k, k], axis=2)\n            v = tensor.concat([cache.v, v], axis=2)\n            cache = self.Cache(k, v)\n\n        return (q, k, v) if cache is None else (q, k, v, cache)\n\n    def compute_kv(self, key, value):\n        r\"\"\"\n        Applies linear projection on input keys and values, then splits heads\n        (reshape and transpose) to get keys and values from different representation\n        subspaces. The results are used as key-values pairs for subsequent multiple\n        parallel attention.\n\n        It is part of calculations in multi-head attention, and is provided as\n        a method to pre-compute and prefetch these results, thus we can use them\n        to construct cache for inference.\n\n        Parameters:\n            key (Tensor): The keys for multi-head attention. It is a tensor\n                with shape `[batch_size, sequence_length, kdim]`. The data type\n                should be float32 or float64.\n            value (Tensor): The values for multi-head attention. 
It is a tensor\n                with shape `[batch_size, sequence_length, vdim]`. The data type\n                should be float32 or float64.\n\n        Returns:\n            tuple: A tuple including transformed keys and values. Their shapes \\\n                both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \\\n                and their data types are same as inputs.\n        \"\"\"\n        auto.shard_tensor(self.k_proj.weight, self.mesh[self.mesh_idx],\n                          [None, self.mesh.mp])\n        auto.shard_tensor(self.v_proj.weight, self.mesh[self.mesh_idx],\n                          [None, self.mesh.mp])\n\n        k = self.k_proj(key)\n        v = self.v_proj(value)\n        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])\n        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])\n        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])\n        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])\n        return k, v\n\n    def gen_cache(self, key, value=None, type=Cache):\n        \"\"\"\n        Generates cache for `forward` usage in inference accroding to arguments.\n        The generated cache is an instance of `MultiHeadAttention.Cache` or an\n        instance of `MultiHeadAttention.StaticCache`.\n\n        `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields,\n        and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]`\n        which are results of linear projection, reshape and transpose calculations\n        in MultiHeadAttention.\n\n        If the generated cache is an instance of `Cache`, `k` and `v` fields\n        reserve intermediate result tensors of previous positions, and the tensors\n        are incremental among decoding steps, which mostly are used for decoder\n        decoder self attention.\n\n        If the generated cache is an instance of `StaticCache`, `k` and `v` fields\n        would be used as calculated result tensors on 
keys an values in `forward`,\n        and the tensors keep unchanged among decoding steps, which are mostly used\n        for decoder-encoder cross attention.\n\n        The cache is generated as follows:\n\n        1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the\n        results to create an instance of `StaticCache`.\n\n        2. If `type` is `Cache` and `value` is None, generate empty tensors shaped\n        `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results\n        to create an instance of `Cache`, where `batch_size` is from the first\n        dimension of `key`.\n\n        3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create\n        an instance of `Cache`.\n\n        Parameters:\n            key (Tensor): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If `value` is None,\n                it is only for batch size and data type reference.\n            value (Tensor, optional): The values for multi-head attention. It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. If None, `key` is only\n                for batch size reference. 
Default None.\n            type (type): It should be `MultiHeadAttention.StaticCache` or\n                `MultiHeadAttention.Cache` to indicate the cache type to generate.\n\n        Returns:\n            namedtuple: an instance of `Cache` or `StaticCache` accordingly.\n        \"\"\"\n        if type == MultiHeadAttention.StaticCache:  # static_kv\n            k, v = self.compute_kv(key, value)\n            return self.StaticCache(k, v)\n        elif value is None:  # incremental_state\n            k = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            v = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            return self.Cache(k, v)\n        else:\n            # incremental_state with initial value, mainly for usage like UniLM\n            return self.Cache(key, value)\n\n    def forward(self, query, key=None, value=None, attn_mask=None, cache=None):\n        r\"\"\"\n        Applies multi-head attention to map queries and a set of key-value pairs\n        to outputs.\n\n        Parameters:\n            query (Tensor): The queries for multi-head attention. It is a\n                tensor with shape `[batch_size, query_length, embed_dim]`. The\n                data type should be float32 or float64.\n            key (Tensor, optional): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If None, use `query` as\n                `key`. Default None.\n            value (Tensor, optional): The values for multi-head attention. 
It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. If None, use `query` as\n                `value`. Default None.\n            attn_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):\n                It is a namedtuple with `k` and `v` as fields, and stores tensors\n                shaped `[batch_size, num_heads, length, embed_dim]` which are results\n                of linear projection, reshape and transpose calculations in\n                MultiHeadAttention. If it is an instance of `Cache`, `k` and `v`\n                fields reserve intermediate results of previous positions, which\n                mostly used for decoder self attention. 
If it is an instance of\n                `StaticCache`, `key` and `value` args would be ignored, `k` and\n                `v` fields would be used as calculated results on `key` and\n                `value`, which mostly used for decoder-encoder cross attention.\n                It is only used for inference and should be None for training.\n                Default None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `query`, representing attention output. Or a tuple if \\\n                `need_weights` is True or `cache` is not None. If `need_weights` \\\n                is True, except for attention output, the tuple also includes \\\n                the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \\\n                If `cache` is not None, the tuple then includes the new cache \\\n                having the same type as `cache`, and if it is `StaticCache`, it \\\n                is same as the input `cache`, if it is `Cache`, the new cache \\\n                reserves tensors concatanating raw tensors with intermediate \\\n                results of current query.\n        \"\"\"\n        key = query if key is None else key\n        value = query if value is None else value\n        # compute q ,k ,v\n        if cache is None:\n            q, k, v = self._prepare_qkv(query, key, value, cache)\n        else:\n            q, k, v, cache = self._prepare_qkv(query, key, value, cache)\n\n        # scale dot product attention\n        product = paddle.matmul(\n            x=q * (self.head_dim**-0.5), y=k, transpose_y=True)\n\n        if attn_mask is not None:\n            # Support bool or int mask\n            attn_mask = _convert_attention_mask(attn_mask, product.dtype)\n            product = product + attn_mask\n\n        weights = F.softmax(product)\n\n        if self.dropout:\n            # with get_rng_state_tracker().rng_state('local_seed'):\n   
         weights = F.dropout(\n                weights,\n                self.dropout,\n                training=self.training,\n                mode=\"upscale_in_train\")\n\n        out = paddle.matmul(weights, v)\n\n        # combine heads\n        out = tensor.transpose(out, perm=[0, 2, 1, 3])\n        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])\n\n        auto.shard_tensor(self.out_proj.weight, self.mesh[self.mesh_idx],\n                          [self.mesh.mp, None])\n        # project to output\n        out = self.out_proj(out)\n\n        outs = [out]\n        if self.need_weights:\n            outs.append(weights)\n        if cache is not None:\n            outs.append(cache)\n        return out if len(outs) == 1 else tuple(outs)\n\n\nclass TransformerEncoderLayer(Layer):\n    \"\"\"\n    TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)\n    attention and feedforward network. Before and after each sub-layer, pre-process\n    and post-precess would be applied on the input and output accordingly. If\n    `normalize_before` is True, pre-process is layer normalization and post-precess\n    includes dropout, residual connection. Otherwise, no pre-process and post-precess\n    includes dropout, residual connection, layer normalization.\n\n    Parameters:\n        d_model (int): The expected feature size in the input and output.\n        nhead (int): The number of heads in multi-head attention(MHA).\n        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).\n        dropout (float, optional): The dropout probability used in pre-process\n            and post-precess of MHA and FFN sub-layer. Default 0.1\n        activation (str, optional): The activation function in the feedforward\n            network. Default relu.\n        attn_dropout (float, optional): The dropout probability used\n            in MHA to drop some attention target. 
If None, use the value of\n            `dropout`. Default None\n        act_dropout (float, optional): The dropout probability used after FFN\n            activition.  If None, use the value of `dropout`. Default None\n        normalize_before (bool, optional): Indicate whether to put layer normalization\n            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer\n            normalization and post-precess includes dropout, residual connection.\n            Otherwise, no pre-process and post-precess includes dropout, residual\n            connection, layer normalization. Default False\n        weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.\n            If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for\n            MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN.\n            Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.\n            Default: None, which means the default weight parameter property is used.\n            See usage for details in :code:`ParamAttr` .\n        bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.\n            If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for\n            MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.\n            Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.\n            The `False` value means the corresponding layer would not have trainable\n            bias parameter. See usage for details in :code:`ParamAttr` . Default: None,\n            which means the default bias parameter property is used.\n\n\n    Examples:\n\n        .. 
code-block:: python\n\n            import paddle\n            from paddle.nn import TransformerEncoderLayer\n\n            # encoder input: [batch_size, src_len, d_model]\n            enc_input = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, n_head, src_len, src_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            encoder_layer = TransformerEncoderLayer(128, 2, 512)\n            enc_output = encoder_layer(enc_input, attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False,\n                 weight_attr=None,\n                 bias_attr=None,\n                 mesh=None,\n                 mesh_idx=None):\n        self._config = locals()\n        self._config.pop(\"self\")\n        self._config.pop(\"__class__\", None)  # py3\n\n        super(TransformerEncoderLayer, self).__init__()\n\n        assert d_model > 0, (\"Expected d_model to be greater than 0, \"\n                             \"but received {}\".format(d_model))\n        assert nhead > 0, (\"Expected nhead to be greater than 0, \"\n                           \"but received {}\".format(nhead))\n        assert dim_feedforward > 0, (\n            \"Expected dim_feedforward to be greater than 0, \"\n            \"but received {}\".format(dim_feedforward))\n\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n        self.mesh = mesh\n        self.mesh_idx = mesh_idx\n\n        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)\n        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)\n\n        self.self_attn = MultiHeadAttention(\n       
     d_model,\n            nhead,\n            dropout=attn_dropout,\n            weight_attr=weight_attrs[0],\n            bias_attr=bias_attrs[0],\n            mesh=mesh,\n            mesh_idx=mesh_idx)\n        self.linear1 = Linear(\n            d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1])\n        self.dropout = Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.linear2 = Linear(\n            dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1])\n        self.norm1 = LayerNorm(d_model)\n        self.norm2 = LayerNorm(d_model)\n        self.dropout1 = Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = Dropout(dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n\n    def forward(self, src, src_mask=None, cache=None, output_attentions=False):\n        r\"\"\"\n        Applies a Transformer encoder layer on the input.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder layer. It is\n                a tensor with shape `[batch_size, sequence_length, d_model]`.\n                The data type should be float32 or float64.\n            src_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. 
Default None.\n            cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.\n                See `TransformerEncoderLayer.gen_cache` for more details. It is\n                only used for inference and should be None for training. Default\n                None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `enc_input`, representing the output of Transformer encoder \\\n                layer. Or a tuple if `cache` is not None, except for encoder \\\n                layer output, the tuple includes the new cache which is same \\\n                as input `cache` argument but `incremental_cache` has an \\\n                incremental length. See `MultiHeadAttention.gen_cache` and \\\n                `MultiHeadAttention.forward` for more details.\n        \"\"\"\n        self.self_attn.need_weights = output_attentions\n        src_mask = _convert_attention_mask(src_mask, src.dtype)\n\n        auto.shard_tensor(self.linear1.weight, self.mesh[self.mesh_idx],\n                          [None, self.mesh.mp])\n        auto.shard_tensor(self.linear2.weight, self.mesh[self.mesh_idx],\n                          [self.mesh.mp, None])\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm1(src)\n\n        attn_outputs = self.self_attn(src, src, src, src_mask, cache)\n        if isinstance(attn_outputs, tuple):\n            src = attn_outputs[0]\n            outputs = attn_outputs[1:]\n        else:\n            src = attn_outputs\n            outputs = None\n\n        src = residual + self.dropout1(src)\n        if not self.normalize_before:\n            src = self.norm1(src)\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm2(src)\n        src = self.linear2(self.dropout(self.activation(self.linear1(src))))\n        src = residual + self.dropout2(src)\n        if not self.normalize_before:\n            
src = self.norm2(src)\n\n        return src if outputs is None else (\n            (src, ) + outputs[::-1])  # hidden_states, cache, attentions\n\n    def gen_cache(self, src):\n        r\"\"\"\n        Generates cache for `forward` usage. The generated cache is an\n        instance of `MultiHeadAttention.Cache`.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder. It is a tensor\n                with shape `[batch_size, source_length, d_model]`. The data\n                type should be float32 or float64.\n\n        Returns:\n            incremental_cache: It is an instance of `MultiHeadAttention.Cache` \\\n                produced by `self_attn.gen_cache`, it reserves two tensors\n                shaped `[batch_size, nhead, 0, d_model // nhead]`. See \\\n                `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \\\n                for more details.\n        \"\"\"\n        incremental_cache = self.self_attn.gen_cache(\n            src, type=self.self_attn.Cache)\n        return incremental_cache\n\n\nclass TransformerEncoder(Layer):\n    \"\"\"\n    TransformerEncoder is a stack of N encoder layers.\n\n    Parameters:\n        encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It\n            would be used as the first layer, and the other layers would be created\n            according to the configurations of it.\n        num_layers (int): The number of encoder layers to be stacked.\n        norm (LayerNorm, optional): the layer normalization component. If provided,\n            apply layer normalization on the output of last encoder layer.\n\n    Examples:\n\n        .. 
code-block:: python\n\n            import paddle\n            from paddle.nn import TransformerEncoderLayer, TransformerEncoder\n\n            # encoder input: [batch_size, src_len, d_model]\n            enc_input = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, n_head, src_len, src_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            encoder_layer = TransformerEncoderLayer(128, 2, 512)\n            encoder = TransformerEncoder(encoder_layer, 2)\n            enc_output = encoder(enc_input, attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    def __init__(self,\n                 encoder_layer,\n                 num_layers,\n                 norm=None,\n                 enable_recompute=False,\n                 mesh=None):\n        super(TransformerEncoder, self).__init__()\n        self.stages = mesh.stages(num_layers)\n        self.layers = nn.LayerList()\n        for i in range(num_layers):\n            if i == 0:\n                self.layers.append(encoder_layer)\n            else:\n                encoder_layer._config.update({\n                    \"mesh\": mesh,\n                    \"mesh_idx\": self.stages[i]\n                })\n                self.layers.append(\n                    type(encoder_layer)(**encoder_layer._config))\n\n        self.num_layers = num_layers\n        self.norm = norm\n        self.enable_recompute = enable_recompute\n\n    def forward(self,\n                src,\n                src_mask=None,\n                cache=None,\n                output_attentions=False,\n                output_hidden_states=False,\n                return_dict=False):\n        r\"\"\"\n        Applies a stack of N Transformer encoder layers on inputs. If `norm` is\n        provided, also applies layer normalization on the output of last encoder\n        layer.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder. 
It is a tensor\n                with shape `[batch_size, sequence_length, d_model]`. The data\n                type should be float32 or float64.\n            src_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n            cache (list, optional): It is a list, and each element in the list\n                is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`.\n                See `TransformerEncoder.gen_cache` for more details. It is only\n                used for inference and should be None for training. Default None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `src`, representing the output of Transformer encoder. \\\n                Or a tuple if `cache` is not None, except for encoder output, \\\n                the tuple includes the new cache which is same as input `cache` \\\n                argument but `incremental_cache` in it has an incremental length. 
\\\n                See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \\\n                for more details.\n        \"\"\"\n        src_mask = _convert_attention_mask(src_mask, src.dtype)\n\n        output = src\n        # To get cache from None when use_cache is True, which is compatible with HF\n        # while HF requires decoder. The implementation here uses cache update in the\n        # MultiHeadAttention not so efficiently, and maybe optimize it later.\n        if cache is None and getattr(self, \"_use_cache\", False):\n            cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers)\n        # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts\n        # to True when cache is not None.\n        new_caches = [] if cache is not None and getattr(self, \"_use_cache\",\n                                                         True) else None\n        all_attentions = [] if output_attentions else None\n        # NOTE: Also includes embeding output which is same as HF.\n        all_hidden_states = [output] if output_hidden_states else None\n        for i, mod in enumerate(self.layers):\n            auto.shard_tensor(\n                output, mod.mesh[mod.mesh_idx],\n                [mod.mesh.dp] + [None for i in range(len(output.shape) - 1)])\n\n            if self.enable_recompute:\n                layer_outputs = auto.recompute(mod)(\n                    output, src_mask, None if cache is None else cache[i]\n                    if isinstance(cache[i], MultiHeadAttention.Cache) else\n                    MultiHeadAttention.Cache(*cache[i]), output_attentions)\n            else:\n                layer_outputs = mod(\n                    output,\n                    src_mask=src_mask,\n                    cache=None if cache is None else cache[i]\n                    if isinstance(cache[i], MultiHeadAttention.Cache) else\n                    MultiHeadAttention.Cache(*cache[i]),\n                    
output_attentions=output_attentions)\n\n            if isinstance(layer_outputs, tuple):\n                output = layer_outputs[0]\n                outputs = layer_outputs[1:]\n            else:\n                output = layer_outputs\n                outputs = None\n\n            if output_hidden_states:\n                all_hidden_states.append(output)\n            if output_attentions:\n                all_attentions.append(outputs[-1])\n            if new_caches is not None:\n                new_caches.append(outputs[0] if isinstance(cache[\n                    i], MultiHeadAttention.Cache) else (tuple(outputs[0])))\n\n        if self.norm is not None:\n            output = self.norm(output)\n\n            if output_hidden_states:\n                all_hidden_states[-1] = output\n\n        if not return_dict:\n            return output\n\n        return BaseModelOutputWithPastAndCrossAttentions(\n            last_hidden_state=output,\n            past_key_values=new_caches,\n            hidden_states=all_hidden_states,\n            attentions=all_attentions)\n\n    def gen_cache(self, src):\n        r\"\"\"\n        Generates cache for `forward` usage. The generated cache is a list, and\n        each element in it is `incremental_cache` produced by\n        `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache`\n        for more details.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder. It is a tensor\n                with shape `[batch_size, source_length, d_model]`. The data type\n                should be float32 or float64.\n\n        Returns:\n            list: It is a list, and each element in the list is `incremental_cache`\n            produced by `TransformerEncoderLayer.gen_cache`. See\n            `TransformerEncoderLayer.gen_cache` for more details.\n        \"\"\"\n        cache = [layer.gen_cache(src) for layer in self.layers]\n        return cache\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/dygraph/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/dygraph/hybrid_model.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport io\nimport copy\nimport logging\nimport json\n\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import functional as F\nfrom dataclasses import dataclass, field\n\nfrom ..layers.model_outputs import (\n    BaseModelOutputWithPoolingAndCrossAttentions,\n    ModelOutput,\n    ErnieForPreTrainingOutput,\n    SequenceClassifierOutput, )\n\nfrom ..layers.distributed_transformer import TransformerEncoderLayer, TransformerEncoder\nfrom paddle.distributed import fleet\nfrom paddle.distributed.fleet.meta_parallel import get_rng_state_tracker\nfrom paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc\n\nfrom ppfleetx.distributed.apis import env\n\n\ndef parallel_matmul(lm_output, logit_weights, parallel_output):\n    \"\"\"\n    \"\"\"\n    hcg = env.get_hcg()\n    model_parallel_group = hcg.get_model_parallel_group()\n    world_size = hcg.get_model_parallel_world_size()\n    rank = hcg.get_model_parallel_rank()\n\n    if world_size > 1:\n        input_parallel = paddle.distributed.collective._c_identity(\n            lm_output, group=model_parallel_group)\n\n        logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True)\n\n        if parallel_output:\n            return logits\n\n        return paddle.distributed.collective._c_concat(\n            logits, 
group=model_parallel_group)\n    else:\n        logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)\n        return logits\n\n\nclass ErnieEmbeddings(nn.Layer):\n    r\"\"\"\n    Include embeddings from word, position and token_type embeddings.\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 hidden_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=2,\n                 pad_token_id=0,\n                 weight_attr=None,\n                 task_type_vocab_size=3,\n                 task_id=0,\n                 use_task_id=False):\n        super(ErnieEmbeddings, self).__init__()\n\n        # self.word_embeddings = nn.Embedding(\n        #     vocab_size,\n        #     hidden_size,\n        #     padding_idx=pad_token_id,\n        #     weight_attr=weight_attr)\n\n        self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding(\n            vocab_size, hidden_size, weight_attr=weight_attr)\n\n        self.position_embeddings = nn.Embedding(\n            max_position_embeddings, hidden_size, weight_attr=weight_attr)\n        self.type_vocab_size = type_vocab_size\n        if self.type_vocab_size > 0:\n            self.token_type_embeddings = nn.Embedding(\n                type_vocab_size, hidden_size, weight_attr=weight_attr)\n        self.use_task_id = use_task_id\n        self.task_id = task_id\n        if self.use_task_id:\n            self.task_type_embeddings = nn.Embedding(\n                task_type_vocab_size, hidden_size, weight_attr=weight_attr)\n        self.layer_norm = nn.LayerNorm(hidden_size)\n        self.dropout = nn.Dropout(hidden_dropout_prob)\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                task_type_ids=None,\n                inputs_embeds=None,\n                past_key_values_length=None):\n        if input_ids 
is not None:\n            input_shape = paddle.shape(input_ids)\n            input_embeddings = self.word_embeddings(input_ids)\n\n        else:\n            input_shape = paddle.shape(inputs_embeds)[:-1]\n            input_embeddings = inputs_embeds\n\n        if position_ids is None:\n            # maybe need use shape op to unify static graph and dynamic graph\n            #seq_length = input_ids.shape[1]\n            ones = paddle.ones(input_shape, dtype=\"int64\")\n            seq_length = paddle.cumsum(ones, axis=1)\n            position_ids = seq_length - ones\n            if past_key_values_length is not None:\n                position_ids += past_key_values_length\n            position_ids.stop_gradient = True\n\n        position_embeddings = self.position_embeddings(position_ids)\n        embeddings = input_embeddings + position_embeddings\n\n        if self.type_vocab_size > 0:\n            if token_type_ids is None:\n                token_type_ids = paddle.zeros(input_shape, dtype=\"int64\")\n            token_type_embeddings = self.token_type_embeddings(token_type_ids)\n\n            embeddings = embeddings + token_type_embeddings\n\n        if self.use_task_id:\n            if task_type_ids is None:\n                task_type_ids = paddle.ones(\n                    input_shape, dtype=\"int64\") * self.task_id\n            task_type_embeddings = self.task_type_embeddings(task_type_ids)\n            embeddings = embeddings + task_type_embeddings\n        embeddings = self.layer_norm(embeddings)\n        embeddings = self.dropout(embeddings)\n        return embeddings\n\n\nclass ErniePooler(nn.Layer):\n    def __init__(self, hidden_size, weight_attr=None):\n        super(ErniePooler, self).__init__()\n        self.dense = nn.Linear(\n            hidden_size, hidden_size, weight_attr=weight_attr)\n        self.activation = nn.Tanh()\n\n    def forward(self, hidden_states):\n        # We \"pool\" the model by simply taking the hidden state corresponding\n  
      # to the first token.\n        first_token_tensor = hidden_states[:, 0]\n        pooled_output = self.dense(first_token_tensor)\n        pooled_output = self.activation(pooled_output)\n        return pooled_output\n\n\nclass ErnieModelHybrid(nn.Layer):\n    r\"\"\"\n    The bare ERNIE Model transformer outputting raw hidden-states.\n\n    This model is a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation\n    /docs/en/api/paddle/fluid/dygraph/layers/Layer_en.html>`__ subclass. Use it as a regular Paddle Layer\n    and refer to the Paddle documentation for all matter related to general usage and behavior.\n\n    Args:\n        vocab_size (int):\n            Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix.\n            Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`.\n        hidden_size (int, optional):\n            Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`.\n        num_hidden_layers (int, optional):\n            Number of hidden layers in the Transformer encoder. Defaults to `12`.\n        num_attention_heads (int, optional):\n            Number of attention heads for each attention layer in the Transformer encoder.\n            Defaults to `12`.\n        intermediate_size (int, optional):\n            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors\n            to ff layers are firstly projected from `hidden_size` to `intermediate_size`,\n            and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`.\n            Defaults to `3072`.\n        hidden_act (str, optional):\n            The non-linear activation function in the feed-forward layer.\n            ``\"gelu\"``, ``\"relu\"`` and any other paddle supported activation functions\n            are supported. 
Defaults to `\"gelu\"`.\n        hidden_dropout_prob (float, optional):\n            The dropout probability for all fully connected layers in the embeddings and encoder.\n            Defaults to `0.1`.\n        attention_probs_dropout_prob (float, optional):\n            The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target.\n            Defaults to `0.1`.\n        max_position_embeddings (int, optional):\n            The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input\n            sequence. Defaults to `512`.\n        type_vocab_size (int, optional):\n            The vocabulary size of the `token_type_ids`.\n            Defaults to `2`.\n        initializer_range (float, optional):\n            The standard deviation of the normal initializer for initializing all weight matrices.\n            Defaults to `0.02`.\n            \n            .. note::\n                A normal_initializer initializes weight matrices as normal distributions.\n                See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`.\n\n        pad_token_id(int, optional):\n            The index of padding token in the token vocabulary.\n            Defaults to `0`.\n\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 num_hidden_layers=12,\n                 num_attention_heads=12,\n                 intermediate_size=3072,\n                 hidden_act=\"gelu\",\n                 hidden_dropout_prob=0.1,\n                 attention_probs_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=2,\n                 initializer_range=0.02,\n                 pad_token_id=0,\n                 task_type_vocab_size=3,\n                 task_id=0,\n                 use_task_id=False,\n                 use_recompute=False,\n          
       num_partitions=1):\n        super(ErnieModelHybrid, self).__init__()\n        self.pad_token_id = pad_token_id\n        self.initializer_range = initializer_range\n\n        self.hidden_size = hidden_size\n        self.vocab_size = vocab_size\n        self.hidden_act = hidden_act\n        self.hidden_dropout_prob = hidden_dropout_prob\n\n        weight_attr = paddle.ParamAttr(\n            initializer=nn.initializer.TruncatedNormal(\n                mean=0.0, std=self.initializer_range))\n        self.embeddings = ErnieEmbeddings(\n            vocab_size, hidden_size, hidden_dropout_prob,\n            max_position_embeddings, type_vocab_size, pad_token_id,\n            weight_attr, task_type_vocab_size, task_id, use_task_id)\n\n        encoder_layer = TransformerEncoderLayer(\n            hidden_size,\n            num_attention_heads,\n            intermediate_size,\n            dropout=hidden_dropout_prob,\n            activation=hidden_act,\n            attn_dropout=attention_probs_dropout_prob,\n            act_dropout=0,\n            weight_attr=weight_attr,\n            normalize_before=False,\n            num_partitions=num_partitions)\n        self.encoder = TransformerEncoder(\n            encoder_layer, num_hidden_layers, enable_recompute=use_recompute)\n\n        self.pooler = ErniePooler(hidden_size, weight_attr)\n        self.apply(self.init_weights)\n\n    def get_input_embeddings(self):\n        return self.embeddings.word_embeddings\n\n    def set_input_embeddings(self, value):\n        self.embeddings.word_embeddings = value\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                attention_mask=None,\n                task_type_ids=None,\n                past_key_values=None,\n                inputs_embeds=None,\n                use_cache=None,\n                output_hidden_states=False,\n                output_attentions=False,\n                
return_dict=False):\n        r\"\"\"\n        Args:\n            input_ids (Tensor):\n                Indices of input sequence tokens in the vocabulary. They are\n                numerical representations of tokens that build the input sequence.\n                It's data type should be `int64` and has a shape of [batch_size, sequence_length].\n            token_type_ids (Tensor, optional):\n                Segment token indices to indicate different portions of the inputs.\n                Selected in the range ``[0, type_vocab_size - 1]``.\n                If `type_vocab_size` is 2, which means the inputs have two portions.\n                Indices can either be 0 or 1:\n\n                - 0 corresponds to a *sentence A* token,\n                - 1 corresponds to a *sentence B* token.\n\n                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].\n                Defaults to `None`, which means we don't add segment embeddings.\n            position_ids (Tensor, optional):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,\n                max_position_embeddings - 1]``.\n                Shape as `[batch_size, num_tokens]` and dtype as int64. 
Defaults to `None`.\n            attention_mask (Tensor, optional):\n                Mask used in multi-head attention to avoid performing attention on to some unwanted positions,\n                usually the paddings or the subsequent positions.\n                Its data type can be int, float and bool.\n                When the data type is bool, the `masked` tokens have `False` values and the others have `True` values.\n                When the data type is int, the `masked` tokens have `0` values and the others have `1` values.\n                When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values.\n                It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`.\n                For example, its shape can be  [batch_size, sequence_length], [batch_size, sequence_length, sequence_length],\n                [batch_size, num_attention_heads, sequence_length, sequence_length].\n                We use whole-word-mask in ERNIE, so the whole word will have the same value. 
For example, \"使用\" as a word,\n                \"使\" and \"用\" will have the same value.\n                Defaults to `None`, which means nothing needed to be prevented attention to.\n             inputs_embeds (Tensor, optional):\n                If you want to control how to convert `inputs_ids` indices into associated vectors, you can\n                pass an embedded representation directly instead of passing `inputs_ids`.\n            past_key_values (tuple(tuple(Tensor)), optional):\n                The length of tuple equals to the number of layers, and each inner\n                tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`)\n                which contains precomputed key and value hidden states of the attention blocks.\n                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that\n                don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all\n                `input_ids` of shape `(batch_size, sequence_length)`.\n            use_cache (`bool`, optional):\n                If set to `True`, `past_key_values` key value states are returned.\n                Defaults to `None`.\n            output_hidden_states (bool, optional):\n                Whether to return the hidden states of all layers.\n                Defaults to `False`.\n            output_attentions (bool, optional):\n                Whether to return the attentions tensors of all attention layers.\n                Defaults to `False`.\n            return_dict (bool, optional):\n                Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ModelOutput` object. \n                If `False`, the output will be a tuple of tensors. 
Defaults to `False`.\n\n        Returns:\n            An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if\n            `return_dict=True`. Otherwise it returns a tuple of tensors corresponding\n            to ordered and not None (depending on the input arguments) fields of\n            :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`.\n\n        \"\"\"\n        if input_ids is not None and inputs_embeds is not None:\n            raise ValueError(\n                \"You cannot specify both input_ids and inputs_embeds at the same time.\"\n            )\n        elif input_ids is not None:\n            input_shape = paddle.shape(input_ids)\n        elif inputs_embeds is not None:\n            input_shape = paddle.shape(inputs_embeds)[:-1]\n        else:\n            raise ValueError(\n                \"You have to specify either input_ids or inputs_embeds\")\n\n        past_key_values_length = None\n        if past_key_values is not None:\n            past_key_values_length = past_key_values[0][0].shape[2]\n\n        if attention_mask is None:\n            attention_mask = paddle.unsqueeze(\n                (input_ids == self.pad_token_id\n                 ).astype(self.pooler.dense.weight.dtype) * -1e4,\n                axis=[1, 2])\n            if past_key_values is not None:\n                batch_size = past_key_values[0][0].shape[0]\n                past_mask = paddle.zeros(\n                    [batch_size, 1, 1, past_key_values_length],\n                    dtype=attention_mask.dtype)\n                attention_mask = paddle.concat(\n                    [past_mask, attention_mask], axis=-1)\n\n        # For 2D attention_mask from tokenizer\n        elif attention_mask.ndim == 2:\n            attention_mask = paddle.unsqueeze(\n                attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())\n            
attention_mask = (1.0 - attention_mask) * -1e4\n        attention_mask.stop_gradient = True\n\n        embedding_output = self.embeddings(\n            input_ids=input_ids,\n            position_ids=position_ids,\n            token_type_ids=token_type_ids,\n            task_type_ids=task_type_ids,\n            inputs_embeds=inputs_embeds,\n            past_key_values_length=past_key_values_length)\n\n        self.encoder._use_cache = use_cache  # To be consistent with HF\n        encoder_outputs = self.encoder(\n            embedding_output,\n            src_mask=attention_mask,\n            cache=past_key_values,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict)\n        if isinstance(encoder_outputs, type(embedding_output)):\n            sequence_output = encoder_outputs\n            pooled_output = self.pooler(sequence_output)\n            return (sequence_output, pooled_output)\n        else:\n            sequence_output = encoder_outputs[0]\n            pooled_output = self.pooler(sequence_output)\n            if not return_dict:\n                return (sequence_output, pooled_output) + encoder_outputs[1:]\n            return BaseModelOutputWithPoolingAndCrossAttentions(\n                last_hidden_state=sequence_output,\n                pooler_output=pooled_output,\n                past_key_values=encoder_outputs.past_key_values,\n                hidden_states=encoder_outputs.hidden_states,\n                attentions=encoder_outputs.attentions)\n\n    def init_weights(self, layer):\n        \"\"\" Initialization hook \"\"\"\n        if isinstance(layer, (nn.Linear, nn.Embedding)):\n            # only support dygraph, use truncated_normal and make it inplace\n            # and configurable later\n            if isinstance(layer.weight, paddle.Tensor):\n                layer.weight.set_value(\n                    paddle.tensor.normal(\n                        
mean=0.0,\n                        std=self.initializer_range\n                        if hasattr(self, \"initializer_range\") else\n                        self.ernie.initializer_range,\n                        shape=layer.weight.shape))\n        elif isinstance(layer, nn.LayerNorm):\n            layer._epsilon = 1e-12\n\n\nclass ErnieLMPredictionHead(nn.Layer):\n    r\"\"\"\n    Ernie Model with a `language modeling` head on top.\n    \"\"\"\n\n    def __init__(\n            self,\n            hidden_size,\n            vocab_size,\n            activation,\n            embedding_weights=None,\n            weight_attr=None, ):\n        super(ErnieLMPredictionHead, self).__init__()\n\n        self.transform = nn.Linear(\n            hidden_size, hidden_size, weight_attr=weight_attr)\n        self.activation = getattr(nn.functional, activation)\n        self.layer_norm = nn.LayerNorm(hidden_size)\n\n        # TODO(shenliang03): to support shared weights in future\n        self.decoder_weight = self.create_parameter(\n            shape=[vocab_size, hidden_size],\n            dtype=self.transform.weight.dtype,\n            attr=weight_attr,\n            is_bias=False)\n        # if embedding_weights is None else embedding_weights\n        self.decoder_bias = self.create_parameter(\n            shape=[self.decoder_weight.shape[0]],\n            dtype=self.decoder_weight.dtype,\n            is_bias=True)\n\n    def forward(self, hidden_states, masked_positions=None):\n        if masked_positions is not None:\n            hidden_states = paddle.reshape(hidden_states,\n                                           [-1, hidden_states.shape[-1]])\n            hidden_states = paddle.tensor.gather(hidden_states,\n                                                 masked_positions)\n        # gather masked tokens might be more quick\n        hidden_states = self.transform(hidden_states)\n        hidden_states = self.activation(hidden_states)\n        hidden_states = 
self.layer_norm(hidden_states)\n        # hidden_states = parallel_matmul(hidden_states, self.decoder_weight, True) + self.decoder_bias\n\n        hidden_states = paddle.matmul(\n            hidden_states, self.decoder_weight,\n            transpose_y=True) + self.decoder_bias\n\n        return hidden_states\n\n\nclass ErniePretrainingHeads(nn.Layer):\n    def __init__(\n            self,\n            hidden_size,\n            vocab_size,\n            activation,\n            embedding_weights=None,\n            weight_attr=None, ):\n        super(ErniePretrainingHeads, self).__init__()\n        self.predictions = ErnieLMPredictionHead(hidden_size, vocab_size,\n                                                 activation, embedding_weights,\n                                                 weight_attr)\n        self.seq_relationship = nn.Linear(\n            hidden_size, 2, weight_attr=weight_attr)\n\n    def forward(self, sequence_output, pooled_output, masked_positions=None):\n        prediction_scores = self.predictions(sequence_output, masked_positions)\n        seq_relationship_score = self.seq_relationship(pooled_output)\n        return prediction_scores, seq_relationship_score\n\n\nclass ErnieForPretrainingHybrid(nn.Layer):\n    r\"\"\"\n    Ernie Model with a `masked language modeling` head and a `sentence order prediction` head\n    on top.\n\n    \"\"\"\n\n    def __init__(self, ernie):\n        super(ErnieForPretrainingHybrid, self).__init__()\n        self.ernie = ernie\n        weight_attr = paddle.ParamAttr(\n            initializer=nn.initializer.TruncatedNormal(\n                mean=0.0, std=self.ernie.initializer_range))\n        self.cls = ErniePretrainingHeads(\n            self.ernie.hidden_size,\n            self.ernie.vocab_size,\n            self.ernie.hidden_act,\n            embedding_weights=self.ernie.embeddings.word_embeddings.weight,\n            weight_attr=weight_attr, )\n\n        self.apply(self.init_weights)\n\n    def 
forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                attention_mask=None,\n                masked_positions=None,\n                inputs_embeds=None,\n                labels=None,\n                next_sentence_label=None,\n                output_hidden_states=False,\n                output_attentions=False,\n                return_dict=False):\n        r\"\"\"\n        Args:\n            input_ids (Tensor):\n                See :class:`ErnieModel`.\n            token_type_ids (Tensor, optional):\n                See :class:`ErnieModel`.\n            position_ids (Tensor, optional):\n                See :class:`ErnieModel`.\n            attention_mask (Tensor, optional):\n                See :class:`ErnieModel`.\n            inputs_embeds(Tensor, optional):\n                See :class:`ErnieModel`.\n            labels (Tensor of shape `(batch_size, sequence_length)`, optional):\n                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,\n                vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),\n                the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`.\n            next_sentence_label (Tensor of shape `(batch_size,)`, optional):\n                Labels for computing the next sequence prediction (classification) loss. 
Input should be a sequence\n                pair (see `input_ids` docstring) Indices should be in `[0, 1]`:\n\n                - 0 indicates sequence B is a continuation of sequence A,\n                - 1 indicates sequence B is a random sequence.\n            output_hidden_states (bool, optional):\n                Whether to return the hidden states of all layers.\n                Defaults to `False`.\n            output_attentions (bool, optional):\n                Whether to return the attentions tensors of all attention layers.\n                Defaults to `False`.\n            return_dict (bool, optional):\n                Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` object. If\n                `False`, the output will be a tuple of tensors. Defaults to `False`.\n\n        Returns:\n            An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` if `return_dict=True`.\n            Otherwise it returns a tuple of tensors corresponding to ordered and\n            not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput`.\n\n        \"\"\"\n        # with paddle.static.amp.fp16_guard():\n        outputs = self.ernie(\n            input_ids,\n            token_type_ids=token_type_ids,\n            position_ids=position_ids,\n            attention_mask=attention_mask,\n            inputs_embeds=inputs_embeds,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict)\n        sequence_output, pooled_output = outputs[:2]\n        prediction_scores, seq_relationship_score = self.cls(\n            sequence_output, pooled_output, masked_positions)\n\n        total_loss = None\n        if labels is not None and next_sentence_label is not None:\n            if 
env.get_hcg().get_model_parallel_world_size > 1 and paddle.is_compiled_with_cuda():\n                loss_fct = fleet.meta_parallel.ParallelCrossEntropy()\n            else:\n                loss_fct = paddle.nn.CrossEntropyLoss()\n\n            masked_lm_loss = loss_fct(\n                prediction_scores.reshape(\n                    (-1, paddle.shape(prediction_scores)[-1])),\n                labels.reshape((-1, )))\n            next_sentence_loss = loss_fct(\n                seq_relationship_score.reshape((-1, 2)),\n                next_sentence_label.reshape((-1, )))\n            total_loss = masked_lm_loss + next_sentence_loss\n\n        if not return_dict:\n            output = (prediction_scores, seq_relationship_score) + outputs[2:]\n            return (\n                (total_loss, ) + output) if total_loss is not None else output\n\n        return ErnieForPreTrainingOutput(\n            loss=total_loss,\n            prediction_logits=prediction_scores,\n            seq_relationship_logits=seq_relationship_score,\n            hidden_states=outputs.hidden_states,\n            attentions=outputs.attentions, )\n\n    def init_weights(self, layer):\n        \"\"\" Initialization hook \"\"\"\n        if isinstance(layer, (nn.Linear, nn.Embedding)):\n            # only support dygraph, use truncated_normal and make it inplace\n            # and configurable later\n            if isinstance(layer.weight, paddle.Tensor):\n                layer.weight.set_value(\n                    paddle.tensor.normal(\n                        mean=0.0,\n                        std=self.initializer_range\n                        if hasattr(self, \"initializer_range\") else\n                        self.ernie.initializer_range,\n                        shape=layer.weight.shape))\n        elif isinstance(layer, nn.LayerNorm):\n            layer._epsilon = 1e-12\n\n\nclass ErniePretrainingCriterionHybrid(paddle.nn.Layer):\n    r\"\"\"\n    The loss output of Ernie Model during the 
pretraining:\n    a `masked language modeling` head and a `next sentence prediction (classification)` head.\n\n    \"\"\"\n\n    def __init__(self, with_nsp_loss=True):\n        super(ErniePretrainingCriterionHybrid, self).__init__()\n        self.with_nsp_loss = with_nsp_loss\n\n    def forward(self,\n                prediction_scores,\n                seq_relationship_score,\n                masked_lm_labels,\n                next_sentence_labels=None):\n        \"\"\"\n        Args:\n            prediction_scores(Tensor):\n                The scores of masked token prediction. Its data type should be float32.\n                If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size].\n                Otherwise, its shape is [batch_size, mask_token_num, vocab_size]\n            seq_relationship_score(Tensor):\n                The scores of next sentence prediction. Its data type should be float32 and\n                its shape is [batch_size, 2]\n            masked_lm_labels(Tensor):\n                The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`.\n                Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1].\n                Otherwise, its shape is [batch_size, mask_token_num, 1]\n            next_sentence_labels(Tensor):\n                The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels`\n                is equal to `seq_relation_labels`. 
Its data type should be int64 and\n                its shape is [batch_size, 1]\n\n        Returns:\n            Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`.\n            Its data type should be float32 and its shape is [1].\n\n        \"\"\"\n\n        # with paddle.static.amp.fp16_guard():\n        # hcg = env.get_hcg()\n        # mp_size = hcg.get_model_parallel_world_size()\n\n        # if mp_size > 1:\n        #     mask = (masked_lm_labels == -1)\n        #     masked_lm_labels[mask] = 0\n        #     masked_lm_loss = self.parallel_loss_func(\n        #         prediction_scores, masked_lm_labels)\n        #     masked_lm_loss[mask] = 0.\n        # else:\n        # masked_lm_loss = self.loss_func(prediction_scores,\n        #                                 masked_lm_labels,\n        #                                 ignore_index=-1)\n        masked_lm_loss = F.cross_entropy(\n            prediction_scores,\n            masked_lm_labels,\n            ignore_index=-1,\n            reduction='none')\n\n        if not self.with_nsp_loss:\n            return paddle.mean(masked_lm_loss)\n\n        next_sentence_loss = F.cross_entropy(\n            seq_relationship_score, next_sentence_labels, reduction='none')\n        return paddle.mean(masked_lm_loss), paddle.mean(next_sentence_loss)\n\n\n# these Layers is just for PipelineParallel\n\n\nclass EmbeddingsPipe(ErnieEmbeddings):\n    @property\n    def embedding_weight(self):\n        return self.word_embeddings.weight\n\n    def forward(self, tensors):\n        input_ids, token_type_ids, attention_mask = tensors\n\n        past_key_values_length = None\n\n        if attention_mask is None:\n            attention_mask = paddle.unsqueeze(\n                (input_ids == self.pad_token_id\n                 ).astype(self.pooler.dense.weight.dtype) * -1e4,\n                axis=[1, 2])\n            if past_key_values is not None:\n                batch_size = 
past_key_values[0][0].shape[0]\n                past_mask = paddle.zeros(\n                    [batch_size, 1, 1, past_key_values_length],\n                    dtype=attention_mask.dtype)\n                attention_mask = paddle.concat(\n                    [past_mask, attention_mask], axis=-1)\n\n        # For 2D attention_mask from tokenizer\n        elif attention_mask.ndim == 2:\n            attention_mask = paddle.unsqueeze(\n                attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())\n            attention_mask = (1.0 - attention_mask) * -1e4\n        attention_mask.stop_gradient = True\n\n        embeddings = super().forward(\n            input_ids=input_ids,\n            position_ids=None,\n            token_type_ids=token_type_ids,\n            task_type_ids=None,\n            inputs_embeds=None,\n            past_key_values_length=past_key_values_length)\n\n        return attention_mask, embeddings\n\n\nclass TransformerEncoderLayerPipe(TransformerEncoderLayer):\n    def forward(self, tensors):\n        attention_mask, inputs = tensors\n        outputs = super().forward(src=inputs, src_mask=attention_mask)\n        return attention_mask, outputs\n\n\nclass LayerNormPipe(nn.LayerNorm):\n    def forward(self, tensors):\n        _, inputs = tensors\n        output = super().forward(inputs)\n        return output\n\n\nclass ErniePoolerPipe(ErniePooler):\n    def forward(self, args):\n        sequence_output = args\n        pooled_output = super().forward(sequence_output)\n        return sequence_output, pooled_output\n\n\nclass ErniePretrainingCriterionPipe(ErniePretrainingCriterionHybrid):\n    def __init__(self, *heads_args, **heads_kargs):\n        super(ErniePretrainingCriterionPipe, self).__init__()\n        self.heads = ErniePretrainingHeads(*heads_args, **heads_kargs)\n\n    def forward(self, outputs, data):\n        sequence_output, pooled_output = outputs\n        masked_lm_positions, masked_lm_labels, next_sentence_labels = 
data\n\n        prediction_scores, seq_relationship_score = self.heads(\n            sequence_output, pooled_output, masked_lm_positions)\n\n        lm_loss, sop_loss = super().forward(\n            prediction_scores=prediction_scores,\n            seq_relationship_score=seq_relationship_score,\n            masked_lm_labels=masked_lm_labels,\n            next_sentence_labels=next_sentence_labels)\n\n        return lm_loss + sop_loss\n\n\nclass ErnieForPretrainingPipe(PipelineLayer):\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 num_hidden_layers=12,\n                 num_attention_heads=12,\n                 intermediate_size=3072,\n                 hidden_act=\"gelu\",\n                 hidden_dropout_prob=0.1,\n                 attention_probs_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=2,\n                 initializer_range=0.02,\n                 pad_token_id=0,\n                 task_type_vocab_size=3,\n                 task_id=0,\n                 use_task_id=False,\n                 use_recompute=False,\n                 num_partitions=1):\n\n        self.descs = []\n        self.descs.append(\n            LayerDesc(\n                EmbeddingsPipe,\n                vocab_size=vocab_size,\n                hidden_size=hidden_size,\n                hidden_dropout_prob=hidden_dropout_prob,\n                max_position_embeddings=max_position_embeddings,\n                type_vocab_size=type_vocab_size,\n                pad_token_id=pad_token_id,\n                weight_attr=None,\n                task_type_vocab_size=task_type_vocab_size,\n                task_id=task_id,\n                use_task_id=use_task_id))\n\n        for _ in range(num_hidden_layers):\n            self.descs.append(\n                LayerDesc(\n                    TransformerEncoderLayerPipe,\n                    d_model=hidden_size,\n                    
nhead=num_attention_heads,\n                    dim_feedforward=intermediate_size,\n                    dropout=hidden_dropout_prob,\n                    activation=hidden_act,\n                    attn_dropout=attention_probs_dropout_prob,\n                    act_dropout=hidden_dropout_prob,\n                    normalize_before=False,\n                    weight_attr=None,\n                    bias_attr=None,\n                    num_partitions=num_partitions))\n\n        self.descs.append(\n            LayerDesc(\n                LayerNormPipe, normalized_shape=hidden_size))\n        self.descs.append(LayerDesc(ErniePoolerPipe, hidden_size=hidden_size))\n\n        loss_fun = ErniePretrainingCriterionPipe(\n            hidden_size=hidden_size,\n            vocab_size=vocab_size,\n            activation=hidden_act,\n            embedding_weights=None,\n            weight_attr=paddle.ParamAttr(\n                initializer=nn.initializer.TruncatedNormal(\n                    mean=0.0, std=initializer_range)))\n\n        super().__init__(\n            layers=self.descs,\n            loss_fn=loss_fun,\n            topology=env.get_hcg().topology(),\n            seg_method=\"layer:TransformerEncoderLayer\",\n            recompute_interval=1 if use_recompute else 0,\n            recompute_ctx={\n                \"mp_group\": env.get_hcg().get_model_parallel_group(),\n                \"offload\": False,\n                \"partition\": False\n            })\n\n\nclass ErnieForSequenceClassificationHybrid(nn.Layer):\n    \"\"\"\n    Ernie Model with a linear layer on top of the output layer,\n    designed for sequence classification/regression tasks like GLUE tasks.\n\n    Args:\n        ernie (:class:`ErnieModel`):\n            An instance of ErnieModel.\n        num_classes (int, optional):\n            The number of classes. 
Defaults to `2`.\n        dropout (float, optional):\n            The dropout probability for output of ERNIE.\n            If None, use the same value as `hidden_dropout_prob` of `ErnieModel`\n            instance `ernie`. Defaults to None.\n    \"\"\"\n\n    def __init__(self, ernie, num_classes=2, dropout=None):\n        super(ErnieForSequenceClassificationHybrid, self).__init__()\n        self.num_classes = num_classes\n        self.ernie = ernie  # allow ernie to be config\n        self.dropout = nn.Dropout(dropout if dropout is not None else\n                                  self.ernie.hidden_dropout_prob)\n        self.classifier = nn.Linear(self.ernie.hidden_size, num_classes)\n        self.apply(self.init_weights)\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                attention_mask=None,\n                labels=None,\n                output_hidden_states=False,\n                output_attentions=False,\n                return_dict=False):\n        r\"\"\"\n        The ErnieForSequenceClassification forward method, overrides the __call__() special method.\n\n        Args:\n            input_ids (Tensor):\n                See :class:`ErnieModelHybrid`.\n            token_type_ids (Tensor, optional):\n                See :class:`ErnieModelHybrid`.\n            position_ids(Tensor, optional):\n                See :class:`ErnieModelHybrid`.\n            attention_mask (Tensor, optional):\n                See :class:`ErnieModelHybrid`.\n            labels (Tensor of shape `(batch_size,)`, optional):\n                Labels for computing the sequence classification/regression loss.\n                Indices should be in `[0, ..., num_classes - 1]`. 
If `num_classes == 1`\n                a regression loss is computed (Mean-Square loss), If `num_classes > 1`\n                a classification loss is computed (Cross-Entropy).\n            output_hidden_states (bool, optional):\n                Whether to return the hidden states of all layers.\n                Defaults to `False`.\n            output_attentions (bool, optional):\n                Whether to return the attentions tensors of all attention layers.\n                Defaults to `False`.\n            return_dict (bool, optional):\n                Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` object. If\n                `False`, the output will be a tuple of tensors. Defaults to `False`.\n\n        Returns:\n            An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` if `return_dict=True`.\n            Otherwise it returns a tuple of tensors corresponding to ordered and\n            not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput`.\n\n        \"\"\"\n\n        outputs = self.ernie(\n            input_ids,\n            token_type_ids=token_type_ids,\n            position_ids=position_ids,\n            attention_mask=attention_mask,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict)\n        pooled_output = outputs[1]\n\n        pooled_output = self.dropout(pooled_output)\n        logits = self.classifier(pooled_output)\n\n        loss = None\n        if labels is not None:\n            if self.num_classes == 1:\n                loss_fct = paddle.nn.MSELoss()\n                loss = loss_fct(logits, labels)\n            elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32:\n                loss_fct = paddle.nn.CrossEntropyLoss()\n 
               loss = loss_fct(\n                    logits.reshape((-1, self.num_classes)),\n                    labels.reshape((-1, )))\n            else:\n                loss_fct = paddle.nn.BCEWithLogitsLoss()\n                loss = loss_fct(logits, labels)\n\n        if not return_dict:\n            output = (logits, ) + outputs[2:]\n            return ((loss, ) + output) if loss is not None else (\n                output[0] if len(output) == 1 else output)\n\n        return SequenceClassifierOutput(\n            loss=loss,\n            logits=logits,\n            hidden_states=outputs.hidden_states,\n            attentions=outputs.attentions, )\n\n    def init_weights(self, layer):\n        \"\"\" Initialization hook \"\"\"\n        if isinstance(layer, (nn.Linear, nn.Embedding)):\n            if isinstance(layer.weight, paddle.Tensor):\n                layer.weight.set_value(\n                    paddle.tensor.normal(\n                        mean=0.0,\n                        std=self.initializer_range\n                        if hasattr(self, \"initializer_range\") else\n                        self.ernie.initializer_range,\n                        shape=layer.weight.shape))\n        elif isinstance(layer, nn.LayerNorm):\n            layer._epsilon = 1e-12\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/dygraph/single_model.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport io\nimport copy\nimport logging\nimport json\n\nimport paddle\nimport paddle.nn as nn\nfrom paddle.nn import functional as F\nfrom dataclasses import dataclass, field\n\nfrom ..layers.model_outputs import (\n    BaseModelOutputWithPoolingAndCrossAttentions,\n    ModelOutput,\n    ErnieForPreTrainingOutput,\n    SequenceClassifierOutput, )\nfrom ..layers.transformer import TransformerEncoderLayer, TransformerEncoder\n\n\nclass ErnieEmbeddings(nn.Layer):\n    r\"\"\"\n    Include embeddings from word, position and token_type embeddings.\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 hidden_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=2,\n                 pad_token_id=0,\n                 weight_attr=None,\n                 task_type_vocab_size=3,\n                 task_id=0,\n                 use_task_id=False):\n        super(ErnieEmbeddings, self).__init__()\n\n        self.word_embeddings = nn.Embedding(\n            vocab_size,\n            hidden_size,\n            padding_idx=pad_token_id,\n            weight_attr=weight_attr)\n        self.position_embeddings = nn.Embedding(\n            max_position_embeddings, hidden_size, weight_attr=weight_attr)\n        self.type_vocab_size = 
type_vocab_size\n        if self.type_vocab_size > 0:\n            self.token_type_embeddings = nn.Embedding(\n                type_vocab_size, hidden_size, weight_attr=weight_attr)\n        self.use_task_id = use_task_id\n        self.task_id = task_id\n        if self.use_task_id:\n            self.task_type_embeddings = nn.Embedding(\n                task_type_vocab_size, hidden_size, weight_attr=weight_attr)\n        self.layer_norm = nn.LayerNorm(hidden_size)\n        self.dropout = nn.Dropout(hidden_dropout_prob)\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                task_type_ids=None,\n                inputs_embeds=None,\n                past_key_values_length=None):\n        if input_ids is not None:\n            input_shape = paddle.shape(input_ids)\n            input_embeddings = self.word_embeddings(input_ids)\n        else:\n            input_shape = paddle.shape(inputs_embeds)[:-1]\n            input_embeddings = inputs_embeds\n\n        if position_ids is None:\n            # maybe need use shape op to unify static graph and dynamic graph\n            #seq_length = input_ids.shape[1]\n            ones = paddle.ones(input_shape, dtype=\"int64\")\n            seq_length = paddle.cumsum(ones, axis=1)\n            position_ids = seq_length - ones\n            if past_key_values_length is not None:\n                position_ids += past_key_values_length\n            position_ids.stop_gradient = True\n\n        position_embeddings = self.position_embeddings(position_ids)\n        embeddings = input_embeddings + position_embeddings\n\n        if self.type_vocab_size > 0:\n            if token_type_ids is None:\n                token_type_ids = paddle.zeros(input_shape, dtype=\"int64\")\n            token_type_embeddings = self.token_type_embeddings(token_type_ids)\n            embeddings = embeddings + token_type_embeddings\n\n        if self.use_task_id:\n            
if task_type_ids is None:\n                task_type_ids = paddle.ones(\n                    input_shape, dtype=\"int64\") * self.task_id\n            task_type_embeddings = self.task_type_embeddings(task_type_ids)\n            embeddings = embeddings + task_type_embeddings\n        embeddings = self.layer_norm(embeddings)\n        embeddings = self.dropout(embeddings)\n        return embeddings\n\n\nclass ErniePooler(nn.Layer):\n    def __init__(self, hidden_size, weight_attr=None):\n        super(ErniePooler, self).__init__()\n        self.dense = nn.Linear(\n            hidden_size, hidden_size, weight_attr=weight_attr)\n        self.activation = nn.Tanh()\n\n    def forward(self, hidden_states):\n        # We \"pool\" the model by simply taking the hidden state corresponding\n        # to the first token.\n        first_token_tensor = hidden_states[:, 0]\n        pooled_output = self.dense(first_token_tensor)\n        pooled_output = self.activation(pooled_output)\n        return pooled_output\n\n\nclass ErnieModel(nn.Layer):\n    r\"\"\"\n    The bare ERNIE Model transformer outputting raw hidden-states.\n\n    This model is a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation\n    /docs/en/api/paddle/fluid/dygraph/layers/Layer_en.html>`__ subclass. Use it as a regular Paddle Layer\n    and refer to the Paddle documentation for all matter related to general usage and behavior.\n\n    Args:\n        vocab_size (int):\n            Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix.\n            Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`.\n        hidden_size (int, optional):\n            Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`.\n        num_hidden_layers (int, optional):\n            Number of hidden layers in the Transformer encoder. 
Defaults to `12`.\n        num_attention_heads (int, optional):\n            Number of attention heads for each attention layer in the Transformer encoder.\n            Defaults to `12`.\n        intermediate_size (int, optional):\n            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors\n            to ff layers are firstly projected from `hidden_size` to `intermediate_size`,\n            and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`.\n            Defaults to `3072`.\n        hidden_act (str, optional):\n            The non-linear activation function in the feed-forward layer.\n            ``\"gelu\"``, ``\"relu\"`` and any other paddle supported activation functions\n            are supported. Defaults to `\"gelu\"`.\n        hidden_dropout_prob (float, optional):\n            The dropout probability for all fully connected layers in the embeddings and encoder.\n            Defaults to `0.1`.\n        attention_probs_dropout_prob (float, optional):\n            The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target.\n            Defaults to `0.1`.\n        max_position_embeddings (int, optional):\n            The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input\n            sequence. Defaults to `512`.\n        type_vocab_size (int, optional):\n            The vocabulary size of the `token_type_ids`.\n            Defaults to `2`.\n        initializer_range (float, optional):\n            The standard deviation of the normal initializer for initializing all weight matrices.\n            Defaults to `0.02`.\n            \n            .. 
note::\n                A normal_initializer initializes weight matrices as normal distributions.\n                See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`.\n\n        pad_token_id(int, optional):\n            The index of padding token in the token vocabulary.\n            Defaults to `0`.\n\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 num_hidden_layers=12,\n                 num_attention_heads=12,\n                 intermediate_size=3072,\n                 hidden_act=\"gelu\",\n                 hidden_dropout_prob=0.1,\n                 attention_probs_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=2,\n                 initializer_range=0.02,\n                 pad_token_id=0,\n                 task_type_vocab_size=3,\n                 task_id=0,\n                 use_task_id=False,\n                 use_recompute=False):\n        super(ErnieModel, self).__init__()\n        self.pad_token_id = pad_token_id\n        self.initializer_range = initializer_range\n\n        self.hidden_size = hidden_size\n        self.vocab_size = vocab_size\n        self.hidden_act = hidden_act\n        self.hidden_dropout_prob = hidden_dropout_prob\n\n        weight_attr = paddle.ParamAttr(\n            initializer=nn.initializer.TruncatedNormal(\n                mean=0.0, std=self.initializer_range))\n        self.embeddings = ErnieEmbeddings(\n            vocab_size, hidden_size, hidden_dropout_prob,\n            max_position_embeddings, type_vocab_size, pad_token_id,\n            weight_attr, task_type_vocab_size, task_id, use_task_id)\n\n        encoder_layer = TransformerEncoderLayer(\n            hidden_size,\n            num_attention_heads,\n            intermediate_size,\n            dropout=hidden_dropout_prob,\n            activation=hidden_act,\n            
attn_dropout=attention_probs_dropout_prob,\n            act_dropout=0,\n            weight_attr=weight_attr,\n            normalize_before=False)\n        self.encoder = TransformerEncoder(\n            encoder_layer, num_hidden_layers, enable_recompute=use_recompute)\n\n        self.pooler = ErniePooler(hidden_size, weight_attr)\n        self.apply(self.init_weights)\n\n    def get_input_embeddings(self):\n        return self.embeddings.word_embeddings\n\n    def set_input_embeddings(self, value):\n        self.embeddings.word_embeddings = value\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                attention_mask=None,\n                task_type_ids=None,\n                past_key_values=None,\n                inputs_embeds=None,\n                use_cache=None,\n                output_hidden_states=False,\n                output_attentions=False,\n                return_dict=False):\n        r\"\"\"\n        Args:\n            input_ids (Tensor):\n                Indices of input sequence tokens in the vocabulary. 
They are\n                numerical representations of tokens that build the input sequence.\n                It's data type should be `int64` and has a shape of [batch_size, sequence_length].\n            token_type_ids (Tensor, optional):\n                Segment token indices to indicate different portions of the inputs.\n                Selected in the range ``[0, type_vocab_size - 1]``.\n                If `type_vocab_size` is 2, which means the inputs have two portions.\n                Indices can either be 0 or 1:\n\n                - 0 corresponds to a *sentence A* token,\n                - 1 corresponds to a *sentence B* token.\n\n                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].\n                Defaults to `None`, which means we don't add segment embeddings.\n            position_ids (Tensor, optional):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,\n                max_position_embeddings - 1]``.\n                Shape as `[batch_size, num_tokens]` and dtype as int64. 
Defaults to `None`.\n            attention_mask (Tensor, optional):\n                Mask used in multi-head attention to avoid performing attention on to some unwanted positions,\n                usually the paddings or the subsequent positions.\n                Its data type can be int, float and bool.\n                When the data type is bool, the `masked` tokens have `False` values and the others have `True` values.\n                When the data type is int, the `masked` tokens have `0` values and the others have `1` values.\n                When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values.\n                It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`.\n                For example, its shape can be  [batch_size, sequence_length], [batch_size, sequence_length, sequence_length],\n                [batch_size, num_attention_heads, sequence_length, sequence_length].\n                We use whole-word-mask in ERNIE, so the whole word will have the same value. 
For example, \"使用\" as a word,\n                \"使\" and \"用\" will have the same value.\n                Defaults to `None`, which means nothing needed to be prevented attention to.\n             inputs_embeds (Tensor, optional):\n                If you want to control how to convert `inputs_ids` indices into associated vectors, you can\n                pass an embedded representation directly instead of passing `inputs_ids`.\n            past_key_values (tuple(tuple(Tensor)), optional):\n                The length of tuple equals to the number of layers, and each inner\n                tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`)\n                which contains precomputed key and value hidden states of the attention blocks.\n                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that\n                don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all\n                `input_ids` of shape `(batch_size, sequence_length)`.\n            use_cache (`bool`, optional):\n                If set to `True`, `past_key_values` key value states are returned.\n                Defaults to `None`.\n            output_hidden_states (bool, optional):\n                Whether to return the hidden states of all layers.\n                Defaults to `False`.\n            output_attentions (bool, optional):\n                Whether to return the attentions tensors of all attention layers.\n                Defaults to `False`.\n            return_dict (bool, optional):\n                Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ModelOutput` object. \n                If `False`, the output will be a tuple of tensors. 
Defaults to `False`.\n\n        Returns:\n            An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if\n            `return_dict=True`. Otherwise it returns a tuple of tensors corresponding\n            to ordered and not None (depending on the input arguments) fields of\n            :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`.\n\n        \"\"\"\n        if input_ids is not None and inputs_embeds is not None:\n            raise ValueError(\n                \"You cannot specify both input_ids and inputs_embeds at the same time.\"\n            )\n        elif input_ids is not None:\n            input_shape = paddle.shape(input_ids)\n        elif inputs_embeds is not None:\n            input_shape = paddle.shape(inputs_embeds)[:-1]\n        else:\n            raise ValueError(\n                \"You have to specify either input_ids or inputs_embeds\")\n\n        past_key_values_length = None\n        if past_key_values is not None:\n            past_key_values_length = past_key_values[0][0].shape[2]\n\n        if attention_mask is None:\n            attention_mask = paddle.unsqueeze(\n                (input_ids == self.pad_token_id\n                 ).astype(self.pooler.dense.weight.dtype) * -1e4,\n                axis=[1, 2])\n            if past_key_values is not None:\n                batch_size = past_key_values[0][0].shape[0]\n                past_mask = paddle.zeros(\n                    [batch_size, 1, 1, past_key_values_length],\n                    dtype=attention_mask.dtype)\n                attention_mask = paddle.concat(\n                    [past_mask, attention_mask], axis=-1)\n        # For 2D attention_mask from tokenizer\n        elif attention_mask.ndim == 2:\n            attention_mask = paddle.unsqueeze(\n                attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())\n            
attention_mask = (1.0 - attention_mask) * -1e4\n        attention_mask.stop_gradient = True\n\n        embedding_output = self.embeddings(\n            input_ids=input_ids,\n            position_ids=position_ids,\n            token_type_ids=token_type_ids,\n            task_type_ids=task_type_ids,\n            inputs_embeds=inputs_embeds,\n            past_key_values_length=past_key_values_length)\n\n        self.encoder._use_cache = use_cache  # To be consistent with HF\n        encoder_outputs = self.encoder(\n            embedding_output,\n            src_mask=attention_mask,\n            cache=past_key_values,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict)\n        if isinstance(encoder_outputs, type(embedding_output)):\n            sequence_output = encoder_outputs\n            pooled_output = self.pooler(sequence_output)\n            return (sequence_output, pooled_output)\n        else:\n            sequence_output = encoder_outputs[0]\n            pooled_output = self.pooler(sequence_output)\n            if not return_dict:\n                return (sequence_output, pooled_output) + encoder_outputs[1:]\n            return BaseModelOutputWithPoolingAndCrossAttentions(\n                last_hidden_state=sequence_output,\n                pooler_output=pooled_output,\n                past_key_values=encoder_outputs.past_key_values,\n                hidden_states=encoder_outputs.hidden_states,\n                attentions=encoder_outputs.attentions)\n\n    def init_weights(self, layer):\n        \"\"\" Initialization hook \"\"\"\n        if isinstance(layer, (nn.Linear, nn.Embedding)):\n            # only support dygraph, use truncated_normal and make it inplace\n            # and configurable later\n            if isinstance(layer.weight, paddle.Tensor):\n                layer.weight.set_value(\n                    paddle.tensor.normal(\n                        
mean=0.0,\n                        std=self.initializer_range\n                        if hasattr(self, \"initializer_range\") else\n                        self.ernie.initializer_range,\n                        shape=layer.weight.shape))\n        elif isinstance(layer, nn.LayerNorm):\n            layer._epsilon = 1e-12\n\n\nclass ErnieLMPredictionHead(nn.Layer):\n    r\"\"\"\n    Ernie Model with a `language modeling` head on top.\n    \"\"\"\n\n    def __init__(\n            self,\n            hidden_size,\n            vocab_size,\n            activation,\n            embedding_weights=None,\n            weight_attr=None, ):\n        super(ErnieLMPredictionHead, self).__init__()\n\n        self.transform = nn.Linear(\n            hidden_size, hidden_size, weight_attr=weight_attr)\n        self.activation = getattr(nn.functional, activation)\n        self.layer_norm = nn.LayerNorm(hidden_size)\n        self.decoder_weight = self.create_parameter(\n            shape=[vocab_size, hidden_size],\n            dtype=self.transform.weight.dtype,\n            attr=weight_attr,\n            is_bias=False) if embedding_weights is None else embedding_weights\n        self.decoder_bias = self.create_parameter(\n            shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True)\n\n    def forward(self, hidden_states, masked_positions=None):\n        if masked_positions is not None:\n            hidden_states = paddle.reshape(hidden_states,\n                                           [-1, hidden_states.shape[-1]])\n            hidden_states = paddle.tensor.gather(hidden_states,\n                                                 masked_positions)\n        # gather masked tokens might be more quick\n        hidden_states = self.transform(hidden_states)\n        hidden_states = self.activation(hidden_states)\n        hidden_states = self.layer_norm(hidden_states)\n        hidden_states = paddle.matmul(\n            hidden_states, self.decoder_weight,\n            
transpose_y=True) + self.decoder_bias\n        return hidden_states\n\n\nclass ErniePretrainingHeads(nn.Layer):\n    def __init__(\n            self,\n            hidden_size,\n            vocab_size,\n            activation,\n            embedding_weights=None,\n            weight_attr=None, ):\n        super(ErniePretrainingHeads, self).__init__()\n        self.predictions = ErnieLMPredictionHead(hidden_size, vocab_size,\n                                                 activation, embedding_weights,\n                                                 weight_attr)\n        self.seq_relationship = nn.Linear(\n            hidden_size, 2, weight_attr=weight_attr)\n\n    def forward(self, sequence_output, pooled_output, masked_positions=None):\n        prediction_scores = self.predictions(sequence_output, masked_positions)\n        seq_relationship_score = self.seq_relationship(pooled_output)\n        return prediction_scores, seq_relationship_score\n\n\nclass ErnieForPretraining(nn.Layer):\n    r\"\"\"\n    Ernie Model with a `masked language modeling` head and a `sentence order prediction` head\n    on top.\n\n    \"\"\"\n\n    def __init__(self, ernie):\n        super(ErnieForPretraining, self).__init__()\n        self.ernie = ernie\n        weight_attr = paddle.ParamAttr(\n            initializer=nn.initializer.TruncatedNormal(\n                mean=0.0, std=self.ernie.initializer_range))\n        self.cls = ErniePretrainingHeads(\n            self.ernie.hidden_size,\n            self.ernie.vocab_size,\n            self.ernie.hidden_act,\n            embedding_weights=self.ernie.embeddings.word_embeddings.weight,\n            weight_attr=weight_attr, )\n\n        self.apply(self.init_weights)\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                attention_mask=None,\n                masked_positions=None,\n                inputs_embeds=None,\n                labels=None,\n     
           next_sentence_label=None,\n                output_hidden_states=False,\n                output_attentions=False,\n                return_dict=False):\n        r\"\"\"\n        Args:\n            input_ids (Tensor):\n                See :class:`ErnieModel`.\n            token_type_ids (Tensor, optional):\n                See :class:`ErnieModel`.\n            position_ids (Tensor, optional):\n                See :class:`ErnieModel`.\n            attention_mask (Tensor, optional):\n                See :class:`ErnieModel`.\n            inputs_embeds(Tensor, optional):\n                See :class:`ErnieModel`.\n            labels (Tensor of shape `(batch_size, sequence_length)`, optional):\n                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,\n                vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),\n                the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`.\n            next_sentence_label (Tensor of shape `(batch_size,)`, optional):\n                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence\n                pair (see `input_ids` docstring) Indices should be in `[0, 1]`:\n\n                - 0 indicates sequence B is a continuation of sequence A,\n                - 1 indicates sequence B is a random sequence.\n            output_hidden_states (bool, optional):\n                Whether to return the hidden states of all layers.\n                Defaults to `False`.\n            output_attentions (bool, optional):\n                Whether to return the attentions tensors of all attention layers.\n                Defaults to `False`.\n            return_dict (bool, optional):\n                Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` object. 
If\n                `False`, the output will be a tuple of tensors. Defaults to `False`.\n\n        Returns:\n            An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` if `return_dict=True`.\n            Otherwise it returns a tuple of tensors corresponding to ordered and\n            not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput`.\n\n        \"\"\"\n\n        # with paddle.static.amp.fp16_guard():\n        outputs = self.ernie(\n            input_ids,\n            token_type_ids=token_type_ids,\n            position_ids=position_ids,\n            attention_mask=attention_mask,\n            inputs_embeds=inputs_embeds,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict)\n        sequence_output, pooled_output = outputs[:2]\n        prediction_scores, seq_relationship_score = self.cls(\n            sequence_output, pooled_output, masked_positions)\n\n        total_loss = None\n        if labels is not None and next_sentence_label is not None:\n            loss_fct = paddle.nn.CrossEntropyLoss()\n            masked_lm_loss = loss_fct(\n                prediction_scores.reshape(\n                    (-1, paddle.shape(prediction_scores)[-1])),\n                labels.reshape((-1, )))\n            next_sentence_loss = loss_fct(\n                seq_relationship_score.reshape((-1, 2)),\n                next_sentence_label.reshape((-1, )))\n            total_loss = masked_lm_loss + next_sentence_loss\n        if not return_dict:\n            output = (prediction_scores, seq_relationship_score) + outputs[2:]\n            return (\n                (total_loss, ) + output) if total_loss is not None else output\n\n        return ErnieForPreTrainingOutput(\n            loss=total_loss,\n            
prediction_logits=prediction_scores,\n            seq_relationship_logits=seq_relationship_score,\n            hidden_states=outputs.hidden_states,\n            attentions=outputs.attentions, )\n\n    def init_weights(self, layer):\n        \"\"\" Initialization hook \"\"\"\n        if isinstance(layer, (nn.Linear, nn.Embedding)):\n            # only support dygraph, use truncated_normal and make it inplace\n            # and configurable later\n            if isinstance(layer.weight, paddle.Tensor):\n                layer.weight.set_value(\n                    paddle.tensor.normal(\n                        mean=0.0,\n                        std=self.initializer_range\n                        if hasattr(self, \"initializer_range\") else\n                        self.ernie.initializer_range,\n                        shape=layer.weight.shape))\n        elif isinstance(layer, nn.LayerNorm):\n            layer._epsilon = 1e-12\n\n\nclass ErniePretrainingCriterion(paddle.nn.Layer):\n    r\"\"\"\n    The loss output of Ernie Model during the pretraining:\n    a `masked language modeling` head and a `next sentence prediction (classification)` head.\n\n    \"\"\"\n\n    def __init__(self, with_nsp_loss=True):\n        super(ErniePretrainingCriterion, self).__init__()\n        self.with_nsp_loss = with_nsp_loss\n        #self.loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1)\n\n    def forward(self,\n                prediction_scores,\n                seq_relationship_score,\n                masked_lm_labels,\n                next_sentence_labels=None):\n        \"\"\"\n        Args:\n            prediction_scores(Tensor):\n                The scores of masked token prediction. 
Its data type should be float32.\n                If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size].\n                Otherwise, its shape is [batch_size, mask_token_num, vocab_size]\n            seq_relationship_score(Tensor):\n                The scores of next sentence prediction. Its data type should be float32 and\n                its shape is [batch_size, 2]\n            masked_lm_labels(Tensor):\n                The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`.\n                Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1].\n                Otherwise, its shape is [batch_size, mask_token_num, 1]\n            next_sentence_labels(Tensor):\n                The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels`\n                is equal to `seq_relation_labels`. Its data type should be int64 and\n                its shape is [batch_size, 1]\n\n        Returns:\n            Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`.\n            Its data type should be float32 and its shape is [1].\n\n        \"\"\"\n\n        with paddle.static.amp.fp16_guard():\n            masked_lm_loss = F.cross_entropy(\n                prediction_scores,\n                masked_lm_labels,\n                ignore_index=-1,\n                reduction='none')\n\n            if not self.with_nsp_loss:\n                return paddle.mean(masked_lm_loss)\n\n            next_sentence_loss = F.cross_entropy(\n                seq_relationship_score, next_sentence_labels, reduction='none')\n            return paddle.mean(masked_lm_loss), paddle.mean(next_sentence_loss)\n\n\nclass ErnieForSequenceClassification(nn.Layer):\n    \"\"\"\n    Ernie Model with a linear layer on top of the output layer,\n    designed for sequence classification/regression tasks 
like GLUE tasks.\n\n    Args:\n        ernie (:class:`ErnieModel`):\n            An instance of ErnieModel.\n        num_classes (int, optional):\n            The number of classes. Defaults to `2`.\n        dropout (float, optional):\n            The dropout probability for output of ERNIE.\n            If None, use the same value as `hidden_dropout_prob` of `ErnieModel`\n            instance `ernie`. Defaults to None.\n    \"\"\"\n\n    def __init__(self, ernie, num_classes=2, dropout=None):\n        super(ErnieForSequenceClassification, self).__init__()\n        self.num_classes = num_classes\n        self.ernie = ernie  # allow ernie to be config\n        self.dropout = nn.Dropout(dropout if dropout is not None else\n                                  self.ernie.hidden_dropout_prob)\n        self.classifier = nn.Linear(self.ernie.hidden_size, num_classes)\n        self.apply(self.init_weights)\n\n    def forward(self,\n                input_ids,\n                token_type_ids=None,\n                position_ids=None,\n                attention_mask=None,\n                labels=None,\n                output_hidden_states=False,\n                output_attentions=False,\n                return_dict=False):\n        r\"\"\"\n        The ErnieForSequenceClassification forward method, overrides the __call__() special method.\n\n        Args:\n            input_ids (Tensor):\n                See :class:`ErnieModel`.\n            token_type_ids (Tensor, optional):\n                See :class:`ErnieModel`.\n            position_ids(Tensor, optional):\n                See :class:`ErnieModel`.\n            attention_mask (Tensor, optional):\n                See :class:`ErnieModel`.\n            labels (Tensor of shape `(batch_size,)`, optional):\n                Labels for computing the sequence classification/regression loss.\n                Indices should be in `[0, ..., num_classes - 1]`. 
If `num_classes == 1`\n                a regression loss is computed (Mean-Square loss), If `num_classes > 1`\n                a classification loss is computed (Cross-Entropy).\n            output_hidden_states (bool, optional):\n                Whether to return the hidden states of all layers.\n                Defaults to `False`.\n            output_attentions (bool, optional):\n                Whether to return the attentions tensors of all attention layers.\n                Defaults to `False`.\n            return_dict (bool, optional):\n                Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` object. If\n                `False`, the output will be a tuple of tensors. Defaults to `False`.\n\n        Returns:\n            An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` if `return_dict=True`.\n            Otherwise it returns a tuple of tensors corresponding to ordered and\n            not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput`.\n\n        \"\"\"\n\n        outputs = self.ernie(\n            input_ids,\n            token_type_ids=token_type_ids,\n            position_ids=position_ids,\n            attention_mask=attention_mask,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict)\n        pooled_output = outputs[1]\n\n        pooled_output = self.dropout(pooled_output)\n        logits = self.classifier(pooled_output)\n\n        loss = None\n        if labels is not None:\n            if self.num_classes == 1:\n                loss_fct = paddle.nn.MSELoss()\n                loss = loss_fct(logits, labels)\n            elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32:\n                loss_fct = paddle.nn.CrossEntropyLoss()\n 
               loss = loss_fct(\n                    logits.reshape((-1, self.num_classes)),\n                    labels.reshape((-1, )))\n            else:\n                loss_fct = paddle.nn.BCEWithLogitsLoss()\n                loss = loss_fct(logits, labels)\n\n        if not return_dict:\n            output = (logits, ) + outputs[2:]\n            return ((loss, ) + output) if loss is not None else (\n                output[0] if len(output) == 1 else output)\n\n        return SequenceClassifierOutput(\n            loss=loss,\n            logits=logits,\n            hidden_states=outputs.hidden_states,\n            attentions=outputs.attentions, )\n\n    def init_weights(self, layer):\n        \"\"\" Initialization hook \"\"\"\n        if isinstance(layer, (nn.Linear, nn.Embedding)):\n            if isinstance(layer.weight, paddle.Tensor):\n                layer.weight.set_value(\n                    paddle.tensor.normal(\n                        mean=0.0,\n                        std=self.initializer_range\n                        if hasattr(self, \"initializer_range\") else\n                        self.ernie.initializer_range,\n                        shape=layer.weight.shape))\n        elif isinstance(layer, nn.LayerNorm):\n            layer._epsilon = 1e-12\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/ernie_module.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport copy\nimport yaml\nimport codecs\nfrom collections.abc import Mapping\n\nimport paddle\nfrom paddle.static import InputSpec\nimport paddle.nn as nn\n\nfrom ppfleetx.core.module.basic_module import BasicModule\nimport ppfleetx.models.language_model.gpt as gpt\nfrom ppfleetx.utils.log import logger\n\nfrom .dygraph.single_model import (\n    ErnieModel,\n    ErnieForPretraining,\n    ErniePretrainingCriterion,\n    ErnieForSequenceClassification, )\nfrom .dygraph.hybrid_model import (ErnieModelHybrid, ErnieForPretrainingHybrid,\n                                   ErniePretrainingCriterionHybrid,\n                                   ErnieForPretrainingPipe,\n                                   ErnieForSequenceClassificationHybrid)\n\nfrom ppfleetx.models.language_model.utils import process_configs\n\nimport numpy as np\n\n\ndef process_data_configs(config):\n    \"\"\"\n    process data configs for hybrid parallel\n    \"\"\"\n    cfg_global = config['Global']\n    cfg_data = config['Data']\n\n    mode_to_num_samples = {\n        \"Train\":\n        cfg_global['global_batch_size'] * config['Engine']['max_steps'],\n        \"Eval\": cfg_global['global_batch_size'] *\n        (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) *\n        config['Engine']['eval_iters'],\n        \"Test\":\n        
cfg_global['global_batch_size'] * config['Engine']['test_iters'],\n    }\n\n    for mode in (\"Train\", \"Eval\", \"Test\"):\n        if mode in cfg_data.keys():\n            cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[\n                mode]\n            cfg_data[mode]['dataset']['mode'] = mode\n            cfg_data[mode]['dataset']['seed'] = cfg_global['seed']\n            cfg_data[mode]['sampler']['batch_size'] = cfg_global[\n                'local_batch_size']\n            cfg_data[mode]['dataset'].setdefault('binary_head',\n                                                 cfg_global['binary_head'])\n            cfg_data[mode]['loader']['collate_fn'].setdefault(\n                'micro_batch_size', cfg_global['micro_batch_size'])\n\n\ndef process_model_configs(config):\n    cfg_model = config['Model']\n    hidden_size = cfg_model['hidden_size']\n    cfg_model.setdefault(\"intermediate_size\", hidden_size * 4)\n\n\ndef process_finetune_configs(task, config):\n    cfg_data = config['Data']\n    cfg_dist = config['Distributed']\n    cfg_optim = config['Optimizer']\n    cfg_global = config['Global']\n    cfg_engine = config['Engine']\n\n    path = \"./ppfleetx/models/language_model/ernie/finetune_configs.yaml\"\n    with codecs.open(path, 'r', 'utf-8') as file:\n        dic = yaml.load(file, Loader=yaml.FullLoader)\n\n    dataset_type = cfg_data.Train.dataset.dataset_type\n    assert dataset_type in dic[task].keys(\n    ), \"{} is an invalid dataset type ! 
Only support the types of dataset shown in {}\".format(\n        dataset_type, path)\n\n    num_train_epochs = dic[task][dataset_type].get('num_train_epochs', None)\n    if num_train_epochs is not None:\n        cfg_engine['num_train_epochs'] = num_train_epochs\n\n    learning_rate = dic[task][dataset_type].get(\"learning_rate\", None)\n    if learning_rate is not None:\n        cfg_optim['lr']['max_lr'] = learning_rate\n\n    max_seq_length = dic[task][dataset_type].get(\"max_seq_length\", None)\n    if max_seq_length is not None:\n        for mode in (\"Train\", \"Eval\", \"Test\"):\n            if mode in cfg_data.keys():\n                cfg_data[mode]['dataset']['max_seq_len'] = max_seq_length\n\n    batch_size = dic[task][dataset_type].get(\"batch_size\", None)\n    if batch_size is not None:\n        assert batch_size % cfg_global['micro_batch_size'] == 0\n\n        cfg_global['local_batch_size'] = batch_size\n        cfg_global['global_batch_size'] = batch_size * cfg_dist[\n            'dp_degree'] * cfg_dist['pp_degree']\n\n\nclass ErnieModule(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        super(ErnieModule, self).__init__(configs)\n        self.nranks = paddle.distributed.get_world_size()\n        self.binary_head = self.configs['Global']['binary_head']\n\n        if self.nranks > 1:\n            self.criterion = ErniePretrainingCriterionHybrid(self.binary_head)\n        else:\n            self.criterion = ErniePretrainingCriterion(self.binary_head)\n\n    def get_model_size(self, l, h, v, s):\n        P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h))\n        logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 /\n                                                  1000.0))\n\n    def process_configs(self, configs):\n        process_data_configs(configs)\n        process_model_configs(configs)\n        return configs\n\n    def get_model(self):\n        
model_setting = copy.deepcopy(self.configs.Model)\n        model_setting.pop(\"module\")\n        model_setting.pop(\"name\")\n\n        l = model_setting['num_hidden_layers']\n        h = model_setting['hidden_size']\n        v = model_setting['vocab_size']\n        s = self.configs.Data.Train.dataset.max_seq_length\n        self.get_model_size(l, h, v, s)\n\n        if self.nranks > 1:\n            model_setting[\n                'num_partitions'] = self.configs.Distributed.mp_degree\n            # model = ErnieForPretrainingHybrid(ErnieModelHybrid(**model_setting))\n\n            if self.configs.Distributed.pp_degree == 1:\n                model = ErnieForPretrainingHybrid(\n                    ErnieModelHybrid(**model_setting))\n            else:\n                model = ErnieForPretrainingPipe(**model_setting)\n        else:\n            model = ErnieForPretraining(ErnieModel(**model_setting))\n\n        return model\n\n    def forward(self, tokens):\n        return self.model(tokens)\n\n    def pretreating_batch(self, batch):\n        if self.configs.Distributed.pp_degree > 1:\n            input_ids, segment_ids, input_mask, masked_lm_positions, \\\n                        masked_lm_labels, next_sentence_labels = batch\n\n            if not isinstance(masked_lm_positions, list):\n                masked_lm_positions = [masked_lm_positions]\n            if not isinstance(masked_lm_labels, list):\n                masked_lm_labels = [masked_lm_labels]\n\n            data = [\n                (input_ids, segment_ids, input_mask),\n                (masked_lm_positions, masked_lm_labels, next_sentence_labels)\n            ]\n            return data\n        else:\n            return batch\n\n    def training_step(self, batch):\n        input_ids, segment_ids, input_mask, masked_lm_positions, \\\n            masked_lm_labels, next_sentence_labels = batch\n\n        # Create the model for the ernie pretrain\n        if self.binary_head:\n            prediction_scores, 
seq_relationship_score = self.model(\n                input_ids=input_ids,\n                token_type_ids=segment_ids,\n                # position_ids=None,\n                attention_mask=input_mask,\n                masked_positions=masked_lm_positions)\n            lm_loss, sop_loss = self.criterion(\n                prediction_scores, seq_relationship_score, masked_lm_labels,\n                next_sentence_labels)\n            loss = lm_loss + sop_loss\n        else:\n            prediction_scores = self.model(\n                input_ids=input_ids,\n                token_type_ids=segment_ids,\n                # position_ids=None,\n                attention_mask=input_mask,\n                masked_positions=masked_lm_positions)\n\n            loss = self.criterion(prediction_scores, None, masked_lm_labels)\n\n        return loss\n\n    def training_step_end(self, log_dict):\n        speed = 1. / log_dict['train_cost']\n        default_global_tokens_num = self.configs.Global.global_batch_size * \\\n            self.configs.Data.Train.dataset.max_seq_length\n\n        logger.info(\n            \"[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, \" \\\n            \"ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'], speed,\n               speed * default_global_tokens_num, speed * default_global_tokens_num / self.nranks, log_dict['lr']))\n\n    def input_spec(self):\n        return [\n            InputSpec(\n                shape=[None, None], dtype='int64'), InputSpec(\n                    shape=[None, None], dtype='int64'), InputSpec(\n                        shape=[None, None], dtype='int64')\n        ]\n\n\nclass ErnieSeqClsModule(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        super(ErnieSeqClsModule, self).__init__(configs)\n\n        
self.criterion = nn.loss.CrossEntropyLoss(\n        )  # if data_args.label_list else nn.loss.MSELoss()\n\n        self.past_index = -1\n        self.past = None\n        self.label_names = ([\"start_positions\", \"end_positions\"] \\\n            if \"QusetionAnswering\" in type(self.model).__name__ else [\"labels\"])\n\n    def process_configs(self, configs):\n        process_model_configs(configs)\n        process_finetune_configs(\"SequenceClassification\", configs)\n\n        cfg_global = configs['Global']\n        cfg_data = configs['Data']\n\n        for mode in (\"Train\", \"Eval\", \"Test\"):\n            if mode in cfg_data.keys():\n                cfg_data[mode]['dataset']['mode'] = mode\n                cfg_data[mode]['sampler']['batch_size'] = cfg_global[\n                    'local_batch_size']\n                cfg_data[mode]['loader']['collate_fn'].setdefault(\n                    'tokenizer_type',\n                    cfg_data[mode]['dataset']['tokenizer_type'])\n\n        return configs\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        model_setting.pop(\"module\")\n        model_setting.pop(\"name\")\n\n        if self.nranks > 1:\n            model_setting[\n                'num_partitions'] = self.configs.Distributed.mp_degree\n\n            if self.configs.Distributed.pp_degree == 1:\n                model = ErnieForSequenceClassificationHybrid(\n                    ErnieModelHybrid(**model_setting))\n            else:\n                raise ValueError(\n                    \"Pipeline Parallelism is not supported in Sequence \\\n                    Classification task of Ernie model.\")\n        else:\n            model = ErnieForSequenceClassification(ErnieModel(**model_setting))\n\n        return model\n\n    def prepare_input(self, data):\n        \"\"\"\n        Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.\n        \"\"\"\n        if 
isinstance(data, Mapping):\n            return type(data)(\n                {k: self.prepare_input(v)\n                 for k, v in data.items()})\n        elif isinstance(data, (tuple, list)):\n            return type(data)(self.prepare_input(v) for v in data)\n        elif isinstance(data, paddle.Tensor):\n            # kwargs = dict(device=self.args.current_device)\n            # update data type for pure fp16\n            return data\n            # return data.to(**kwargs)\n        return data\n\n    def pretreating_batch(self, batch):\n        self.has_labels = all(\n            batch.get(k) is not None for k in self.label_names)\n\n        batch = self.prepare_input(batch)\n        if self.past_index >= 0 and self.past is not None:\n            batch[\"mems\"] = self.past\n\n        return batch\n\n    def forward(self, inputs):\n        return self.model(**inputs)\n\n    def compute_loss(self, inputs, return_outputs=False):\n        if \"labels\" in inputs:\n            labels = inputs.pop(\"labels\")\n        elif \"start_positions\" in inputs and \"end_positions\" in inputs:\n            labels = (inputs.pop(\"start_positions\"),\n                      inputs.pop(\"end_positions\"))\n        elif \"generator_labels\" in inputs:\n            labels = inputs[\"generator_labels\"]\n        else:\n            labels = None\n        outputs = self(inputs)\n\n        loss = self.criterion(outputs, labels)\n        outputs = (loss, outputs)\n\n        # Save past state if it exists\n        # TODO: this needs to be fixed and made cleaner later.\n        if self.past_index >= 0:\n            self.past = outputs[self.args.past_index]\n\n        # We don't use .loss here since the model may return tuples instead of ModelOutput.\n        loss = outputs[\"loss\"] if isinstance(outputs, dict) else outputs[0]\n\n        return (loss, outputs) if return_outputs else loss\n\n    def training_step(self, batch):\n        return self.compute_loss(batch)\n\n    def 
training_step_end(self, log_dict):\n        speed = 1. / log_dict['train_cost']\n        default_global_tokens_num = self.configs.Global.global_batch_size * \\\n            self.configs.Data.Train.dataset.max_seq_len\n\n        logger.info(\n            \"[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, \" \\\n            \"ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'], speed,\n               speed * default_global_tokens_num, speed * default_global_tokens_num / self.nranks, log_dict['lr']))\n\n    def input_spec(self):\n        input_spec = [\n            paddle.static.InputSpec(\n                shape=[None, None], dtype=\"int64\"),  # input_ids\n            paddle.static.InputSpec(\n                shape=[None, None], dtype=\"int64\")  # segment_ids\n        ]\n        return input_spec\n\n    def validation_step(self, inputs):\n        if self.has_labels:\n            loss, outputs = self.compute_loss(inputs, return_outputs=True)\n            loss = loss.mean().detach()\n\n        else:\n            loss = None\n\n        return loss\n\n    def validation_step_end(self, log_dict):\n        speed = 1. / log_dict['eval_cost']\n        logger.info(\n            \"[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'],\n               log_dict['eval_cost'], speed))\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/finetune_configs.yaml",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n# Datasets which used for sequence classfication\nSequenceClassification:\n    clue afqmc: \n        num_train_epochs: 4\n    clue tnews:\n        num_train_epochs: 4\n    clue iflytek:\n        num_train_epochs: 8\n    clue ocnli:\n        num_train_epochs: 8\n    clue cmnli: \n        num_train_epochs: 3\n    clue wsc: \n        num_train_epochs: 50\n    clue csl:\n        num_train_epochs: 10\n        max_seq_length: 256\n        batch_size: 32\n    xnli_cn:\n        learning_rate: 0.0001\n        num_train_epochs: 3\n        batch_size: 256\n    chnsenticorp_v2:\n        learning_rate: 0.00005\n        batch_size: 16\n        num_train_epochs: 8\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/layers/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/layers/distributed_transformer.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# TODO: define the classes of Transformer neural network\n\nimport copy\nimport collections\nimport numpy as np\n\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\n\nfrom paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer\nimport paddle.tensor as tensor\nfrom paddle.fluid import layers\nfrom paddle import ParamAttr\nfrom paddle.fluid.data_feeder import convert_dtype\nfrom .model_outputs import BaseModelOutputWithPastAndCrossAttentions\n\nfrom paddle.distributed import fleet\nfrom paddle.distributed.fleet.meta_parallel import get_rng_state_tracker\nfrom paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc\nfrom paddle.distributed.fleet.utils import recompute\n\n__all__ = []\n\n\ndef _convert_param_attr_to_list(param_attr, n):\n    \"\"\"\n    If `param_attr` is a list or tuple, convert every element in it to a\n    ParamAttr instance. 
Otherwise, repeat `param_attr` `n` times to\n    construct a list, and rename every one by appending a increasing index\n    suffix to avoid having same names when `param_attr` contains a name.\n\n    Parameters:\n        param_attr (list|tuple|ParamAttr): A list, tuple or something can be\n            converted to a ParamAttr instance by `ParamAttr._to_attr`.\n        n (int): The times to repeat to construct a list when `param_attr`\n            is not a list or tuple.\n\n    Returns:\n        list: A list composed of each including cell's `param_attr`.\n    \"\"\"\n    if isinstance(param_attr, (list, tuple)):\n        assert len(param_attr) == n, (\n            \"length of param_attr should be %d when it is a list/tuple\" % n)\n        param_attrs = []\n        for attr in param_attr:\n            if isinstance(attr, bool):\n                if attr:\n                    param_attrs.append(ParamAttr._to_attr(None))\n                else:\n                    param_attrs.append(False)\n            else:\n                param_attrs.append(ParamAttr._to_attr(attr))\n        # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]\n    elif isinstance(param_attr, bool):\n        param_attrs = []\n        if param_attr:\n            param_attrs = [ParamAttr._to_attr(None) for i in range(n)]\n        else:\n            param_attrs = [False] * n\n    else:\n        param_attrs = []\n        attr = ParamAttr._to_attr(param_attr)\n        for i in range(n):\n            attr_i = copy.deepcopy(attr)\n            if attr.name:\n                attr_i.name = attr_i.name + \"_\" + str(i)\n            param_attrs.append(attr_i)\n    return param_attrs\n\n\ndef _convert_attention_mask(attn_mask, dtype):\n    \"\"\"\n    Convert the attention mask to the target dtype we expect.\n\n    Parameters:\n        attn_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                
paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n        dtype (VarType): The target type of `attn_mask` we expect.\n\n    Returns:\n        Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`.\n    \"\"\"\n    if attn_mask is not None and attn_mask.dtype != dtype:\n        attn_mask_dtype = convert_dtype(attn_mask.dtype)\n        if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype:\n            attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9\n        else:\n            attn_mask = paddle.cast(attn_mask, dtype)\n    return attn_mask\n\n\nclass MultiHeadAttention(Layer):\n    \"\"\"\n    Attention mapps queries and a set of key-value pairs to outputs, and\n    Multi-Head Attention performs multiple parallel attention to jointly attending\n    to information from different representation subspaces.\n\n    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_\n    for more details.\n\n    Parameters:\n        embed_dim (int): The expected feature size in the input and output.\n        num_heads (int): The number of heads in multi-head attention.\n        dropout (float, optional): The dropout probability used on attention\n            weights to drop some attention targets. 0 for no dropout. Default 0\n        kdim (int, optional): The feature size in key. If None, assumed equal to\n            `embed_dim`. 
Default None.\n        vdim (int, optional): The feature size in value. If None, assumed equal to\n            `embed_dim`. Default None.\n        need_weights (bool, optional): Indicate whether to return the attention\n            weights. Default False.\n        weight_attr(ParamAttr, optional):  To specify the weight parameter property.\n            Default: None, which means the default weight parameter property is used.\n            See usage for details in :code:`ParamAttr` .\n        bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.\n            Default: None, which means the default bias parameter property is used.\n            If it is set to False, this layer will not have trainable bias parameter.\n            See usage for details in :code:`ParamAttr` .\n\n    Examples:\n\n        .. code-block:: python\n\n            import paddle\n\n            # encoder input: [batch_size, sequence_length, d_model]\n            query = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, num_heads, query_len, query_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)\n            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    Cache = collections.namedtuple(\"Cache\", [\"k\", \"v\"])\n    StaticCache = collections.namedtuple(\"StaticCache\", [\"k\", \"v\"])\n\n    def __init__(self,\n                 embed_dim,\n                 num_heads,\n                 dropout=0.,\n                 kdim=None,\n                 vdim=None,\n                 need_weights=False,\n                 weight_attr=None,\n                 bias_attr=None,\n                 num_partitions=1):\n        super(MultiHeadAttention, self).__init__()\n\n        assert embed_dim > 0, (\"Expected embed_dim to be greater than 0, \"\n                               \"but received {}\".format(embed_dim))\n        assert num_heads > 0, 
(\"Expected num_heads to be greater than 0, \"\n                               \"but received {}\".format(num_heads))\n\n        self.embed_dim = embed_dim\n        self.kdim = kdim if kdim is not None else embed_dim\n        self.vdim = vdim if vdim is not None else embed_dim\n        self.num_heads = num_heads\n        self.dropout = dropout\n        self.need_weights = need_weights\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        assert self.num_heads % num_partitions == 0\n        self.num_heads = self.num_heads // num_partitions\n\n        # self.q_proj = Linear(\n        #     embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)\n        # self.k_proj = Linear(\n        #     self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)\n        # self.v_proj = Linear(\n        #     self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)\n        # self.out_proj = Linear(\n        #     embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)\n\n        self.q_proj = fleet.meta_parallel.ColumnParallelLinear(\n            embed_dim,\n            embed_dim,\n            weight_attr=weight_attr,\n            has_bias=True,\n            gather_output=False)\n\n        self.k_proj = fleet.meta_parallel.ColumnParallelLinear(\n            self.kdim,\n            embed_dim,\n            weight_attr=weight_attr,\n            has_bias=True,\n            gather_output=False)\n\n        self.v_proj = fleet.meta_parallel.ColumnParallelLinear(\n            self.vdim,\n            embed_dim,\n            weight_attr=weight_attr,\n            has_bias=True,\n            gather_output=False)\n\n        self.out_proj = fleet.meta_parallel.RowParallelLinear(\n            embed_dim,\n            embed_dim,\n            weight_attr=weight_attr,\n            has_bias=True,\n            input_is_parallel=True)\n\n    def _prepare_qkv(self, query, key, value, 
cache=None):\n        r\"\"\"\n        Prapares linear projected queries, keys and values for usage of subsequnt\n        multiple parallel attention. If `cache` is not None, using cached results\n        to reduce redundant calculations.\n\n        Parameters:\n            query (Tensor): The queries for multi-head attention. It is a\n                tensor with shape `[batch_size, query_length, embed_dim]`. The\n                data type should be float32 or float64.\n            key (Tensor): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If None, use `query` as\n                `key`.\n            value (Tensor): The values for multi-head attention. It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. If None, use `query` as\n                `value`.\n            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):\n                It is a namedtuple with `k` and `v` as fields, and stores tensors\n                shaped `[batch_size, num_heads, length, embed_dim]` which are results\n                of linear projection, reshape and transpose calculations in\n                MultiHeadAttention. If is an instance of `Cache`, `k` and `v`\n                fields reserve intermediate results of previous positions, which\n                mostly used for decoder self attention. If it is an instance of\n                `StaticCache`, `key` and `value` args would be ignored, `k` and\n                `v` fields would be used as calculated results on `key` and\n                `value`, which mostly used for decoder-encoder cross attention.\n                It is only used for inference and should be None for training.\n                Default None.\n\n        Returns:\n            tuple: A tuple including linear projected keys and values. 
These two \\\n                tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \\\n                and `[batch_size, n_head, sequence_length, d_value]` separately, \\\n                and their data types are same as inputs.\n        \"\"\"\n        q = self.q_proj(query.clone())\n        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])\n        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])\n\n        if isinstance(cache, self.StaticCache):\n            # for encoder-decoder attention in inference and has cached\n            k, v = cache.k, cache.v\n        else:\n            k, v = self.compute_kv(key.clone(), value.clone())\n\n        if isinstance(cache, self.Cache):\n            # for decoder self-attention in inference\n            k = tensor.concat([cache.k, k], axis=2)\n            v = tensor.concat([cache.v, v], axis=2)\n            cache = self.Cache(k, v)\n\n        return (q, k, v) if cache is None else (q, k, v, cache)\n\n    def compute_kv(self, key, value):\n        r\"\"\"\n        Applies linear projection on input keys and values, then splits heads\n        (reshape and transpose) to get keys and values from different representation\n        subspaces. The results are used as key-values pairs for subsequent multiple\n        parallel attention.\n\n        It is part of calculations in multi-head attention, and is provided as\n        a method to pre-compute and prefetch these results, thus we can use them\n        to construct cache for inference.\n\n        Parameters:\n            key (Tensor): The keys for multi-head attention. It is a tensor\n                with shape `[batch_size, sequence_length, kdim]`. The data type\n                should be float32 or float64.\n            value (Tensor): The values for multi-head attention. It is a tensor\n                with shape `[batch_size, sequence_length, vdim]`. 
The data type\n                should be float32 or float64.\n\n        Returns:\n            tuple: A tuple including transformed keys and values. Their shapes \\\n                both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \\\n                and their data types are same as inputs.\n        \"\"\"\n        k = self.k_proj(key)\n        v = self.v_proj(value)\n        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])\n        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])\n        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])\n        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])\n        return k, v\n\n    def gen_cache(self, key, value=None, type=Cache):\n        \"\"\"\n        Generates cache for `forward` usage in inference accroding to arguments.\n        The generated cache is an instance of `MultiHeadAttention.Cache` or an\n        instance of `MultiHeadAttention.StaticCache`.\n\n        `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields,\n        and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]`\n        which are results of linear projection, reshape and transpose calculations\n        in MultiHeadAttention.\n\n        If the generated cache is an instance of `Cache`, `k` and `v` fields\n        reserve intermediate result tensors of previous positions, and the tensors\n        are incremental among decoding steps, which mostly are used for decoder\n        decoder self attention.\n\n        If the generated cache is an instance of `StaticCache`, `k` and `v` fields\n        would be used as calculated result tensors on keys an values in `forward`,\n        and the tensors keep unchanged among decoding steps, which are mostly used\n        for decoder-encoder cross attention.\n\n        The cache is generated as follows:\n\n        1. 
If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the\n        results to create an instance of `StaticCache`.\n\n        2. If `type` is `Cache` and `value` is None, generate empty tensors shaped\n        `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results\n        to create an instance of `Cache`, where `batch_size` is from the first\n        dimension of `key`.\n\n        3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create\n        an instance of `Cache`.\n\n        Parameters:\n            key (Tensor): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If `value` is None,\n                it is only for batch size and data type reference.\n            value (Tensor, optional): The values for multi-head attention. It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. If None, `key` is only\n                for batch size reference. 
Default None.\n            type (type): It should be `MultiHeadAttention.StaticCache` or\n                `MultiHeadAttention.Cache` to indicate the cache type to generate.\n\n        Returns:\n            namedtuple: an instance of `Cache` or `StaticCache` accordingly.\n        \"\"\"\n        if type == MultiHeadAttention.StaticCache:  # static_kv\n            k, v = self.compute_kv(key, value)\n            return self.StaticCache(k, v)\n        elif value is None:  # incremental_state\n            k = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            v = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            return self.Cache(k, v)\n        else:\n            # incremental_state with initial value, mainly for usage like UniLM\n            return self.Cache(key, value)\n\n    def forward(self, query, key=None, value=None, attn_mask=None, cache=None):\n        r\"\"\"\n        Applies multi-head attention to map queries and a set of key-value pairs\n        to outputs.\n\n        Parameters:\n            query (Tensor): The queries for multi-head attention. It is a\n                tensor with shape `[batch_size, query_length, embed_dim]`. The\n                data type should be float32 or float64.\n            key (Tensor, optional): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If None, use `query` as\n                `key`. Default None.\n            value (Tensor, optional): The values for multi-head attention. 
It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. If None, use `query` as\n                `value`. Default None.\n            attn_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):\n                It is a namedtuple with `k` and `v` as fields, and stores tensors\n                shaped `[batch_size, num_heads, length, embed_dim]` which are results\n                of linear projection, reshape and transpose calculations in\n                MultiHeadAttention. If it is an instance of `Cache`, `k` and `v`\n                fields reserve intermediate results of previous positions, which\n                mostly used for decoder self attention. 
If it is an instance of\n                `StaticCache`, `key` and `value` args would be ignored, `k` and\n                `v` fields would be used as calculated results on `key` and\n                `value`, which mostly used for decoder-encoder cross attention.\n                It is only used for inference and should be None for training.\n                Default None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `query`, representing attention output. Or a tuple if \\\n                `need_weights` is True or `cache` is not None. If `need_weights` \\\n                is True, except for attention output, the tuple also includes \\\n                the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \\\n                If `cache` is not None, the tuple then includes the new cache \\\n                having the same type as `cache`, and if it is `StaticCache`, it \\\n                is same as the input `cache`, if it is `Cache`, the new cache \\\n                reserves tensors concatanating raw tensors with intermediate \\\n                results of current query.\n        \"\"\"\n        key = query if key is None else key\n        value = query if value is None else value\n        # compute q ,k ,v\n        if cache is None:\n            q, k, v = self._prepare_qkv(query, key, value, cache)\n        else:\n            q, k, v, cache = self._prepare_qkv(query, key, value, cache)\n\n        # scale dot product attention\n        product = paddle.matmul(\n            x=q * (self.head_dim**-0.5), y=k, transpose_y=True)\n\n        if attn_mask is not None:\n            # Support bool or int mask\n            attn_mask = _convert_attention_mask(attn_mask, product.dtype)\n            product = product + attn_mask\n\n        weights = F.softmax(product)\n\n        if self.dropout:\n            with get_rng_state_tracker().rng_state('local_seed'):\n     
           weights = F.dropout(\n                    weights,\n                    self.dropout,\n                    training=self.training,\n                    mode=\"upscale_in_train\")\n\n        out = paddle.matmul(weights, v)\n\n        # combine heads\n        out = tensor.transpose(out, perm=[0, 2, 1, 3])\n        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])\n\n        # project to output\n        out = self.out_proj(out)\n\n        outs = [out]\n        if self.need_weights:\n            outs.append(weights)\n        if cache is not None:\n            outs.append(cache)\n        return out if len(outs) == 1 else tuple(outs)\n\n\nclass TransformerEncoderLayer(Layer):\n    \"\"\"\n    TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)\n    attention and feedforward network. Before and after each sub-layer, pre-process\n    and post-precess would be applied on the input and output accordingly. If\n    `normalize_before` is True, pre-process is layer normalization and post-precess\n    includes dropout, residual connection. Otherwise, no pre-process and post-precess\n    includes dropout, residual connection, layer normalization.\n\n    Parameters:\n        d_model (int): The expected feature size in the input and output.\n        nhead (int): The number of heads in multi-head attention(MHA).\n        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).\n        dropout (float, optional): The dropout probability used in pre-process\n            and post-precess of MHA and FFN sub-layer. Default 0.1\n        activation (str, optional): The activation function in the feedforward\n            network. Default relu.\n        attn_dropout (float, optional): The dropout probability used\n            in MHA to drop some attention target. If None, use the value of\n            `dropout`. 
Default None\n        act_dropout (float, optional): The dropout probability used after FFN\n        activation.  If None, use the value of `dropout`. Default None\n        normalize_before (bool, optional): Indicate whether to put layer normalization\n            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer\n            normalization and post-precess includes dropout, residual connection.\n            Otherwise, no pre-process and post-precess includes dropout, residual\n            connection, layer normalization. Default False\n        weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.\n            If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for\n            MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN.\n            Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.\n            Default: None, which means the default weight parameter property is used.\n            See usage for details in :code:`ParamAttr` .\n        bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.\n            If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for\n            MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.\n            Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.\n            The `False` value means the corresponding layer would not have trainable\n            bias parameter. See usage for details in :code:`ParamAttr` . Default: None,\n            which means the default bias parameter property is used.\n\n\n    Examples:\n\n        .. 
code-block:: python\n\n            import paddle\n            from paddle.nn import TransformerEncoderLayer\n\n            # encoder input: [batch_size, src_len, d_model]\n            enc_input = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, n_head, src_len, src_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            encoder_layer = TransformerEncoderLayer(128, 2, 512)\n            enc_output = encoder_layer(enc_input, attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False,\n                 weight_attr=None,\n                 bias_attr=None,\n                 num_partitions=1):\n        self._config = locals()\n        self._config.pop(\"self\")\n        self._config.pop(\"__class__\", None)  # py3\n\n        super(TransformerEncoderLayer, self).__init__()\n\n        assert d_model > 0, (\"Expected d_model to be greater than 0, \"\n                             \"but received {}\".format(d_model))\n        assert nhead > 0, (\"Expected nhead to be greater than 0, \"\n                           \"but received {}\".format(nhead))\n        assert dim_feedforward > 0, (\n            \"Expected dim_feedforward to be greater than 0, \"\n            \"but received {}\".format(dim_feedforward))\n\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n\n        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)\n        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)\n\n        self.self_attn = MultiHeadAttention(\n            d_model,\n            nhead,\n            dropout=attn_dropout,\n            
weight_attr=weight_attrs[0],\n            bias_attr=bias_attrs[0],\n            num_partitions=num_partitions)\n        # self.linear1 = Linear(\n        # d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1])\n        self.dropout = Dropout(act_dropout, mode=\"upscale_in_train\")\n        # self.linear2 = Linear(\n        # dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1])\n        self.norm1 = LayerNorm(d_model)\n        self.norm2 = LayerNorm(d_model)\n        self.dropout1 = Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = Dropout(dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n\n        self.linear1 = fleet.meta_parallel.ColumnParallelLinear(\n            d_model,\n            dim_feedforward,\n            weight_attr=weight_attrs[1],\n            gather_output=False,\n            has_bias=True)\n\n        self.linear2 = fleet.meta_parallel.RowParallelLinear(\n            dim_feedforward,\n            d_model,\n            weight_attr=weight_attrs[1],\n            input_is_parallel=True,\n            has_bias=True)\n\n    def forward(self, src, src_mask=None, cache=None, output_attentions=False):\n        r\"\"\"\n        Applies a Transformer encoder layer on the input.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder layer. It is\n                a tensor with shape `[batch_size, sequence_length, d_model]`.\n                The data type should be float32 or float64.\n            src_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. 
When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n            cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.\n                See `TransformerEncoderLayer.gen_cache` for more details. It is\n                only used for inference and should be None for training. Default\n                None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `enc_input`, representing the output of Transformer encoder \\\n                layer. Or a tuple if `cache` is not None, except for encoder \\\n                layer output, the tuple includes the new cache which is same \\\n                as input `cache` argument but `incremental_cache` has an \\\n                incremental length. 
See `MultiHeadAttention.gen_cache` and \\\n                `MultiHeadAttention.forward` for more details.\n        \"\"\"\n        self.self_attn.need_weights = output_attentions\n        src_mask = _convert_attention_mask(src_mask, src.dtype)\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm1(src)\n\n        attn_outputs = self.self_attn(src, src, src, src_mask, cache)\n        if isinstance(attn_outputs, tuple):\n            src = attn_outputs[0]\n            outputs = attn_outputs[1:]\n        else:\n            src = attn_outputs\n            outputs = None\n\n        src = residual + self.dropout1(src)\n        if not self.normalize_before:\n            src = self.norm1(src)\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm2(src)\n\n        with get_rng_state_tracker().rng_state('global_seed'):\n            tgt = self.dropout(self.activation(self.linear1(src)))\n            # tgt = residual + self.dropout1(tgt)\n\n        src = self.linear2(tgt)\n\n        with get_rng_state_tracker().rng_state('global_seed'):\n            src = residual + self.dropout2(src)\n\n        if not self.normalize_before:\n            src = self.norm2(src)\n\n        return src if outputs is None else (\n            (src, ) + outputs[::-1])  # hidden_states, cache, attentions\n\n    def gen_cache(self, src):\n        r\"\"\"\n        Generates cache for `forward` usage. The generated cache is an\n        instance of `MultiHeadAttention.Cache`.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder. It is a tensor\n                with shape `[batch_size, source_length, d_model]`. 
The data\n                type should be float32 or float64.\n\n        Returns:\n            incremental_cache: It is an instance of `MultiHeadAttention.Cache` \\\n                produced by `self_attn.gen_cache`, it reserves two tensors\n                shaped `[batch_size, nhead, 0, d_model // nhead]`. See \\\n                `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \\\n                for more details.\n        \"\"\"\n        incremental_cache = self.self_attn.gen_cache(\n            src, type=self.self_attn.Cache)\n        return incremental_cache\n\n\nclass TransformerEncoder(Layer):\n    \"\"\"\n    TransformerEncoder is a stack of N encoder layers.\n\n    Parameters:\n        encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It\n            would be used as the first layer, and the other layers would be created\n            according to the configurations of it.\n        num_layers (int): The number of encoder layers to be stacked.\n        norm (LayerNorm, optional): the layer normalization component. If provided,\n            apply layer normalization on the output of last encoder layer.\n\n    Examples:\n\n        .. 
code-block:: python\n\n            import paddle\n            from paddle.nn import TransformerEncoderLayer, TransformerEncoder\n\n            # encoder input: [batch_size, src_len, d_model]\n            enc_input = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, n_head, src_len, src_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            encoder_layer = TransformerEncoderLayer(128, 2, 512)\n            encoder = TransformerEncoder(encoder_layer, 2)\n            enc_output = encoder(enc_input, attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    def __init__(self,\n                 encoder_layer,\n                 num_layers,\n                 norm=None,\n                 enable_recompute=False):\n        super(TransformerEncoder, self).__init__()\n        self.layers = LayerList([(encoder_layer if i == 0 else\n                                  type(encoder_layer)(**encoder_layer._config))\n                                 for i in range(num_layers)])\n        self.num_layers = num_layers\n        self.norm = norm\n        self.enable_recompute = enable_recompute\n\n    def forward(self,\n                src,\n                src_mask=None,\n                cache=None,\n                output_attentions=False,\n                output_hidden_states=False,\n                return_dict=False):\n        r\"\"\"\n        Applies a stack of N Transformer encoder layers on inputs. If `norm` is\n        provided, also applies layer normalization on the output of last encoder\n        layer.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder. It is a tensor\n                with shape `[batch_size, sequence_length, d_model]`. The data\n                type should be float32 or float64.\n            src_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                paddings or the subsequent positions. 
It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n            cache (list, optional): It is a list, and each element in the list\n                is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`.\n                See `TransformerEncoder.gen_cache` for more details. It is only\n                used for inference and should be None for training. Default None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `src`, representing the output of Transformer encoder. \\\n                Or a tuple if `cache` is not None, except for encoder output, \\\n                the tuple includes the new cache which is same as input `cache` \\\n                argument but `incremental_cache` in it has an incremental length. \\\n                See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \\\n                for more details.\n        \"\"\"\n        src_mask = _convert_attention_mask(src_mask, src.dtype)\n\n        output = src\n        # To get cache from None when use_cache is True, which is compatible with HF\n        # while HF requires decoder. 
The implementation here uses cache update in the\n        # MultiHeadAttention not so efficiently, and maybe optimize it later.\n        if cache is None and getattr(self, \"_use_cache\", False):\n            cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers)\n        # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts\n        # to True when cache is not None.\n        new_caches = [] if cache is not None and getattr(self, \"_use_cache\",\n                                                         True) else None\n        all_attentions = [] if output_attentions else None\n        # NOTE: Also includes embeding output which is same as HF.\n        all_hidden_states = [output] if output_hidden_states else None\n        for i, mod in enumerate(self.layers):\n            if self.enable_recompute:\n                # Note: recompute do not support pass as **kwargs yet.\n                layer_outputs = recompute(\n                    mod, output, src_mask, None if cache is None else cache[i]\n                    if isinstance(cache[i], MultiHeadAttention.Cache) else\n                    MultiHeadAttention.Cache(*cache[i]), output_attentions)\n            else:\n                layer_outputs = mod(\n                    output,\n                    src_mask=src_mask,\n                    cache=None if cache is None else cache[i]\n                    if isinstance(cache[i], MultiHeadAttention.Cache) else\n                    MultiHeadAttention.Cache(*cache[i]),\n                    output_attentions=output_attentions)\n\n            if isinstance(layer_outputs, tuple):\n                output = layer_outputs[0]\n                outputs = layer_outputs[1:]\n            else:\n                output = layer_outputs\n                outputs = None\n\n            if output_hidden_states:\n                all_hidden_states.append(output)\n            if output_attentions:\n                all_attentions.append(outputs[-1])\n            if 
new_caches is not None:\n                new_caches.append(outputs[0] if isinstance(cache[\n                    i], MultiHeadAttention.Cache) else (tuple(outputs[0])))\n\n        if self.norm is not None:\n            output = self.norm(output)\n\n            if output_hidden_states:\n                all_hidden_states[-1] = output\n\n        if not return_dict:\n            outputs = tuple(\n                tuple(v) if isinstance(v, list) else v\n                for v in [\n                    output,\n                    new_caches,\n                    all_hidden_states,\n                    all_attentions,\n                ] if v is not None)\n            if len(outputs) == 1:\n                return output\n            else:\n                return outputs\n\n        return BaseModelOutputWithPastAndCrossAttentions(\n            last_hidden_state=output,\n            past_key_values=new_caches,\n            hidden_states=all_hidden_states,\n            attentions=all_attentions)\n\n    def gen_cache(self, src):\n        r\"\"\"\n        Generates cache for `forward` usage. The generated cache is a list, and\n        each element in it is `incremental_cache` produced by\n        `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache`\n        for more details.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder. It is a tensor\n                with shape `[batch_size, source_length, d_model]`. The data type\n                should be float32 or float64.\n\n        Returns:\n            list: It is a list, and each element in the list is `incremental_cache`\n            produced by `TransformerEncoderLayer.gen_cache`. See\n            `TransformerEncoderLayer.gen_cache` for more details.\n        \"\"\"\n        cache = [layer.gen_cache(src) for layer in self.layers]\n        return cache\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/layers/model_outputs.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2020 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport functools\nimport paddle\nimport numpy as np\nfrom collections import OrderedDict\nfrom dataclasses import fields, dataclass\nfrom typing import Any, List, Tuple, Optional\nfrom paddle.nn.layer.transformer import _convert_attention_mask, MultiHeadAttention\nfrom paddle.distributed.fleet.utils import recompute\n\nfrom .utils import adapt_stale_fwd_patch\n\n\ndef is_tensor(x):\n    if isinstance(x, paddle.Tensor):\n        return True\n\n    return isinstance(x, np.ndarray)\n\n\nclass ModelOutput(OrderedDict):\n    \"\"\"\n    Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a\n    tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular\n    python dictionary.\n\n    <Tip warning={true}>\n\n    You can't unpack a `ModelOutput` directly. 
Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple\n    before.\n\n    </Tip>\n    \"\"\"\n\n    def __post_init__(self):\n        class_fields = fields(self)\n\n        # note(guosheng): Convert list to tuple automatically, and better to\n        # check if it is frozen.\n        # assert not getattr(self, dataclasses._PARAMS).frozen\n        for f in class_fields:\n            value = getattr(self, f.name)\n            if isinstance(value, list):\n                setattr(self, f.name, tuple(value))\n\n        # Safety and consistency checks\n        if not len(class_fields):\n            raise ValueError(f\"{self.__class__.__name__} has no fields.\")\n        if not all(field.default is None for field in class_fields[1:]):\n            raise ValueError(\n                f\"{self.__class__.__name__} should not have more than one required field.\"\n            )\n\n        first_field = getattr(self, class_fields[0].name)\n        other_fields_are_none = all(\n            getattr(self, field.name) is None for field in class_fields[1:])\n\n        if other_fields_are_none and not is_tensor(first_field):\n            if isinstance(first_field, dict):\n                iterator = first_field.items()\n                first_field_iterator = True\n            else:\n                try:\n                    iterator = iter(first_field)\n                    first_field_iterator = True\n                except TypeError:\n                    first_field_iterator = False\n\n            # if we provided an iterator as first field and the iterator is a (key, value) iterator\n            # set the associated fields\n            if first_field_iterator:\n                for element in iterator:\n                    if (not isinstance(element, (list, tuple)) or\n                            not len(element) == 2 or\n                            not isinstance(element[0], str)):\n                        break\n                    setattr(self, element[0], 
element[1])\n                    if element[1] is not None:\n                        self[element[0]] = element[1]\n            elif first_field is not None:\n                self[class_fields[0].name] = first_field\n        else:\n            for field in class_fields:\n                v = getattr(self, field.name)\n                if v is not None:\n                    self[field.name] = v\n\n    def __delitem__(self, *args, **kwargs):\n        raise Exception(\n            f\"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.\"\n        )\n\n    def setdefault(self, *args, **kwargs):\n        raise Exception(\n            f\"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.\"\n        )\n\n    def pop(self, *args, **kwargs):\n        raise Exception(\n            f\"You cannot use ``pop`` on a {self.__class__.__name__} instance.\")\n\n    def update(self, *args, **kwargs):\n        raise Exception(\n            f\"You cannot use ``update`` on a {self.__class__.__name__} instance.\"\n        )\n\n    def __getitem__(self, k):\n        if isinstance(k, str):\n            inner_dict = {k: v for (k, v) in self.items()}\n            return inner_dict[k]\n        else:\n            return self.to_tuple()[k]\n\n    def __setattr__(self, name, value):\n        if name in self.keys() and value is not None:\n            # Don't call self.__setitem__ to avoid recursion errors\n            super().__setitem__(name, value)\n        super().__setattr__(name, value)\n\n    def __setitem__(self, key, value):\n        # Will raise a KeyException if needed\n        super().__setitem__(key, value)\n        # Don't call self.__setattr__ to avoid recursion errors\n        super().__setattr__(key, value)\n\n    def to_tuple(self) -> Tuple[Any]:\n        \"\"\"\n        Convert self to a tuple containing all the attributes/keys that are not `None`.\n        \"\"\"\n        return tuple(self[k] for k in self.keys())\n\n\n@dataclass\nclass 
ErnieForPreTrainingOutput(ModelOutput):\n    \"\"\"\n    Output type of [`ErnieForPreTraining`].\n    Args:\n        loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`):\n            Total loss as the sum of the masked language modeling loss and the next sequence prediction\n            (classification) loss.\n        prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):\n            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).\n        seq_relationship_logits (`paddle.Tensor` of shape `(batch_size, 2)`):\n            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation\n            before SoftMax).\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of\n            shape `(batch_size, sequence_length, hidden_size)`.\n            Hidden-states of the model at the output of each layer plus the initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n    \"\"\"\n\n    loss = None\n    prediction_logits = None\n    seq_relationship_logits = None\n    hidden_states = None\n    attentions = None\n\n\n@dataclass\nclass BaseModelOutputWithPastAndCrossAttentions(ModelOutput):\n    \"\"\"\n    Base class for model's outputs that may also contain a past key/values (to speed up 
sequential decoding).\n\n    Args:\n        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):\n            Sequence of hidden-states at the output of the last layer of the model.\n\n            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,\n            hidden_size)` is output.\n        past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):\n            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape\n            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if\n            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,\n            encoder_sequence_length, embed_size_per_head)`.\n\n            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if\n            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`\n            input) to speed up sequential decoding.\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights after the 
attention softmax, used to compute the weighted average in the self-attention\n            heads.\n        cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the\n            weighted average in the cross-attention heads.\n    \"\"\"\n\n    last_hidden_state: paddle.Tensor = None\n    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None\n    hidden_states: Optional[Tuple[paddle.Tensor]] = None\n    attentions: Optional[Tuple[paddle.Tensor]] = None\n    cross_attentions: Optional[Tuple[paddle.Tensor]] = None\n\n\n@dataclass\nclass BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):\n    \"\"\"\n    Base class for model's outputs that also contains a pooling of the last hidden states.\n\n    Args:\n        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):\n            Sequence of hidden-states at the output of the last layer of the model.\n        pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`):\n            Last layer hidden-state of the first token of the sequence (classification token) after further processing\n            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns\n            the classification token after processing through a linear layer and a tanh activation function. 
The linear\n            layer weights are trained from the next sentence prediction (classification) objective during pretraining.\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n        cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the\n            weighted average in the cross-attention heads.\n        past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):\n            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape\n            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if\n            `config.is_encoder_decoder=True` 2 additional tensors of 
shape `(batch_size, num_heads,\n            encoder_sequence_length, embed_size_per_head)`.\n\n            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if\n            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`\n            input) to speed up sequential decoding.\n    \"\"\"\n\n    last_hidden_state: paddle.Tensor = None\n    pooler_output: paddle.Tensor = None\n    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None\n    hidden_states: Optional[Tuple[paddle.Tensor]] = None\n    attentions: Optional[Tuple[paddle.Tensor]] = None\n    cross_attentions: Optional[Tuple[paddle.Tensor]] = None\n\n\n@dataclass\nclass SequenceClassifierOutput(ModelOutput):\n    \"\"\"\n    Base class for outputs of sentence classification models.\n\n    Args:\n        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):\n            Classification (or regression if config.num_labels==1) loss.\n        logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`):\n            Classification (or regression if config.num_labels==1) scores (before SoftMax).\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n        
    Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n    \"\"\"\n\n    loss: Optional[paddle.Tensor] = None\n    logits: paddle.Tensor = None\n    hidden_states: Optional[Tuple[paddle.Tensor]] = None\n    attentions: Optional[Tuple[paddle.Tensor]] = None\n\n\n@dataclass\nclass TokenClassifierOutput(ModelOutput):\n    \"\"\"\n    Base class for outputs of token classification models.\n\n    Args:\n        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) :\n            Classification loss.\n        logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):\n            Classification scores (before SoftMax).\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n    \"\"\"\n\n    loss: Optional[paddle.Tensor] = None\n    logits: paddle.Tensor = None\n    hidden_states: Optional[Tuple[paddle.Tensor]] = None\n    attentions: Optional[Tuple[paddle.Tensor]] = None\n\n\n@dataclass\nclass QuestionAnsweringModelOutput(ModelOutput):\n    \"\"\"\n    Base class for outputs of 
question answering models.\n\n    Args:\n        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):\n            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.\n        start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`):\n            Span-start scores (before SoftMax).\n        end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`):\n            Span-end scores (before SoftMax).\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n    \"\"\"\n\n    loss: Optional[paddle.Tensor] = None\n    start_logits: paddle.Tensor = None\n    end_logits: paddle.Tensor = None\n    hidden_states: Optional[Tuple[paddle.Tensor]] = None\n    attentions: Optional[Tuple[paddle.Tensor]] = None\n\n\n@dataclass\nclass MultipleChoiceModelOutput(ModelOutput):\n    \"\"\"\n    Base class for outputs of multiple choice models.\n\n    Args:\n        loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided):\n            Classification loss.\n        logits (`paddle.Tensor` of shape `(batch_size, 
num_choices)`):\n            *num_choices* is the second dimension of the input tensors. (see *input_ids* above).\n\n            Classification scores (before SoftMax).\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n    \"\"\"\n\n    loss: Optional[paddle.Tensor] = None\n    logits: paddle.Tensor = None\n    hidden_states: Optional[Tuple[paddle.Tensor]] = None\n    attentions: Optional[Tuple[paddle.Tensor]] = None\n\n\n@dataclass\nclass MaskedLMOutput(ModelOutput):\n    \"\"\"\n    Base class for masked language models outputs.\n\n    Args:\n        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):\n            Masked language modeling (MLM) loss.\n        logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):\n            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `paddle.Tensor` (one for the output of the 
embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n    \"\"\"\n\n    loss: Optional[paddle.Tensor] = None\n    logits: paddle.Tensor = None\n    hidden_states: Optional[Tuple[paddle.Tensor]] = None\n    attentions: Optional[Tuple[paddle.Tensor]] = None\n\n\n@dataclass\nclass CausalLMOutputWithCrossAttentions(ModelOutput):\n    \"\"\"\n    Base class for causal language model (or autoregressive) outputs.\n\n    Args:\n        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):\n            Language modeling loss (for next-token prediction).\n        logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):\n            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when 
`output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n        cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Cross attentions weights after the attention softmax, used to compute the weighted average in the\n            cross-attention heads.\n        past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):\n            Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key,\n            value states of the self-attention and the cross-attention layers if model is used in encoder-decoder\n            setting. Only relevant if `config.is_decoder = True`.\n\n            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see\n            `past_key_values` input) to speed up sequential decoding.\n    \"\"\"\n\n    loss: Optional[paddle.Tensor] = None\n    logits: paddle.Tensor = None\n    past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None\n    hidden_states: Optional[Tuple[paddle.Tensor]] = None\n    attentions: Optional[Tuple[paddle.Tensor]] = None\n    cross_attentions: Optional[Tuple[paddle.Tensor]] = None\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/layers/transformer.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# TODO: define the classes of Transformer neural network\n\nimport copy\nimport collections\nimport numpy as np\n\nimport paddle\nimport paddle.nn.functional as F\nimport paddle.nn as nn\n\nfrom paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer\nimport paddle.tensor as tensor\nfrom paddle.fluid import layers\nfrom paddle import ParamAttr\nfrom paddle.fluid.data_feeder import convert_dtype\nfrom .model_outputs import BaseModelOutputWithPastAndCrossAttentions\nfrom paddle.distributed.fleet.utils import recompute\n\n__all__ = []\n\n\ndef _convert_param_attr_to_list(param_attr, n):\n    \"\"\"\n    If `param_attr` is a list or tuple, convert every element in it to a\n    ParamAttr instance. 
Otherwise, repeat `param_attr` `n` times to\n    construct a list, and rename every one by appending a increasing index\n    suffix to avoid having same names when `param_attr` contains a name.\n\n    Parameters:\n        param_attr (list|tuple|ParamAttr): A list, tuple or something can be\n            converted to a ParamAttr instance by `ParamAttr._to_attr`.\n        n (int): The times to repeat to construct a list when `param_attr`\n            is not a list or tuple.\n\n    Returns:\n        list: A list composed of each including cell's `param_attr`.\n    \"\"\"\n    if isinstance(param_attr, (list, tuple)):\n        assert len(param_attr) == n, (\n            \"length of param_attr should be %d when it is a list/tuple\" % n)\n        param_attrs = []\n        for attr in param_attr:\n            if isinstance(attr, bool):\n                if attr:\n                    param_attrs.append(ParamAttr._to_attr(None))\n                else:\n                    param_attrs.append(False)\n            else:\n                param_attrs.append(ParamAttr._to_attr(attr))\n        # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]\n    elif isinstance(param_attr, bool):\n        param_attrs = []\n        if param_attr:\n            param_attrs = [ParamAttr._to_attr(None) for i in range(n)]\n        else:\n            param_attrs = [False] * n\n    else:\n        param_attrs = []\n        attr = ParamAttr._to_attr(param_attr)\n        for i in range(n):\n            attr_i = copy.deepcopy(attr)\n            if attr.name:\n                attr_i.name = attr_i.name + \"_\" + str(i)\n            param_attrs.append(attr_i)\n    return param_attrs\n\n\ndef _convert_attention_mask(attn_mask, dtype):\n    \"\"\"\n    Convert the attention mask to the target dtype we expect.\n\n    Parameters:\n        attn_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                
paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n        dtype (VarType): The target type of `attn_mask` we expect.\n\n    Returns:\n        Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`.\n    \"\"\"\n    if attn_mask is not None and attn_mask.dtype != dtype:\n        attn_mask_dtype = convert_dtype(attn_mask.dtype)\n        if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype:\n            attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9\n        else:\n            attn_mask = paddle.cast(attn_mask, dtype)\n    return attn_mask\n\n\nclass MultiHeadAttention(Layer):\n    \"\"\"\n    Attention mapps queries and a set of key-value pairs to outputs, and\n    Multi-Head Attention performs multiple parallel attention to jointly attending\n    to information from different representation subspaces.\n\n    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_\n    for more details.\n\n    Parameters:\n        embed_dim (int): The expected feature size in the input and output.\n        num_heads (int): The number of heads in multi-head attention.\n        dropout (float, optional): The dropout probability used on attention\n            weights to drop some attention targets. 0 for no dropout. Default 0\n        kdim (int, optional): The feature size in key. If None, assumed equal to\n            `embed_dim`. 
Default None.\n        vdim (int, optional): The feature size in value. If None, assumed equal to\n            `embed_dim`. Default None.\n        need_weights (bool, optional): Indicate whether to return the attention\n            weights. Default False.\n        weight_attr(ParamAttr, optional):  To specify the weight parameter property.\n            Default: None, which means the default weight parameter property is used.\n            See usage for details in :code:`ParamAttr` .\n        bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.\n            Default: None, which means the default bias parameter property is used.\n            If it is set to False, this layer will not have trainable bias parameter.\n            See usage for details in :code:`ParamAttr` .\n\n    Examples:\n\n        .. code-block:: python\n\n            import paddle\n\n            # encoder input: [batch_size, sequence_length, d_model]\n            query = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, num_heads, query_len, query_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)\n            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    Cache = collections.namedtuple(\"Cache\", [\"k\", \"v\"])\n    StaticCache = collections.namedtuple(\"StaticCache\", [\"k\", \"v\"])\n\n    def __init__(self,\n                 embed_dim,\n                 num_heads,\n                 dropout=0.,\n                 kdim=None,\n                 vdim=None,\n                 need_weights=False,\n                 weight_attr=None,\n                 bias_attr=None):\n        super(MultiHeadAttention, self).__init__()\n\n        assert embed_dim > 0, (\"Expected embed_dim to be greater than 0, \"\n                               \"but received {}\".format(embed_dim))\n        assert num_heads > 0, (\"Expected num_heads to be greater 
than 0, \"\n                               \"but received {}\".format(num_heads))\n\n        self.embed_dim = embed_dim\n        self.kdim = kdim if kdim is not None else embed_dim\n        self.vdim = vdim if vdim is not None else embed_dim\n        self.num_heads = num_heads\n        self.dropout = dropout\n        self.need_weights = need_weights\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        self.q_proj = Linear(\n            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)\n        self.k_proj = Linear(\n            self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)\n        self.v_proj = Linear(\n            self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)\n        self.out_proj = Linear(\n            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)\n\n    def _prepare_qkv(self, query, key, value, cache=None):\n        r\"\"\"\n        Prapares linear projected queries, keys and values for usage of subsequnt\n        multiple parallel attention. If `cache` is not None, using cached results\n        to reduce redundant calculations.\n\n        Parameters:\n            query (Tensor): The queries for multi-head attention. It is a\n                tensor with shape `[batch_size, query_length, embed_dim]`. The\n                data type should be float32 or float64.\n            key (Tensor): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If None, use `query` as\n                `key`.\n            value (Tensor): The values for multi-head attention. It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. 
If None, use `query` as\n                `value`.\n            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):\n                It is a namedtuple with `k` and `v` as fields, and stores tensors\n                shaped `[batch_size, num_heads, length, embed_dim]` which are results\n                of linear projection, reshape and transpose calculations in\n                MultiHeadAttention. If is an instance of `Cache`, `k` and `v`\n                fields reserve intermediate results of previous positions, which\n                mostly used for decoder self attention. If it is an instance of\n                `StaticCache`, `key` and `value` args would be ignored, `k` and\n                `v` fields would be used as calculated results on `key` and\n                `value`, which mostly used for decoder-encoder cross attention.\n                It is only used for inference and should be None for training.\n                Default None.\n\n        Returns:\n            tuple: A tuple including linear projected keys and values. 
These two \\\n                tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \\\n                and `[batch_size, n_head, sequence_length, d_value]` separately, \\\n                and their data types are same as inputs.\n        \"\"\"\n        q = self.q_proj(query)\n        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])\n        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])\n\n        if isinstance(cache, self.StaticCache):\n            # for encoder-decoder attention in inference and has cached\n            k, v = cache.k, cache.v\n        else:\n            k, v = self.compute_kv(key, value)\n\n        if isinstance(cache, self.Cache):\n            # for decoder self-attention in inference\n            k = tensor.concat([cache.k, k], axis=2)\n            v = tensor.concat([cache.v, v], axis=2)\n            cache = self.Cache(k, v)\n\n        return (q, k, v) if cache is None else (q, k, v, cache)\n\n    def compute_kv(self, key, value):\n        r\"\"\"\n        Applies linear projection on input keys and values, then splits heads\n        (reshape and transpose) to get keys and values from different representation\n        subspaces. The results are used as key-values pairs for subsequent multiple\n        parallel attention.\n\n        It is part of calculations in multi-head attention, and is provided as\n        a method to pre-compute and prefetch these results, thus we can use them\n        to construct cache for inference.\n\n        Parameters:\n            key (Tensor): The keys for multi-head attention. It is a tensor\n                with shape `[batch_size, sequence_length, kdim]`. The data type\n                should be float32 or float64.\n            value (Tensor): The values for multi-head attention. It is a tensor\n                with shape `[batch_size, sequence_length, vdim]`. 
The data type\n                should be float32 or float64.\n\n        Returns:\n            tuple: A tuple including transformed keys and values. Their shapes \\\n                both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \\\n                and their data types are same as inputs.\n        \"\"\"\n        k = self.k_proj(key)\n        v = self.v_proj(value)\n        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])\n        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])\n        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])\n        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])\n        return k, v\n\n    def gen_cache(self, key, value=None, type=Cache):\n        \"\"\"\n        Generates cache for `forward` usage in inference accroding to arguments.\n        The generated cache is an instance of `MultiHeadAttention.Cache` or an\n        instance of `MultiHeadAttention.StaticCache`.\n\n        `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields,\n        and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]`\n        which are results of linear projection, reshape and transpose calculations\n        in MultiHeadAttention.\n\n        If the generated cache is an instance of `Cache`, `k` and `v` fields\n        reserve intermediate result tensors of previous positions, and the tensors\n        are incremental among decoding steps, which mostly are used for decoder\n        decoder self attention.\n\n        If the generated cache is an instance of `StaticCache`, `k` and `v` fields\n        would be used as calculated result tensors on keys an values in `forward`,\n        and the tensors keep unchanged among decoding steps, which are mostly used\n        for decoder-encoder cross attention.\n\n        The cache is generated as follows:\n\n        1. 
If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the\n        results to create an instance of `StaticCache`.\n\n        2. If `type` is `Cache` and `value` is None, generate empty tensors shaped\n        `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results\n        to create an instance of `Cache`, where `batch_size` is from the first\n        dimension of `key`.\n\n        3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create\n        an instance of `Cache`.\n\n        Parameters:\n            key (Tensor): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If `value` is None,\n                it is only for batch size and data type reference.\n            value (Tensor, optional): The values for multi-head attention. It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. If None, `key` is only\n                for batch size reference. 
Default None.\n            type (type): It should be `MultiHeadAttention.StaticCache` or\n                `MultiHeadAttention.Cache` to indicate the cache type to generate.\n\n        Returns:\n            namedtuple: an instance of `Cache` or `StaticCache` accordingly.\n        \"\"\"\n        if type == MultiHeadAttention.StaticCache:  # static_kv\n            k, v = self.compute_kv(key, value)\n            return self.StaticCache(k, v)\n        elif value is None:  # incremental_state\n            k = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            v = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            return self.Cache(k, v)\n        else:\n            # incremental_state with initial value, mainly for usage like UniLM\n            return self.Cache(key, value)\n\n    def forward(self, query, key=None, value=None, attn_mask=None, cache=None):\n        r\"\"\"\n        Applies multi-head attention to map queries and a set of key-value pairs\n        to outputs.\n\n        Parameters:\n            query (Tensor): The queries for multi-head attention. It is a\n                tensor with shape `[batch_size, query_length, embed_dim]`. The\n                data type should be float32 or float64.\n            key (Tensor, optional): The keys for multi-head attention. It is\n                a tensor with shape `[batch_size, key_length, kdim]`. The\n                data type should be float32 or float64. If None, use `query` as\n                `key`. Default None.\n            value (Tensor, optional): The values for multi-head attention. 
It\n                is a tensor with shape `[batch_size, value_length, vdim]`.\n                The data type should be float32 or float64. If None, use `query` as\n                `value`. Default None.\n            attn_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):\n                It is a namedtuple with `k` and `v` as fields, and stores tensors\n                shaped `[batch_size, num_heads, length, embed_dim]` which are results\n                of linear projection, reshape and transpose calculations in\n                MultiHeadAttention. If it is an instance of `Cache`, `k` and `v`\n                fields reserve intermediate results of previous positions, which\n                mostly used for decoder self attention. 
If it is an instance of\n                `StaticCache`, `key` and `value` args would be ignored, `k` and\n                `v` fields would be used as calculated results on `key` and\n                `value`, which mostly used for decoder-encoder cross attention.\n                It is only used for inference and should be None for training.\n                Default None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `query`, representing attention output. Or a tuple if \\\n                `need_weights` is True or `cache` is not None. If `need_weights` \\\n                is True, except for attention output, the tuple also includes \\\n                the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \\\n                If `cache` is not None, the tuple then includes the new cache \\\n                having the same type as `cache`, and if it is `StaticCache`, it \\\n                is same as the input `cache`, if it is `Cache`, the new cache \\\n                reserves tensors concatanating raw tensors with intermediate \\\n                results of current query.\n        \"\"\"\n        key = query if key is None else key\n        value = query if value is None else value\n        # compute q ,k ,v\n        if cache is None:\n            q, k, v = self._prepare_qkv(query, key, value, cache)\n        else:\n            q, k, v, cache = self._prepare_qkv(query, key, value, cache)\n\n        # scale dot product attention\n        product = paddle.matmul(\n            x=q * (self.head_dim**-0.5), y=k, transpose_y=True)\n        if attn_mask is not None:\n            # Support bool or int mask\n            attn_mask = _convert_attention_mask(attn_mask, product.dtype)\n            product = product + attn_mask\n        weights = F.softmax(product)\n        if self.dropout:\n            weights = F.dropout(\n                weights,\n                
self.dropout,\n                training=self.training,\n                mode=\"upscale_in_train\")\n\n        out = paddle.matmul(weights, v)\n\n        # combine heads\n        out = tensor.transpose(out, perm=[0, 2, 1, 3])\n        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])\n\n        # project to output\n        out = self.out_proj(out)\n\n        outs = [out]\n        if self.need_weights:\n            outs.append(weights)\n        if cache is not None:\n            outs.append(cache)\n        return out if len(outs) == 1 else tuple(outs)\n\n\nclass TransformerEncoderLayer(Layer):\n    \"\"\"\n    TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)\n    attention and feedforward network. Before and after each sub-layer, pre-process\n    and post-precess would be applied on the input and output accordingly. If\n    `normalize_before` is True, pre-process is layer normalization and post-precess\n    includes dropout, residual connection. Otherwise, no pre-process and post-precess\n    includes dropout, residual connection, layer normalization.\n\n    Parameters:\n        d_model (int): The expected feature size in the input and output.\n        nhead (int): The number of heads in multi-head attention(MHA).\n        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).\n        dropout (float, optional): The dropout probability used in pre-process\n            and post-precess of MHA and FFN sub-layer. Default 0.1\n        activation (str, optional): The activation function in the feedforward\n            network. Default relu.\n        attn_dropout (float, optional): The dropout probability used\n            in MHA to drop some attention target. If None, use the value of\n            `dropout`. Default None\n        act_dropout (float, optional): The dropout probability used after FFN\n            activition.  If None, use the value of `dropout`. 
Default None\n        normalize_before (bool, optional): Indicate whether to put layer normalization\n            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer\n            normalization and post-precess includes dropout, residual connection.\n            Otherwise, no pre-process and post-precess includes dropout, residual\n            connection, layer normalization. Default False\n        weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.\n            If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for\n            MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN.\n            Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.\n            Default: None, which means the default weight parameter property is used.\n            See usage for details in :code:`ParamAttr` .\n        bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.\n            If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for\n            MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.\n            Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.\n            The `False` value means the corresponding layer would not have trainable\n            bias parameter. See usage for details in :code:`ParamAttr` . Default: None,\n            which means the default bias parameter property is used.\n\n\n    Examples:\n\n        .. 
code-block:: python\n\n            import paddle\n            from paddle.nn import TransformerEncoderLayer\n\n            # encoder input: [batch_size, src_len, d_model]\n            enc_input = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, n_head, src_len, src_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            encoder_layer = TransformerEncoderLayer(128, 2, 512)\n            enc_output = encoder_layer(enc_input, attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward,\n                 dropout=0.1,\n                 activation=\"relu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=False,\n                 weight_attr=None,\n                 bias_attr=None):\n        self._config = locals()\n        self._config.pop(\"self\")\n        self._config.pop(\"__class__\", None)  # py3\n\n        super(TransformerEncoderLayer, self).__init__()\n\n        assert d_model > 0, (\"Expected d_model to be greater than 0, \"\n                             \"but received {}\".format(d_model))\n        assert nhead > 0, (\"Expected nhead to be greater than 0, \"\n                           \"but received {}\".format(nhead))\n        assert dim_feedforward > 0, (\n            \"Expected dim_feedforward to be greater than 0, \"\n            \"but received {}\".format(dim_feedforward))\n\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n\n        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)\n        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)\n\n        self.self_attn = MultiHeadAttention(\n            d_model,\n            nhead,\n            dropout=attn_dropout,\n            weight_attr=weight_attrs[0],\n          
  bias_attr=bias_attrs[0])\n        self.linear1 = Linear(\n            d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1])\n        self.dropout = Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.linear2 = Linear(\n            dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1])\n        self.norm1 = LayerNorm(d_model)\n        self.norm2 = LayerNorm(d_model)\n        self.dropout1 = Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = Dropout(dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n\n    def forward(self, src, src_mask=None, cache=None, output_attentions=False):\n        r\"\"\"\n        Applies a Transformer encoder layer on the input.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder layer. It is\n                a tensor with shape `[batch_size, sequence_length, d_model]`.\n                The data type should be float32 or float64.\n            src_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                paddings or the subsequent positions. It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n            cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.\n                See `TransformerEncoderLayer.gen_cache` for more details. 
It is\n                only used for inference and should be None for training. Default\n                None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `enc_input`, representing the output of Transformer encoder \\\n                layer. Or a tuple if `cache` is not None, except for encoder \\\n                layer output, the tuple includes the new cache which is same \\\n                as input `cache` argument but `incremental_cache` has an \\\n                incremental length. See `MultiHeadAttention.gen_cache` and \\\n                `MultiHeadAttention.forward` for more details.\n        \"\"\"\n        self.self_attn.need_weights = output_attentions\n        src_mask = _convert_attention_mask(src_mask, src.dtype)\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm1(src)\n\n        attn_outputs = self.self_attn(src, src, src, src_mask, cache)\n        if isinstance(attn_outputs, tuple):\n            src = attn_outputs[0]\n            outputs = attn_outputs[1:]\n        else:\n            src = attn_outputs\n            outputs = None\n\n        src = residual + self.dropout1(src)\n        if not self.normalize_before:\n            src = self.norm1(src)\n\n        residual = src\n        if self.normalize_before:\n            src = self.norm2(src)\n        src = self.linear2(self.dropout(self.activation(self.linear1(src))))\n        src = residual + self.dropout2(src)\n        if not self.normalize_before:\n            src = self.norm2(src)\n\n        return src if outputs is None else (\n            (src, ) + outputs[::-1])  # hidden_states, cache, attentions\n\n    def gen_cache(self, src):\n        r\"\"\"\n        Generates cache for `forward` usage. The generated cache is an\n        instance of `MultiHeadAttention.Cache`.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder. 
It is a tensor\n                with shape `[batch_size, source_length, d_model]`. The data\n                type should be float32 or float64.\n\n        Returns:\n            incremental_cache: It is an instance of `MultiHeadAttention.Cache` \\\n                produced by `self_attn.gen_cache`, it reserves two tensors\n                shaped `[batch_size, nhead, 0, d_model // nhead]`. See \\\n                `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \\\n                for more details.\n        \"\"\"\n        incremental_cache = self.self_attn.gen_cache(\n            src, type=self.self_attn.Cache)\n        return incremental_cache\n\n\nclass TransformerEncoder(Layer):\n    \"\"\"\n    TransformerEncoder is a stack of N encoder layers.\n\n    Parameters:\n        encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It\n            would be used as the first layer, and the other layers would be created\n            according to the configurations of it.\n        num_layers (int): The number of encoder layers to be stacked.\n        norm (LayerNorm, optional): the layer normalization component. If provided,\n            apply layer normalization on the output of last encoder layer.\n\n    Examples:\n\n        .. 
code-block:: python\n\n            import paddle\n            from paddle.nn import TransformerEncoderLayer, TransformerEncoder\n\n            # encoder input: [batch_size, src_len, d_model]\n            enc_input = paddle.rand((2, 4, 128))\n            # self attention mask: [batch_size, n_head, src_len, src_len]\n            attn_mask = paddle.rand((2, 2, 4, 4))\n            encoder_layer = TransformerEncoderLayer(128, 2, 512)\n            encoder = TransformerEncoder(encoder_layer, 2)\n            enc_output = encoder(enc_input, attn_mask)  # [2, 4, 128]\n    \"\"\"\n\n    def __init__(self,\n                 encoder_layer,\n                 num_layers,\n                 norm=None,\n                 enable_recompute=False):\n        super(TransformerEncoder, self).__init__()\n        self.layers = LayerList([(encoder_layer if i == 0 else\n                                  type(encoder_layer)(**encoder_layer._config))\n                                 for i in range(num_layers)])\n        self.num_layers = num_layers\n        self.norm = norm\n        self.enable_recompute = enable_recompute\n\n    def forward(self,\n                src,\n                src_mask=None,\n                cache=None,\n                output_attentions=False,\n                output_hidden_states=False,\n                return_dict=False):\n        r\"\"\"\n        Applies a stack of N Transformer encoder layers on inputs. If `norm` is\n        provided, also applies layer normalization on the output of last encoder\n        layer.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder. It is a tensor\n                with shape `[batch_size, sequence_length, d_model]`. The data\n                type should be float32 or float64.\n            src_mask (Tensor, optional): A tensor used in multi-head attention\n                to prevents attention to some unwanted positions, usually the\n                paddings or the subsequent positions. 
It is a tensor with shape\n                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.\n                When the data type is bool, the unwanted positions have `False`\n                values and the others have `True` values. When the data type is\n                int, the unwanted positions have 0 values and the others have 1\n                values. When the data type is float, the unwanted positions have\n                `-INF` values and the others have 0 values. It can be None when\n                nothing wanted or needed to be prevented attention to. Default None.\n            cache (list, optional): It is a list, and each element in the list\n                is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`.\n                See `TransformerEncoder.gen_cache` for more details. It is only\n                used for inference and should be None for training. Default None.\n\n        Returns:\n            Tensor|tuple: It is a tensor that has the same shape and data type \\\n                as `src`, representing the output of Transformer encoder. \\\n                Or a tuple if `cache` is not None, except for encoder output, \\\n                the tuple includes the new cache which is same as input `cache` \\\n                argument but `incremental_cache` in it has an incremental length. \\\n                See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \\\n                for more details.\n        \"\"\"\n        src_mask = _convert_attention_mask(src_mask, src.dtype)\n\n        output = src\n        # To get cache from None when use_cache is True, which is compatible with HF\n        # while HF requires decoder. 
The implementation here uses cache update in the\n        # MultiHeadAttention not so efficiently, and maybe optimize it later.\n        if cache is None and getattr(self, \"_use_cache\", False):\n            cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers)\n        # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts\n        # to True when cache is not None.\n        new_caches = [] if cache is not None and getattr(self, \"_use_cache\",\n                                                         True) else None\n        all_attentions = [] if output_attentions else None\n        # NOTE: Also includes embeding output which is same as HF.\n        all_hidden_states = [output] if output_hidden_states else None\n        for i, mod in enumerate(self.layers):\n            if self.enable_recompute:\n                # Note: recompute do not support pass as **kwargs yet.\n                layer_outputs = recompute(\n                    mod, output, src_mask, None if cache is None else cache[i]\n                    if isinstance(cache[i], MultiHeadAttention.Cache) else\n                    MultiHeadAttention.Cache(*cache[i]), output_attentions)\n            else:\n                layer_outputs = mod(\n                    output,\n                    src_mask=src_mask,\n                    cache=None if cache is None else cache[i]\n                    if isinstance(cache[i], MultiHeadAttention.Cache) else\n                    MultiHeadAttention.Cache(*cache[i]),\n                    output_attentions=output_attentions)\n\n            if isinstance(layer_outputs, tuple):\n                output = layer_outputs[0]\n                outputs = layer_outputs[1:]\n            else:\n                output = layer_outputs\n                outputs = None\n\n            if output_hidden_states:\n                all_hidden_states.append(output)\n            if output_attentions:\n                all_attentions.append(outputs[-1])\n            if 
new_caches is not None:\n                new_caches.append(outputs[0] if isinstance(cache[\n                    i], MultiHeadAttention.Cache) else (tuple(outputs[0])))\n\n        if self.norm is not None:\n            output = self.norm(output)\n\n            if output_hidden_states:\n                all_hidden_states[-1] = output\n\n        if not return_dict:\n            return output\n\n        return BaseModelOutputWithPastAndCrossAttentions(\n            last_hidden_state=output,\n            past_key_values=new_caches,\n            hidden_states=all_hidden_states,\n            attentions=all_attentions)\n\n    def gen_cache(self, src):\n        r\"\"\"\n        Generates cache for `forward` usage. The generated cache is a list, and\n        each element in it is `incremental_cache` produced by\n        `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache`\n        for more details.\n\n        Parameters:\n            src (Tensor): The input of Transformer encoder. It is a tensor\n                with shape `[batch_size, source_length, d_model]`. The data type\n                should be float32 or float64.\n\n        Returns:\n            list: It is a list, and each element in the list is `incremental_cache`\n            produced by `TransformerEncoderLayer.gen_cache`. See\n            `TransformerEncoderLayer.gen_cache` for more details.\n        \"\"\"\n        cache = [layer.gen_cache(src) for layer in self.layers]\n        return cache\n"
  },
  {
    "path": "ppfleetx/models/language_model/ernie/layers/utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport functools\nimport inspect\nimport warnings\n\nimport paddle\nfrom paddle.nn import Layer\n\n\ndef fn_args_to_dict(func, *args, **kwargs):\n    \"\"\"\n    Inspect function `func` and its arguments for running, and extract a\n    dict mapping between argument names and keys. \n    \"\"\"\n    if hasattr(inspect, 'getfullargspec'):\n        (spec_args, spec_varargs, spec_varkw, spec_defaults, _, _,\n         _) = inspect.getfullargspec(func)\n    else:\n        (spec_args, spec_varargs, spec_varkw,\n         spec_defaults) = inspect.getargspec(func)\n    # add positional argument values\n    init_dict = dict(zip(spec_args, args))\n    # add default argument values\n    kwargs_dict = dict(zip(spec_args[-len(spec_defaults):],\n                           spec_defaults)) if spec_defaults else {}\n    for k in list(kwargs_dict.keys()):\n        if k in init_dict:\n            kwargs_dict.pop(k)\n    kwargs_dict.update(kwargs)\n    init_dict.update(kwargs_dict)\n    return init_dict\n\n\ndef adapt_stale_fwd_patch(self, name, value):\n    \"\"\"\n    Since there are some monkey patches for forward of PretrainedModel, such as\n    model compression, we make these patches compatible with the latest forward\n    method.\n    \"\"\"\n    if name == \"forward\":\n        # NOTE(guosheng): In dygraph to static, `layer.forward` would be 
patched\n        # by an instance of `StaticFunction`. And use string compare to avoid to\n        # import fluid.\n        if type(value).__name__.endswith('StaticFunction'):\n            return value\n        if hasattr(inspect, 'getfullargspec'):\n            (patch_spec_args, patch_spec_varargs, patch_spec_varkw,\n             patch_spec_defaults, _, _, _) = inspect.getfullargspec(value)\n            (spec_args, spec_varargs, spec_varkw, spec_defaults, _, _,\n             _) = inspect.getfullargspec(self.forward)\n        else:\n            (patch_spec_args, patch_spec_varargs, patch_spec_varkw,\n             patch_spec_defaults) = inspect.getargspec(value)\n            (spec_args, spec_varargs, spec_varkw,\n             spec_defaults) = inspect.getargspec(self.forward)\n        new_args = [\n            arg\n            for arg in ('output_hidden_states', 'output_attentions',\n                        'return_dict')\n            if arg not in patch_spec_args and arg in spec_args\n        ]\n\n        if new_args:\n            if self.__module__.startswith(\"paddlenlp\"):\n                warnings.warn(\n                    f\"The `forward` method of {self.__class__ if isinstance(self, Layer) else self} is patched and the patch \"\n                    \"might be based on an old oversion which missing some \"\n                    f\"arguments compared with the latest, such as {new_args}. \"\n                    \"We automatically add compatibility on the patch for \"\n                    \"these arguemnts, and maybe the patch should be updated.\")\n            else:\n                warnings.warn(\n                    f\"The `forward` method of {self.__class__ if isinstance(self, Layer) else self} \"\n                    \"is patched and the patch might be conflict with patches made \"\n                    f\"by paddlenlp which seems have more arguments such as {new_args}. 
\"\n                    \"We automatically add compatibility on the patch for \"\n                    \"these arguemnts, and maybe the patch should be updated.\")\n            if isinstance(self, Layer) and inspect.isfunction(value):\n\n                @functools.wraps(value)\n                def wrap_fwd(*args, **kwargs):\n                    for arg in new_args:\n                        kwargs.pop(arg, None)\n                    return value(self, *args, **kwargs)\n            else:\n\n                @functools.wraps(value)\n                def wrap_fwd(*args, **kwargs):\n                    for arg in new_args:\n                        kwargs.pop(arg, None)\n                    return value(*args, **kwargs)\n\n            return wrap_fwd\n    return value\n\n\nclass InitTrackerMeta(type(Layer)):\n    \"\"\"\n    This metaclass wraps the `__init__` method of a class to add `init_config`\n    attribute for instances of that class, and `init_config` use a dict to track\n    the initial configuration. 
If the class has `_pre_init` or `_post_init`\n    method, it would be hooked before or after `__init__` and called as\n    `_pre_init(self, init_fn, init_args)` or `_post_init(self, init_fn, init_args)`.\n    Since InitTrackerMeta would be used as metaclass for pretrained model classes,\n    which always are Layer and `type(Layer)` is not `type`, thus use `type(Layer)`\n    rather than `type` as base class for it to avoid inheritance metaclass\n    conflicts.\n    \"\"\"\n\n    def __init__(cls, name, bases, attrs):\n        init_func = cls.__init__\n        # If attrs has `__init__`, wrap it using accessable `_pre_init, _post_init`.\n        # Otherwise, no need to wrap again since the super cls has been wraped.\n        # TODO: remove reduplicated tracker if using super cls `__init__`\n        pre_init_func = getattr(cls, '_pre_init',\n                                None) if '__init__' in attrs else None\n        post_init_func = getattr(cls, '_post_init',\n                                 None) if '__init__' in attrs else None\n        cls.__init__ = InitTrackerMeta.init_and_track_conf(\n            init_func, pre_init_func, post_init_func)\n        super(InitTrackerMeta, cls).__init__(name, bases, attrs)\n\n    @staticmethod\n    def init_and_track_conf(init_func, pre_init_func=None,\n                            post_init_func=None):\n        \"\"\"\n        wraps `init_func` which is `__init__` method of a class to add `init_config`\n        attribute for instances of that class.\n        Args:\n            init_func (callable): It should be the `__init__` method of a class.\n            pre_init_func (callable, optional): If provided, it would be hooked after\n                `init_func` and called as `pre_init_func(self, init_func, *init_args, **init_args)`.\n                Default None.\n            post_init_func (callable, optional): If provided, it would be hooked after\n                `init_func` and called as `post_init_func(self, init_func, 
*init_args, **init_args)`.\n                Default None.\n        \n        Returns:\n            function: the wrapped function\n        \"\"\"\n\n        @functools.wraps(init_func)\n        def __impl__(self, *args, **kwargs):\n            # registed helper by `pre_init_func`\n            if pre_init_func:\n                pre_init_func(self, init_func, *args, **kwargs)\n            # keep full configuration\n            init_func(self, *args, **kwargs)\n            # registed helper by `post_init_func`\n            if post_init_func:\n                post_init_func(self, init_func, *args, **kwargs)\n            self.init_config = kwargs\n            if args:\n                kwargs['init_args'] = args\n            kwargs['init_class'] = self.__class__.__name__\n\n        return __impl__\n\n    def __setattr__(self, name, value):\n        value = adapt_stale_fwd_patch(self, name, value)\n        return super(InitTrackerMeta, self).__setattr__(name, value)\n"
  },
  {
    "path": "ppfleetx/models/language_model/gpt/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .dygraph.hybrid_model import (\n    GPTModelHybrid, GPTForPretrainingPipe, GPTPretrainingCriterionHybird,\n    GPTForPretrainingHybrid, GPTForGenerationHybrid)\nfrom .auto.auto_model import (GPTModelAuto, GPTForPretrainingAuto,\n                              GPTPretrainingCriterionAuto,\n                              GPTForGenerationAuto)\n\nfrom .dygraph.single_model import GPTForPretraining, GPTPretrainingCriterion, GPTModel, GPTForGeneration, GPTForSequenceClassification\n"
  },
  {
    "path": "ppfleetx/models/language_model/gpt/auto/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/language_model/gpt/auto/auto_model.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport collections\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport paddle.tensor as tensor\nimport paddle.incubate as incubate\nimport paddle.distributed.auto_parallel as auto\n\nfrom paddle.fluid import layers\nfrom paddle.common_ops_import import convert_dtype\nfrom paddle.nn.layer.transformer import _convert_param_attr_to_list\nfrom paddle.distributed.fleet.meta_parallel import get_rng_state_tracker\nfrom ..dygraph.processor import (\n    LogitsProcessorList, MinLengthLogitsProcessor,\n    HammingDiversityLogitsProcessor, RepetitionPenaltyLogitsProcessor,\n    ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor)\n\n\nclass MultiHeadAttention(nn.Layer):\n    \"\"\"\n    Attention mapps queries and a set of key-value pairs to outputs, and\n    Multi-Head Attention performs multiple parallel attention to jointly attending\n    to information from different representation subspaces.\n\n    \"\"\"\n\n    Cache = collections.namedtuple(\"Cache\", [\"k\", \"v\"])\n    StaticCache = collections.namedtuple(\"StaticCache\", [\"k\", \"v\"])\n\n    def __init__(self,\n                 embed_dim,\n                 num_heads,\n                 dropout=0.,\n               
  kdim=None,\n                 vdim=None,\n                 need_weights=False,\n                 weight_attr=None,\n                 bias_attr=None,\n                 fuse_attn_qkv=False,\n                 use_recompute=False,\n                 recompute_granularity=\"full\",\n                 mesh=None,\n                 mesh_idx=None):\n        super(MultiHeadAttention, self).__init__()\n        self.embed_dim = embed_dim\n        self.kdim = kdim if kdim is not None else embed_dim\n        self.vdim = vdim if vdim is not None else embed_dim\n        self.num_heads = num_heads\n        self.dropout = dropout\n        self.need_weights = need_weights\n        self.fuse_attn_qkv = fuse_attn_qkv\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n        self.mesh = mesh\n        self.mesh_idx = mesh_idx\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        if self.fuse_attn_qkv:\n            assert self.kdim == embed_dim\n            assert self.vdim == embed_dim\n            self.qkv_proj = nn.Linear(\n                embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr)\n        else:\n            self.q_proj = nn.Linear(\n                embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)\n            self.k_proj = nn.Linear(\n                self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)\n            self.v_proj = nn.Linear(\n                self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)\n        self.out_proj = nn.Linear(\n            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)\n\n    def _fuse_prepare_qkv(self, query, use_cache=False, cache=None):\n        auto.shard_tensor(self.qkv_proj.weight, self.mesh[self.mesh_idx],\n                          [None, self.mesh.mp])\n\n        mix_layer = self.qkv_proj(query)\n        mix_layer = 
paddle.reshape_(mix_layer,\n                                    [0, 0, self.num_heads, 3 * self.head_dim])\n        mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3])\n        q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1)\n\n        assert not isinstance(\n            cache, self.StaticCache\n        ), \"cache currently does not support the StaticCache type\"\n\n        if isinstance(cache, self.Cache):\n            # for decoder self-attention in inference\n            k = tensor.concat([cache.k, k], axis=2)\n            v = tensor.concat([cache.v, v], axis=2)\n        if use_cache is True:\n            cache = self.Cache(k, v)\n\n        return (q, k, v) if use_cache is False else (q, k, v, cache)\n\n    def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):\n        r\"\"\"\n        Prapares linear projected queries, keys and values for usage of subsequnt\n        multiple parallel attention. If `cache` is not None, using cached results\n        to reduce redundant calculations.\n\n        \"\"\"\n        auto.shard_tensor(self.q_proj.weight, self.mesh[self.mesh_idx],\n                          [None, self.mesh.mp])\n\n        q = self.q_proj(query)\n        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])\n        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])\n\n        if isinstance(cache, self.StaticCache):\n            # for encoder-decoder attention in inference and has cached\n            k, v = cache.k, cache.v\n        else:\n            k, v = self.compute_kv(key, value)\n\n        if isinstance(cache, self.Cache):\n            # for decoder self-attention in inference\n            k = tensor.concat([cache.k, k], axis=2)\n            v = tensor.concat([cache.v, v], axis=2)\n        if use_cache is True:\n            cache = self.Cache(k, v)\n\n        return (q, k, v) if use_cache is False else (q, k, v, cache)\n\n    def compute_kv(self, key, value):\n        r\"\"\"\n        Applies linear 
projection on input keys and values, then splits heads\n        (reshape and transpose) to get keys and values from different representation\n        subspaces. The results are used as key-values pairs for subsequent multiple\n        parallel attention.\n\n        It is part of calculations in multi-head attention, and is provided as\n        a method to pre-compute and prefetch these results, thus we can use them\n        to construct cache for inference.\n\n        \"\"\"\n        auto.shard_tensor(self.k_proj.weight, self.mesh[self.mesh_idx],\n                          [None, self.mesh.mp])\n        auto.shard_tensor(self.v_proj.weight, self.mesh[self.mesh_idx],\n                          [None, self.mesh.mp])\n\n        k = self.k_proj(key)\n        v = self.v_proj(value)\n        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])\n        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])\n        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])\n        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])\n        return k, v\n\n    def gen_cache(self, key, value=None, type=Cache):\n        \"\"\"\n        Generates cache for `forward` usage in inference accroding to arguments.\n        The generated cache is an instance of `MultiHeadAttention.Cache` or an\n        instance of `MultiHeadAttention.StaticCache`.\n        \"\"\"\n        if type == MultiHeadAttention.StaticCache:  # static_kv\n            k, v = self.compute_kv(key, value)\n            return self.StaticCache(k, v)\n        elif value is None:  # incremental_state\n            k = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            v = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n  
          return self.Cache(k, v)\n        else:\n            # incremental_state with initial value, mainly for usage like UniLM\n            return self.Cache(key, value)\n\n    def core_attn(self, q, k, v, attn_mask=None):\n        # scale dot product attention\n        product = paddle.matmul(\n            x=q, y=k, transpose_y=True) * self.head_dim**-0.5\n\n        if attn_mask is not None:\n            product = product + attn_mask\n            weights = F.softmax(product)\n        else:\n            weights = incubate.softmax_mask_fuse_upper_triangle(product)\n\n        if self.dropout:\n            # with get_rng_state_tracker().rng_state('local_seed'):\n            weights = F.dropout(\n                weights,\n                self.dropout,\n                training=self.training,\n                mode=\"upscale_in_train\")\n\n        out = paddle.matmul(weights, v)\n\n        # combine heads\n        out = tensor.transpose(out, perm=[0, 2, 1, 3])\n        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])\n\n        return out, weights\n\n    def forward(self,\n                query,\n                key,\n                value,\n                attn_mask=None,\n                use_cache=False,\n                cache=None):\n        r\"\"\"\n        Applies multi-head attention to map queries and a set of key-value pairs\n        to outputs.\n        \"\"\"\n        key = query if key is None else key\n        value = query if value is None else value\n        # compute q ,k ,v\n        if use_cache is False:\n            if self.fuse_attn_qkv:\n                q, k, v = self._fuse_prepare_qkv(query, use_cache, cache)\n            else:\n                q, k, v = self._prepare_qkv(query, key, value, use_cache,\n                                            cache)\n        else:\n            if self.fuse_attn_qkv:\n                q, k, v, cache = self._fuse_prepare_qkv(query, use_cache,\n                                                  
      cache)\n            else:\n                q, k, v, cache = self._prepare_qkv(query, key, value,\n                                                   use_cache, cache)\n\n        if self.use_recompute and self.recompute_granularity == \"core_attn\":\n            out, weights = auto.recompute(self.core_attn)(q,\n                                                          k,\n                                                          v,\n                                                          attn_mask=attn_mask)\n        else:\n            out, weights = self.core_attn(q, k, v, attn_mask=attn_mask)\n\n        auto.shard_tensor(self.out_proj.weight, self.mesh[self.mesh_idx],\n                          [self.mesh.mp, None])\n\n        # project to output\n        out = self.out_proj(out)\n\n        outs = [out]\n        if self.need_weights:\n            outs.append(weights)\n        if use_cache:\n            outs.append(cache)\n        return out if len(outs) == 1 else tuple(outs)\n\n\nclass TransformerDecoder(nn.Layer):\n    \"\"\"\n    TransformerDecoder is a stack of N decoder layers.\n    \"\"\"\n\n    def __init__(self,\n                 decoder_layers,\n                 num_layers,\n                 norm=None,\n                 hidden_size=None,\n                 use_recompute=False,\n                 recompute_granularity=\"full\"):\n        super(TransformerDecoder, self).__init__()\n\n        self.num_layers = num_layers\n        self.layers = decoder_layers\n        self.norm = norm\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n        if norm == \"LayerNorm\":\n            self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5)\n        elif norm is not None:\n            raise ValueError(\"Only support LayerNorm\")\n\n    def forward(self,\n                tgt,\n                memory,\n                tgt_mask=None,\n                memory_mask=None,\n                use_cache=False,\n           
     cache=None):\n        r\"\"\"\n        Applies a stack of N Transformer decoder layers on inputs. If `norm` is\n        provided, also applies layer normalization on the output of last decoder\n        layer.\n        \"\"\"\n        output = tgt\n        new_caches = []\n\n        for i, mod in enumerate(self.layers):\n            auto.shard_tensor(\n                output, mod.mesh[mod.mesh_idx],\n                [mod.mesh.dp] + [None for i in range(len(output.shape) - 1)])\n\n            if cache is None:\n                if use_cache:\n                    output, new_cache = mod(output,\n                                            memory,\n                                            tgt_mask=tgt_mask,\n                                            use_cache=use_cache,\n                                            cache=cache)\n                    new_caches.append(new_cache)\n                else:\n                    if self.use_recompute and self.recompute_granularity == \"full\":\n                        output = auto.recompute(mod)(output, memory, tgt_mask,\n                                                     use_cache, cache)\n                    else:\n                        output = mod(output, memory, tgt_mask, use_cache,\n                                     cache)\n            else:\n                output, new_cache = mod(output,\n                                        memory,\n                                        tgt_mask=tgt_mask,\n                                        use_cache=use_cache,\n                                        cache=cache[i])\n                new_caches.append(new_cache)\n\n        if self.norm is not None:\n            output = self.norm(output)\n        return output if use_cache is False else (output, new_caches)\n\n    def gen_cache(self, memory, do_zip=False):\n        r\"\"\"\n        Generates cache for `forward` usage. 
The generated cache is a list, and\n        each element in it is a tuple( :code:`(incremental_cache, static_cache)` )\n        produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache`\n        for more details. If `do_zip` is True, apply `zip` on these tuples to get\n        a list with two elements.\n       \"\"\"\n        cache = [layer.gen_cache(memory) for layer in self.layers]\n        if do_zip:\n            cache = list(zip(*cache))\n        return cache\n\n\nclass TransformerDecoderLayer(nn.Layer):\n    \"\"\"\n    The transformer decoder layer.\n\n    It contains multiheadattention and some linear layers.\n    \"\"\"\n\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward,\n                 dropout=0.1,\n                 activation=\"gelu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=True,\n                 weight_attr=None,\n                 bias_attr=None,\n                 fuse_attn_qkv=False,\n                 use_recompute=False,\n                 recompute_granularity=\"full\",\n                 mesh=None,\n                 mesh_idx=None):\n        self._config = locals()\n        self._config.pop(\"self\")\n        self._config.pop(\"__class__\", None)  # py3\n\n        super(TransformerDecoderLayer, self).__init__()\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n        self.mesh = mesh\n        self.mesh_idx = mesh_idx\n\n        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)\n        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)\n\n        self.self_attn = MultiHeadAttention(\n            d_model,\n            nhead,\n         
   dropout=attn_dropout,\n            weight_attr=weight_attrs[0],\n            bias_attr=bias_attrs[0],\n            fuse_attn_qkv=fuse_attn_qkv,\n            use_recompute=use_recompute,\n            recompute_granularity=recompute_granularity,\n            mesh=mesh,\n            mesh_idx=mesh_idx)\n\n        self.linear1 = nn.Linear(\n            d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2])\n\n        self.linear2 = nn.Linear(\n            dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2])\n\n        self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)\n        self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)\n        self.dropout1 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = nn.Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n\n    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):\n\n        auto.shard_tensor(self.linear1.weight, self.mesh[self.mesh_idx],\n                          [None, self.mesh.mp])\n        auto.shard_tensor(self.linear2.weight, self.mesh[self.mesh_idx],\n                          [self.mesh.mp, None])\n\n        residual = tgt\n\n        if self.normalize_before:\n            tgt = self.norm1(tgt)\n\n        if use_cache is False:\n            if self.use_recompute and self.recompute_granularity == \"full_attn\":\n                tgt = auto.recompute(self.self_attn)(tgt, tgt, tgt, tgt_mask,\n                                                     use_cache, cache)\n            else:\n                tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)\n        else:\n            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,\n                                                    use_cache, cache)\n\n        # with get_rng_state_tracker().rng_state('global_seed'):\n        tgt = residual + self.dropout1(tgt)\n\n        if not self.normalize_before:\n            tgt = 
self.norm1(tgt)\n\n        residual = tgt\n        if self.normalize_before:\n            tgt = self.norm2(tgt)\n\n        # with get_rng_state_tracker().rng_state('global_seed'):\n        tgt = self.dropout2(\n            self.linear2(F.gelu(\n                self.linear1(tgt), approximate=True)))\n\n        tgt = residual + tgt\n\n        if not self.normalize_before:\n            tgt = self.norm2(tgt)\n\n        return tgt if use_cache is False else (tgt, incremental_cache)\n\n    def gen_cache(self, memory):\n        incremental_cache = self.self_attn.gen_cache(\n            memory, type=self.self_attn.Cache)\n        return incremental_cache\n\n\nclass GPTEmbeddings(nn.Layer):\n    \"\"\"\n    Include embeddings from word and position embeddings.\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 hidden_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=16,\n                 initializer_range=0.02,\n                 mesh=None):\n        super(GPTEmbeddings, self).__init__()\n        self.mesh = mesh\n\n        self.word_embeddings = nn.Embedding(\n            vocab_size,\n            hidden_size,\n            weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(\n                mean=0.0, std=initializer_range)))\n\n        self.position_embeddings = nn.Embedding(\n            max_position_embeddings,\n            hidden_size,\n            weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(\n                mean=0.0, std=initializer_range)))\n\n        self.dropout = nn.Dropout(hidden_dropout_prob)\n\n    def forward(self, input_ids, position_ids=None):\n        if position_ids is None:\n            ones = paddle.ones_like(input_ids, dtype=\"int64\")\n            seq_length = paddle.cumsum(ones, axis=-1)\n            position_ids = seq_length - ones\n\n        auto.shard_tensor(self.word_embeddings.weight, self.mesh[0],\n  
                        [self.mesh.mp, None])\n\n        input_embedings = self.word_embeddings(input_ids)\n        position_embeddings = self.position_embeddings(position_ids)\n        embeddings = input_embedings + position_embeddings\n        embeddings = self.dropout(embeddings)\n        return embeddings\n\n\nclass GPTModelAuto(nn.Layer):\n    def __init__(self,\n                 vocab_size=51200,\n                 hidden_size=768,\n                 num_layers=12,\n                 num_attention_heads=12,\n                 ffn_hidden_size=3072,\n                 hidden_dropout_prob=0.1,\n                 attention_probs_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=16,\n                 initializer_range=0.02,\n                 fuse_attn_qkv=False,\n                 use_recompute=False,\n                 recompute_granularity=\"full\",\n                 mesh=None):\n\n        super(GPTModelAuto, self).__init__()\n\n        self.initializer_range = initializer_range\n        self.hidden_size = hidden_size\n        self.vocab_size = vocab_size\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n\n        if not mesh:\n            raise RuntimeError(\n                \"AutoPrallel modeling need `mesh` to annotate distributed attribute.\"\n            )\n        self.mesh = mesh\n\n        self.embeddings = GPTEmbeddings(\n            vocab_size, hidden_size, hidden_dropout_prob,\n            max_position_embeddings, type_vocab_size, self.initializer_range,\n            self.mesh)\n\n        stages = self.mesh.stages(num_layers)\n        decoder_layers = nn.LayerList()\n        for i in range(num_layers):\n            decoder_layers.append(\n                TransformerDecoderLayer(\n                    d_model=hidden_size,\n                    nhead=num_attention_heads,\n                    dim_feedforward=ffn_hidden_size,\n                    
dropout=hidden_dropout_prob,\n                    activation=\"gelu\",\n                    attn_dropout=attention_probs_dropout_prob,\n                    act_dropout=hidden_dropout_prob,\n                    weight_attr=paddle.ParamAttr(\n                        initializer=nn.initializer.Normal(\n                            mean=0.0, std=self.initializer_range)),\n                    bias_attr=None,\n                    fuse_attn_qkv=fuse_attn_qkv,\n                    use_recompute=use_recompute,\n                    recompute_granularity=recompute_granularity,\n                    mesh=self.mesh,\n                    mesh_idx=stages[i]))\n\n        self.decoder = TransformerDecoder(\n            decoder_layers,\n            num_layers,\n            norm=\"LayerNorm\",\n            hidden_size=hidden_size,\n            use_recompute=use_recompute,\n            recompute_granularity=recompute_granularity)\n\n    def forward(self,\n                input_ids,\n                position_ids=None,\n                attention_mask=None,\n                use_cache=False,\n                cache=None):\n        if position_ids is None:\n            past_length = 0\n            if cache is not None:\n                past_length = paddle.shape(attention_mask)[-1] - 1\n            position_ids = paddle.arange(\n                past_length,\n                paddle.shape(input_ids)[-1] + past_length,\n                dtype=input_ids.dtype)\n            position_ids = position_ids.unsqueeze(0)\n            # .expand_as(input_ids)\n            position_ids = paddle.expand_as(position_ids, input_ids)\n\n        input_ids.stop_gradient = True\n        position_ids.stop_gradient = True\n        auto.shard_tensor(\n            input_ids, self.mesh[0],\n            [self.mesh.dp] + [None for i in range(len(input_ids.shape) - 1)])\n\n        embedding_output = self.embeddings(\n            input_ids=input_ids, position_ids=position_ids)\n\n        if self.training == False:\n          
  # TODO, use registered buffer\n            causal_mask = paddle.tensor.triu(\n                paddle.ones(\n                    (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1]))\n                * -1e4,\n                diagonal=1)\n            if attention_mask is not None:\n                if len(attention_mask.shape) == 2:\n                    attention_mask = attention_mask[:, None, None, :]\n                attention_mask = attention_mask + causal_mask\n            else:\n                attention_mask = causal_mask\n            # The tensor returned by triu not in static graph.\n            attention_mask.stop_gradient = True\n\n        encoder_outputs = self.decoder(\n            embedding_output,\n            memory=None,\n            tgt_mask=None if self.training else\n            attention_mask,  # use softmax_mask_fuse_upper_triangle\n            use_cache=use_cache,\n            cache=cache)\n        return encoder_outputs\n\n\nclass GPTForPretrainingAuto(nn.Layer):\n    \"\"\"\n    GPT Model with pretraining tasks on top.\n\n    Args:\n        gpt (:class:`GPTModel`):\n            An instance of :class:`GPTModel`.\n\n    \"\"\"\n\n    def __init__(self, gpt):\n        super(GPTForPretrainingAuto, self).__init__()\n        self.gpt = gpt\n\n    def forward(self,\n                input_ids,\n                position_ids=None,\n                attention_mask=None,\n                masked_positions=None,\n                use_cache=False,\n                cache=None):\n\n        outputs = self.gpt(input_ids,\n                           position_ids=position_ids,\n                           attention_mask=attention_mask,\n                           use_cache=use_cache,\n                           cache=cache)\n        if use_cache:\n            encoder_outputs, cached_kvs = outputs[:2]\n        else:\n            encoder_outputs = outputs\n\n        x_dims_mapping = [self.gpt.mesh.dp] + [\n            None for i in 
range(len(encoder_outputs.shape) - 1)\n        ]\n        w_dims_mapping = [self.gpt.mesh.mp, None]\n        matmul = auto.shard_op(paddle.matmul, self.gpt.mesh[-1],\n                               [x_dims_mapping, w_dims_mapping, None])\n        logits = matmul(\n            encoder_outputs,\n            self.gpt.embeddings.word_embeddings.weight,\n            transpose_y=True)\n\n        if use_cache:\n            return logits, cached_kvs\n        else:\n            return logits\n\n\nclass GPTPretrainingCriterionAuto(nn.Layer):\n    \"\"\"\n    Criterion for GPT. It calculates the final loss.\n    \"\"\"\n\n    def __init__(self, mesh):\n        super(GPTPretrainingCriterionAuto, self).__init__()\n        self.mesh = mesh\n        self.loss_func = paddle.nn.CrossEntropyLoss(reduction=\"none\")\n\n    def forward(self, prediction_scores, masked_lm_labels, loss_mask):\n        \"\"\"\n        Args:\n            prediction_scores(Tensor):\n                The logits of masked token prediction. Its data type should be float32 and\n                its shape is [batch_size, sequence_length, vocab_size].\n            masked_lm_labels(Tensor):\n                The labels of the masked language modeling, the dimensionality of `masked_lm_labels`\n                is equal to `prediction_scores`. Its data type should be int64 and\n                its shape is [batch_size, sequence_length, 1].\n            loss_mask(Tensor):\n                Mask used for calculating the loss of the masked language modeling to avoid\n                calculating some unwanted tokens.\n                Its data type should be float32 and its shape is [batch_size, sequence_length, 1].\n\n        Returns:\n            Tensor: The pretraining loss. 
Its data type should be float32 and its shape is [1].\n\n        \"\"\"\n        masked_lm_labels.stop_gradient = True\n        loss_mask.stop_gradient = True\n        auto.shard_tensor(\n            loss_mask, self.mesh[-1],\n            [self.mesh.dp] + [None for i in range(len(loss_mask.shape) - 1)])\n\n        masked_lm_loss = self.loss_func(prediction_scores,\n                                        masked_lm_labels.unsqueeze(2))\n\n        loss_mask = loss_mask.reshape([-1])\n        masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask)\n        loss = masked_lm_loss / loss_mask.sum()\n        return loss\n\n\nclass GPTForGenerationAuto(nn.Layer):\n    \"\"\"\n    GPT Model with pretraining tasks on top.\n\n    Args:\n        gpt (:class:`GPTModel`):\n            An instance of :class:`GPTModel`.\n\n    \"\"\"\n\n    def __init__(self, gpt, configs):\n        super(GPTForGenerationAuto, self).__init__()\n        self.gpt = gpt\n        self.configs = configs\n\n        self.max_length = self.configs.get('max_dec_len', 20)\n        self.min_length = self.configs.get('min_dec_len', 0)\n        self.decode_strategy = self.configs.get('decode_strategy', 'sampling')\n        self.early_finish = self.configs.get('early_finish', True)\n        self.temperature = self.configs.get('temperature', 1.0)\n        self.top_k = self.configs.get('top_k', 0)\n        self.top_p = self.configs.get('top_p', 1.0)\n        self.use_topp_sampling = self.configs.get('use_topp_sampling', False)\n        self.inference = self.configs.get('inference', False)\n        self.repetition_penalty = self.configs.get('repetition_penalty', 1.0)\n        self.num_beams = self.configs.get('num_beams', 1)\n        self.num_beam_groups = self.configs.get('num_beam_groups', 1)\n        self.length_penalty = self.configs.get('length_penalty', 0.0)\n        self.early_stopping = self.configs.get('early_stopping', False)\n        self.bos_token_id = self.configs.get('bos_token_id', 
None)\n        self.eos_token_id = self.configs.get('eos_token_id', None)\n        self.pad_token_id = self.configs.get('pad_token_id', None)\n        self.decoder_start_token_id = self.configs.get(\n            'decoder_start_token_id', None)\n        self.forced_bos_token_id = self.configs.get('forced_bos_token_id',\n                                                    None)\n        self.forced_eos_token_id = self.configs.get('forced_eos_token_id',\n                                                    None)\n        self.num_return_sequences = self.configs.get('num_return_sequences', 1)\n        self.diversity_rate = self.configs.get('diversity_rate', 0.0)\n        self.use_cache = self.configs.get('use_cache', True)\n\n    def prepare_input_ids_for_generation(self,\n                                         bos_token_id,\n                                         encoder_output=None):\n        batch_size = 1\n        if bos_token_id is None:\n            raise ValueError(\"`bos_token_id` should be defined when no \"\n                             \"`input_ids` are provided.\")\n        if encoder_output is not None:\n            batch_size = encoder_output.shape[0]\n        return paddle.ones([batch_size, 1], dtype=\"int64\") * bos_token_id\n\n    def prepare_attention_mask_for_generation(self, input_ids, pad_token_id,\n                                              eos_token_id):\n        is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(\n            input_ids == pad_token_id).numpy().item()\n        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (\n            (eos_token_id is not None) and (pad_token_id != eos_token_id))\n        if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id:\n            attention_mask = (input_ids == pad_token_id\n                              ).astype(paddle.get_default_dtype()) * -1e9\n        else:\n            attention_mask = paddle.zeros_like(\n                input_ids, 
dtype=paddle.get_default_dtype())\n        return paddle.unsqueeze(attention_mask, axis=[1, 2])\n\n    def update_scores_for_generation(self, scores, next_scores, length,\n                                     unfinished_flag):\n        # update scores\n\n        unfinished_scores = (scores * length + next_scores) / (length + 1)\n        scores = paddle.where(unfinished_flag, unfinished_scores, scores)\n        return scores\n\n    def get_logits_processor(self,\n                             min_length=None,\n                             max_length=None,\n                             eos_token_id=None,\n                             forced_bos_token_id=None,\n                             forced_eos_token_id=None,\n                             num_beams=1,\n                             num_beam_groups=1,\n                             diversity_rate=0.0,\n                             repetition_penalty=None):\n        processors = LogitsProcessorList()\n\n        if min_length is not None and eos_token_id is not None and min_length > -1:\n            processors.append(\n                MinLengthLogitsProcessor(min_length, eos_token_id))\n        if num_beam_groups > 1 and diversity_rate > 0.0:\n            processors.append(\n                HammingDiversityLogitsProcessor(\n                    diversity_rate=diversity_rate,\n                    num_beams=num_beams,\n                    num_beam_groups=num_beam_groups))\n        if repetition_penalty is not None and repetition_penalty != 1.0:\n            processors.append(\n                RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))\n        if forced_bos_token_id is not None:\n            processors.append(\n                ForcedBOSTokenLogitsProcessor(forced_bos_token_id))\n        if forced_eos_token_id is not None:\n            processors.append(\n                ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id))\n        # TODO\n        # Add more pre_processing for 
distribution\n\n        return processors\n\n    def expand_inputs_for_generation(self,\n                                     input_ids,\n                                     expand_size,\n                                     attention_mask=None,\n                                     **model_kwargs):\n\n        index = paddle.tile(\n            paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1),\n            [1, expand_size]).reshape([-1])\n\n        input_ids = paddle.gather(input_ids, index)\n\n        if attention_mask is not None:\n            model_kwargs[\"attention_mask\"] = paddle.gather(attention_mask,\n                                                           index)\n\n        if \"token_type_ids\" in model_kwargs and model_kwargs[\n                \"token_type_ids\"] is not None:\n            token_type_ids = model_kwargs[\"token_type_ids\"]\n            model_kwargs[\"token_type_ids\"] = paddle.gather(token_type_ids,\n                                                           index)\n\n        if \"position_ids\" in model_kwargs and model_kwargs[\n                \"position_ids\"] is not None:\n            position_ids = model_kwargs[\"position_ids\"]\n            model_kwargs[\"position_ids\"] = paddle.gather(position_ids, index)\n\n        if \"seq_len\" in model_kwargs and model_kwargs[\"seq_len\"] is not None:\n            seq_len = model_kwargs[\"seq_len\"]\n            model_kwargs[\"seq_len\"] = paddle.gather(seq_len, index)\n\n        if \"encoder_output\" in model_kwargs and model_kwargs[\n                \"encoder_output\"] is not None:\n            encoder_output = model_kwargs[\"encoder_output\"]\n            model_kwargs[\"encoder_output\"] = paddle.gather(encoder_output,\n                                                           index)\n\n        if \"role_ids\" in model_kwargs and model_kwargs[\"role_ids\"] is not None:\n            role_ids = model_kwargs[\"role_ids\"]\n            model_kwargs[\"role_ids\"] = 
paddle.gather(role_ids, index)\n\n        return input_ids, model_kwargs\n\n    def prepare_inputs_for_generation(self,\n                                      input_ids,\n                                      use_cache=False,\n                                      cache=None,\n                                      **kwargs):\n        # only last token for inputs_ids if cache is defined in kwargs\n        position_ids = kwargs.get(\"position_ids\", None)\n        attention_mask = kwargs.get(\"attention_mask\", None)\n        if attention_mask is not None:\n            if len(attention_mask.shape) == 4:\n                attention_mask = attention_mask[:, -1, -1, :]\n            if \"int\" in paddle.common_ops_import.convert_dtype(\n                    attention_mask.dtype):\n                attention_mask = (1.0 - attention_mask) * -1e4\n        return {\n            \"input_ids\": input_ids,\n            \"position_ids\": position_ids,\n            \"attention_mask\": attention_mask,\n            \"cache\": cache\n        }\n\n    def update_model_kwargs_for_generation(self,\n                                           next_tokens,\n                                           outputs,\n                                           model_kwargs,\n                                           is_encoder_decoder=False):\n        # Update the model inputs during generation.\n        # Note that If `token_type_ids` and `attention_mask` in `model_kwargs`\n        # and they contain pad value, the result vectors updated by this method\n        # may be different from expected. 
In this case, you need to rewrite the\n        # method.\n\n        # update cache\n        if isinstance(outputs, tuple):\n            model_kwargs[\"cache\"] = outputs[1]\n\n        # update token_type_ids with last value\n        if \"token_type_ids\" in model_kwargs and model_kwargs[\n                \"token_type_ids\"] is not None:\n            token_type_ids = model_kwargs[\"token_type_ids\"]\n            model_kwargs[\"token_type_ids\"] = paddle.concat(\n                [token_type_ids, token_type_ids[:, -1:]], axis=-1)\n\n        # update position_ids\n        if \"position_ids\" in model_kwargs and model_kwargs[\n                \"position_ids\"] is not None:\n            position_ids = model_kwargs[\"position_ids\"]\n            model_kwargs[\"position_ids\"] = position_ids[:, -1:] + 1\n\n        # update attention_mask\n        if not is_encoder_decoder and \"attention_mask\" in model_kwargs:\n            attention_mask = model_kwargs[\"attention_mask\"]\n            # nn.Pad2D don't support the data type `bool`\n            if convert_dtype(attention_mask.dtype) == 'bool':\n                attention_mask = paddle.cast(attention_mask, 'int64')\n            if len(attention_mask.shape) == 4:\n                attention_mask = nn.Pad2D(\n                    [0, 0, 0, 1], mode='replicate')(attention_mask)\n                attention_mask = nn.Pad2D(\n                    [0, 1, 0, 0], value=-1e4)(attention_mask)\n                dtype = convert_dtype(attention_mask.dtype)\n                if 'int' in dtype:\n                    attention_mask[:, :, -1, -1] = 1\n                elif 'float' in dtype:\n                    attention_mask[:, :, -1, -1] = 0.0\n                else:\n                    raise ValueError(\n                        'The data type of input `attention_mask` must '\n                        'be bool, int or float')\n            else:\n                attention_mask = paddle.concat(\n                    [\n                        
attention_mask, paddle.ones(\n                            [attention_mask.shape[0], 1], dtype=\"int64\")\n                    ],\n                    axis=-1)\n            model_kwargs[\"attention_mask\"] = attention_mask\n\n        # update role_ids\n        if \"role_ids\" in model_kwargs and model_kwargs[\"role_ids\"] is not None:\n            role_ids = model_kwargs[\"role_ids\"]\n            model_kwargs[\"role_ids\"] = paddle.concat(\n                [role_ids, role_ids[:, -1:]], axis=-1)\n\n        model_kwargs['res'] = paddle.concat(\n            [model_kwargs['res'], next_tokens], axis=1)\n\n        return model_kwargs\n\n    def sample(self,\n               input_ids,\n               logits_processors,\n               max_length,\n               pad_token_id,\n               eos_token_id,\n               top_k=None,\n               top_p=None,\n               temperature=None,\n               min_tokens_to_keep=1,\n               **model_kwargs):\n        def TopKProcess(probs, top_k, min_tokens_to_keep):\n            top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1])\n            # Remove all tokens with a probability less than the last token of the top-k\n            topk_probs, _ = paddle.topk(probs, k=top_k)\n            probs = paddle.where(probs >= topk_probs[:, -1:], probs,\n                                 paddle.full_like(probs, 0.0))\n            return probs\n\n        def TopPProcess(probs, top_p, min_tokens_to_keep):\n            sorted_probs = paddle.sort(probs, descending=True)\n            sorted_indices = paddle.argsort(probs, descending=True)\n            cumulative_probs = paddle.cumsum(sorted_probs, axis=-1)\n\n            # Remove tokens with cumulative probs above the top_p, But keep at\n            # least min_tokens_to_keep tokens\n            sorted_indices_to_remove = cumulative_probs > top_p\n            if min_tokens_to_keep > 1:\n                # Set 'min_tokens_to_keep - 1' because the first token is kept\n         
       sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0\n            # Keep the first token\n            sorted_indices_to_remove = paddle.cast(\n                sorted_indices_to_remove, dtype='int64')\n            sorted_indices_to_remove[:, 1:] = (\n                sorted_indices_to_remove[:, :-1].clone())\n            sorted_indices_to_remove[:, 0] = 0\n\n            # Scatter sorted tensors to original indexing\n            sorted_indices = sorted_indices + paddle.arange(probs.shape[\n                0]).unsqueeze(-1) * probs.shape[-1]\n            condition = paddle.scatter(sorted_indices_to_remove.flatten(),\n                                       sorted_indices.flatten(),\n                                       sorted_indices_to_remove.flatten())\n            condition = paddle.cast(condition, 'bool').reshape(probs.shape)\n            probs = paddle.where(condition,\n                                 paddle.full_like(probs, 0.0), probs)\n            return probs\n\n        batch_size, cur_len = paddle.shape(input_ids)\n        # used for compute on gpu, avoid memcpy D2H\n        cur_len_gpu = paddle.full([1], cur_len, dtype='int64')\n\n        origin_len = paddle.shape(input_ids)[1]\n        # used for compute on gpu, avoid memcpy D2H\n        origin_len_gpu = paddle.full([1], origin_len, dtype='int64')\n\n        unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool')\n        scores = paddle.full(\n            [batch_size, 1], 0.0, dtype=paddle.get_default_dtype())\n\n        res = paddle.assign(input_ids)\n        model_kwargs['res'] = res\n\n        # use_cache is immutable, we split it off other mutable kwargs.\n        assert 'use_cache' in model_kwargs\n        immutable = {'use_cache': model_kwargs['use_cache']}\n        del model_kwargs['use_cache']\n\n        def _forward_(**args):\n            model_inputs = self.prepare_inputs_for_generation(\n                input_ids, **args, **immutable)\n            return 
self.gpt(**model_inputs, **immutable)\n\n        def _post_process_(outputs, input_ids, cur_len, origin_len, scores,\n                           unfinished_flag, model_kwargs):\n\n            logits = outputs[0] if isinstance(outputs, tuple) else outputs\n\n            # logits = paddle.matmul(\n            #     logits,\n            #     self.gpt.embeddings.word_embeddings.weight,\n            #     transpose_y=True)\n\n            x_dims_mapping = [self.gpt.mesh.dp] + [\n                None for i in range(len(logits.shape) - 1)\n            ]\n            w_dims_mapping = [self.gpt.mesh.mp, None]\n            matmul = auto.shard_op(paddle.matmul, self.gpt.mesh[-1],\n                                   [x_dims_mapping, w_dims_mapping, None])\n            with paddle.fluid.name_scope('skip_quant'):\n                logits = matmul(\n                    logits,\n                    self.gpt.embeddings.word_embeddings.weight,\n                    transpose_y=True)\n\n            # [batch_size, vocab_size]\n            logits = logits[:, -1, :]\n\n            # pre-process distribution\n            logits = logits_processors(input_ids, logits)\n\n            # sample\n            origin_probs = F.softmax(logits)\n            if temperature is None or temperature == 1.0:\n                probs = paddle.assign(origin_probs)\n                origin_probs = paddle.log(origin_probs)\n            else:\n                origin_probs = paddle.log(origin_probs)\n                logits = logits / temperature\n                probs = F.softmax(logits)\n            if top_k is not None and top_k != 0:\n                probs = TopKProcess(probs, top_k, min_tokens_to_keep)\n            if top_p is not None and top_p < 1.0:\n                if self.use_topp_sampling:\n                    try:\n                        from ppfleetx_ops import topp_sampling\n                    except ImportError:\n                        raise ImportError(\n                            \"please 
install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!\"\n                        )\n                    top_ps_tensor = paddle.full(\n                        shape=[paddle.shape(probs)[0]],\n                        fill_value=top_p,\n                        dtype=probs.dtype)\n                    # TODO fake random seed here\n                    # Users should set the random seed dynamically when inference\n                    _,  next_tokens = topp_sampling(probs, top_ps_tensor, random_seed=100)\n                else:\n                    probs = TopPProcess(probs, top_p, min_tokens_to_keep)\n\n            if not self.use_topp_sampling:\n                next_tokens = paddle.multinomial(probs)\n\n            next_scores = paddle.index_sample(origin_probs, next_tokens)\n\n            if eos_token_id is not None:\n                next_tokens = paddle.where(\n                    unfinished_flag, next_tokens,\n                    paddle.full_like(next_tokens, pad_token_id))\n\n            scores = self.update_scores_for_generation(\n                scores, next_scores, cur_len - origin_len, unfinished_flag)\n\n            input_ids = next_tokens\n\n            if eos_token_id is not None:\n                unfinished_flag = paddle.logical_and(\n                    unfinished_flag, next_tokens != eos_token_id)\n\n            model_kwargs = self.update_model_kwargs_for_generation(\n                next_tokens,\n                outputs,\n                model_kwargs,\n                is_encoder_decoder=self.is_encoder_decoder)\n\n            return input_ids, scores, unfinished_flag, model_kwargs\n\n        # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement\n        # the value in model_kwargs should be tensor before while loop\n        outputs = _forward_(**model_kwargs)\n\n        input_ids, scores, unfinished_flag, model_kwargs = _post_process_(\n            outputs, input_ids, cur_len_gpu, origin_len_gpu, scores,\n  
          unfinished_flag, model_kwargs)\n        if not self.inference:\n            cur_len += 1\n        else:\n            # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static\n            paddle.increment(cur_len)\n        paddle.increment(cur_len_gpu)\n\n        attn_mask = model_kwargs['attention_mask']\n        # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static.\n        model_kwargs['attention_mask'] = paddle.reshape(\n            attn_mask, paddle.shape(attn_mask))\n        model_kwargs['cache'] = outputs[1] if isinstance(outputs,\n                                                         tuple) else None\n        max_length = paddle.to_tensor(max_length)\n        while cur_len < max_length:\n            # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs) \n            # and change it to pass directly to _post_process_ to avoid \n            # closed-loop problem of dynamic-to-static model\n            input_ids, scores, unfinished_flag, model_kwargs = _post_process_(\n                _forward_(**model_kwargs), input_ids, cur_len_gpu,\n                origin_len_gpu, scores, unfinished_flag, model_kwargs)\n            if not self.inference:\n                cur_len += 1\n            else:\n                # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static\n                paddle.increment(cur_len)\n            paddle.increment(cur_len_gpu)\n\n            # early finish should be True in generation scenes,\n            # If users want to test the inference speed, you can just set it False.\n            if self.early_finish and not paddle.any(unfinished_flag):\n                break\n\n        return model_kwargs['res'][:, origin_len:], scores\n\n    def forward(self, input_ids=None, **model_kwargs):\n\n        max_length = self.max_length\n        min_length = self.min_length\n        decode_strategy = self.decode_strategy\n        temperature = self.temperature\n        top_k = 
self.top_k\n        top_p = self.top_p\n        repetition_penalty = self.repetition_penalty\n        num_beams = self.num_beams\n        num_beam_groups = self.num_beam_groups\n        length_penalty = self.length_penalty\n        early_stopping = self.early_stopping\n        bos_token_id = self.bos_token_id\n        eos_token_id = self.eos_token_id\n        pad_token_id = self.pad_token_id\n        decoder_start_token_id = self.decoder_start_token_id\n        forced_bos_token_id = self.forced_bos_token_id\n        forced_eos_token_id = self.forced_eos_token_id\n        num_return_sequences = self.num_return_sequences\n        diversity_rate = self.diversity_rate\n        use_cache = self.use_cache\n\n        assert (\n            decode_strategy in [\"greedy_search\", \"sampling\", \"beam_search\"]\n        ), \"`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.\".format(\n            decode_strategy)\n\n        bos_token_id = bos_token_id if bos_token_id is not None else getattr(\n            self.gpt, 'bos_token_id', None)\n        eos_token_id = eos_token_id if eos_token_id is not None else getattr(\n            self.gpt, 'eos_token_id', None)\n        pad_token_id = pad_token_id if pad_token_id is not None else getattr(\n            self.gpt, 'pad_token_id', None)\n        forced_bos_token_id = forced_bos_token_id if forced_bos_token_id is not None else getattr(\n            self.gpt, 'forced_bos_token_id', None)\n        forced_eos_token_id = forced_eos_token_id if forced_eos_token_id is not None else getattr(\n            self.gpt, 'forced_eos_token_id', None)\n        decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else getattr(\n            self.gpt, 'decoder_start_token_id', None)\n\n        # params check\n        if input_ids is None:\n            # Init `input_ids` with bos_token_id\n            input_ids = self.prepare_input_ids_for_generation(bos_token_id)\n\n        
if model_kwargs.get(\"attention_mask\", None) is None:\n            # TODO\n            # Init `attention_mask` depending on `pad_token_id`\n            model_kwargs[\n                \"attention_mask\"] = self.prepare_attention_mask_for_generation(\n                    input_ids, pad_token_id, eos_token_id)\n\n        if model_kwargs.get(\"position_ids\", None) is None:\n            model_kwargs['position_ids'] = paddle.arange(\n                0,\n                paddle.shape(model_kwargs['attention_mask'])[-1],\n                dtype=input_ids.dtype).unsqueeze(0)\n\n        self.is_encoder_decoder = False\n\n        model_kwargs[\"use_cache\"] = use_cache\n\n        if self.inference:\n            # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static\n            min_len = input_ids.shape[-1]\n            max_len = input_ids.shape[-1]\n            paddle.increment(min_len, min_length)\n            paddle.increment(max_len, max_length)\n        else:\n            input_len = input_ids.shape[-1]\n            max_len = max_length + input_len\n            min_len = min_length + input_len\n\n        logits_processors = self.get_logits_processor(\n            min_length=min_len,\n            max_length=max_len,\n            eos_token_id=eos_token_id,\n            forced_bos_token_id=forced_bos_token_id,\n            forced_eos_token_id=forced_eos_token_id,\n            num_beams=num_beams,\n            num_beam_groups=num_beam_groups,\n            diversity_rate=diversity_rate,\n            repetition_penalty=repetition_penalty)\n\n        if decode_strategy == 'sampling':\n            if num_return_sequences > 1:\n                input_ids, model_kwargs = self.expand_inputs_for_generation(\n                    input_ids,\n                    expand_size=num_return_sequences,\n                    **model_kwargs)\n\n            ret = self.sample(input_ids, logits_processors, max_len,\n                              pad_token_id, eos_token_id, top_k, 
top_p,\n                              temperature, **model_kwargs)\n        else:\n            raise ValueError(f'Not support {decode_strategy} strategy yet!')\n        return ret\n"
  },
  {
    "path": "ppfleetx/models/language_model/gpt/auto/auto_module.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport copy\nimport argparse\n\nimport numpy as np\nimport paddle\nimport paddle.distributed as dist\nfrom paddle import LazyGuard\nfrom paddle.static import InputSpec\nfrom paddle.distributed.fleet import auto\n\nfrom ...auto_utils import process_configs\n\nimport ppfleetx.models.language_model.gpt as gpt\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.data.tokenizers import GPTTokenizer\nfrom ppfleetx.core.module.basic_module import BasicModule\n\n\nclass LanguageModuleAuto(BasicModule):\n    def __init__(self, configs):\n        self.nranks = dist.get_world_size()\n        super(LanguageModuleAuto, self).__init__(configs)\n\n        self.loss_fn = self.get_loss_fn()\n\n    def process_configs(self, configs):\n        configs = process_configs(configs)\n        return configs\n\n    def get_model_size(self, l, h, v, s):\n        P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h))\n        logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 /\n                                                  1000.0))\n\n\nclass GPTModuleAuto(LanguageModuleAuto):\n    def __init__(self, configs):\n        super(GPTModuleAuto, self).__init__(configs)\n\n    def get_model(self):\n       
 model_setting = copy.deepcopy(self.configs.Model)\n        model_setting.pop(\"module\")\n        model_setting.pop(\"name\")\n\n        l = model_setting['num_layers']\n        h = model_setting['hidden_size']\n        v = model_setting['vocab_size']\n        s = self.configs.Data.Train.dataset.max_seq_len\n        self.get_model_size(l, h, v, s)\n\n        self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        with LazyGuard():\n            model = gpt.GPTForPretrainingAuto(\n                gpt.GPTModelAuto(**model_setting))\n        return model\n\n    def get_loss_fn(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        return gpt.GPTPretrainingCriterionAuto(model_setting['mesh'])\n\n\nclass GPTGenerationModuleAuto(BasicModule):\n    def __init__(self, configs):\n        self.configs = configs\n        self.generation_cfgs = configs.Generation\n        self.nranks = paddle.distributed.get_world_size()\n\n        super().__init__(configs)\n\n    def process_configs(self, configs):\n        configs = process_configs(configs)\n        return configs\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        model_setting.pop(\"module\")\n        model_setting.pop(\"name\")\n\n        with LazyGuard():\n            model = gpt.GPTForGenerationAuto(\n                gpt.GPTModelAuto(**model_setting), self.generation_cfgs)\n\n        self.tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n\n        self.generation_cfgs['max_dec_len'] = self.adjust_length_to_model(\n            self.generation_cfgs['max_dec_len'], 512)\n\n        self.generation_cfgs['bos_token_id'] = self.tokenizer.eos_token_id\n        self.generation_cfgs['eos_token_id'] = self.tokenizer.eos_token_id\n        self.generation_cfgs['pad_token_id'] = self.tokenizer.eos_token_id\n\n        return model\n\n    def adjust_length_to_model(self, length, max_sequence_length):\n        if length < 0 or length > max_sequence_length:\n    
        length = max_sequence_length\n        return length\n\n    def left_padding(self, inputs, pad_id, padding=\"longest\"):\n        assert \"input_ids\" in inputs, \"input_ids should be in inputs!\"\n        max_length = 0\n        for ids in inputs[\"input_ids\"]:\n            max_length = max(max_length, len(ids))\n\n        def extend_max_lenth(value, max_length, to_pad_id):\n            return [to_pad_id] * (max_length - len(value)) + value\n\n        def extend_filed(name, max_length, to_pad_id):\n            values = inputs[name]\n            res = []\n            for index, value in enumerate(values):\n                res.append(extend_max_lenth(value, max_length, to_pad_id))\n            inputs[name] = res\n\n        extend_filed(\"input_ids\", max_length, pad_id)\n        if \"attention_mask\" in inputs:\n            extend_filed(\"attention_mask\", max_length, 0)\n        if \"position_ids\" in inputs:\n            extend_filed(\"position_ids\", max_length, 0)\n\n        return inputs\n\n    def input_spec(self):\n        return [InputSpec(shape=[None, None], name=\"input_ids\", dtype='int64')]\n"
  },
  {
    "path": "ppfleetx/models/language_model/gpt/dygraph/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport collections\nimport logging\nfrom distutils.util import strtobool\nimport os\nimport math\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport paddle.tensor as tensor\nfrom paddle.fluid import layers\nfrom paddle.nn.layer.transformer import _convert_param_attr_to_list\nimport paddle.incubate as incubate\nfrom paddle.common_ops_import import convert_dtype\n\nimport paddle.distributed.fleet as fleet\nfrom paddle.distributed.fleet.meta_parallel import get_rng_state_tracker\nfrom paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc\nfrom paddle.distributed.fleet.utils import recompute\nfrom paddle.autograd import PyLayer\nimport sys\n\nfrom .single_model import ExpertLayer\nfrom .sequence_parallel_utils import ScatterOp, GatherOp, \\\n        mark_as_sequence_parallel_parameter, ColumnSequenceParallelLinear, RowSequenceParallelLinear\nfrom .processor import (\n    LogitsProcessorList, MinLengthLogitsProcessor,\n    HammingDiversityLogitsProcessor, RepetitionPenaltyLogitsProcessor,\n    ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor)\n\nfrom ppfleetx.models.language_model.moe import MoELayer\nfrom 
ppfleetx.distributed.apis import env\nfrom ppfleetx.utils.log import logger\n\nimport numpy as np\n\ntry:\n    from paddle.nn.functional.flash_attention import flash_attention\nexcept:\n    flash_attention = None\n\n\ndef get_attr(layer, name):\n    if getattr(layer, name, None) is not None:\n        return getattr(layer, name, None)\n    else:\n        return get_attr(layer._layer, name)\n\n\ndef parallel_matmul(lm_output, logit_weights, parallel_output):\n    \"\"\"\n    \"\"\"\n    hcg = env.get_hcg()\n    model_parallel_group = hcg.get_model_parallel_group()\n    world_size = hcg.get_model_parallel_world_size()\n    rank = hcg.get_model_parallel_rank()\n\n    if world_size > 1:\n        input_parallel = paddle.distributed.collective._c_identity(\n            lm_output, group=model_parallel_group)\n\n        logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True)\n\n        if parallel_output:\n            return logits\n\n        return paddle.distributed.collective._c_concat(\n            logits, group=model_parallel_group)\n    else:\n        logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)\n        return logits\n\n\nclass MultiHeadAttention(nn.Layer):\n    \"\"\"\n    Attention mapps queries and a set of key-value pairs to outputs, and\n    Multi-Head Attention performs multiple parallel attention to jointly attending\n    to information from different representation subspaces.\n\n    \"\"\"\n\n    Cache = collections.namedtuple(\"Cache\", [\"k\", \"v\"])\n    StaticCache = collections.namedtuple(\"StaticCache\", [\"k\", \"v\"])\n\n    def __init__(self,\n                 embed_dim,\n                 num_heads,\n                 dropout=0.,\n                 kdim=None,\n                 vdim=None,\n                 need_weights=False,\n                 weight_attr=None,\n                 output_layer_weight_attr=None,\n                 bias_attr=None,\n                 fuse_attn_qkv=False,\n                 
scale_qk_coeff=1.0,\n                 num_partitions=1,\n                 fused_linear=False,\n                 use_recompute=False,\n                 recompute_granularity=\"full\",\n                 sequence_parallel=False,\n                 do_recompute=True,\n                 use_flash_attn=False):\n        super(MultiHeadAttention, self).__init__()\n        self.embed_dim = embed_dim\n        self.kdim = kdim if kdim is not None else embed_dim\n        self.vdim = vdim if vdim is not None else embed_dim\n        self.num_heads = num_heads\n        self.dropout = dropout\n        self.need_weights = need_weights\n        self.fuse_attn_qkv = fuse_attn_qkv\n        self.scale_qk_coeff = scale_qk_coeff\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n        self.do_recompute = do_recompute\n        self.sequence_parallel = sequence_parallel\n        self.use_flash_attn = use_flash_attn if flash_attention else None\n\n        if sequence_parallel:\n            ColumnParallelLinear = ColumnSequenceParallelLinear\n            RowParallelLinear = RowSequenceParallelLinear\n        else:\n            ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear\n            RowParallelLinear = fleet.meta_parallel.RowParallelLinear\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        assert self.num_heads % num_partitions == 0, \"num_heads {} must be divisible by num_partitions {}\".format(\n            self.num_heads, num_partitions)\n        self.num_heads = self.num_heads // num_partitions\n\n        if self.fuse_attn_qkv:\n            assert self.kdim == embed_dim\n            assert self.vdim == embed_dim\n\n            self.qkv_proj = ColumnParallelLinear(\n                embed_dim,\n                3 * embed_dim,\n                mp_group=env.get_hcg().get_model_parallel_group(),\n           
     weight_attr=weight_attr,\n                has_bias=True,\n                gather_output=False,\n                fuse_matmul_bias=fused_linear)\n        else:\n            self.q_proj = ColumnParallelLinear(\n                embed_dim,\n                embed_dim,\n                mp_group=env.get_hcg().get_model_parallel_group(),\n                weight_attr=weight_attr,\n                has_bias=True,\n                gather_output=False,\n                fuse_matmul_bias=fused_linear)\n\n            self.k_proj = ColumnParallelLinear(\n                self.kdim,\n                embed_dim,\n                mp_group=env.get_hcg().get_model_parallel_group(),\n                weight_attr=weight_attr,\n                has_bias=True,\n                gather_output=False,\n                fuse_matmul_bias=fused_linear)\n\n            self.v_proj = ColumnParallelLinear(\n                self.vdim,\n                embed_dim,\n                mp_group=env.get_hcg().get_model_parallel_group(),\n                weight_attr=weight_attr,\n                has_bias=True,\n                gather_output=False,\n                fuse_matmul_bias=fused_linear)\n\n        self.out_proj = RowParallelLinear(\n            embed_dim,\n            embed_dim,\n            mp_group=env.get_hcg().get_model_parallel_group(),\n            weight_attr=output_layer_weight_attr,\n            has_bias=True,\n            input_is_parallel=True,\n            fuse_matmul_bias=fused_linear)\n\n    def _fuse_prepare_qkv(self, query, use_cache=False, cache=None):\n        mix_layer = self.qkv_proj(query)\n        mix_layer = paddle.reshape_(mix_layer, [0, 0, -1, 3 * self.head_dim])\n        q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1)\n\n        assert not isinstance(\n            cache, self.StaticCache\n        ), \"cache currently does not support the StaticCache type\"\n\n        if isinstance(cache, self.Cache):\n            # for decoder self-attention in inference\n          
  k = tensor.concat([cache.k, k], axis=1)\n            v = tensor.concat([cache.v, v], axis=1)\n        if use_cache is True:\n            cache = self.Cache(k, v)\n\n        return (q, k, v, cache) if use_cache else (q, k, v, None)\n\n    def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):\n        r\"\"\"\n        Prapares linear projected queries, keys and values for usage of subsequnt\n        multiple parallel attention. If `cache` is not None, using cached results\n        to reduce redundant calculations.\n\n        \"\"\"\n        q = self.q_proj(query)\n        q = tensor.reshape(x=q, shape=[0, 0, -1, self.head_dim])\n\n        if isinstance(cache, self.StaticCache):\n            # for encoder-decoder attention in inference and has cached\n            k, v = cache.k, cache.v\n        else:\n            k, v = self.compute_kv(key, value)\n\n        if isinstance(cache, self.Cache):\n            # for decoder self-attention in inference\n            k = tensor.concat([cache.k, k], axis=1)\n            v = tensor.concat([cache.v, v], axis=1)\n        if use_cache is True:\n            cache = self.Cache(k, v)\n\n        return (q, k, v, cache) if use_cache else (q, k, v, None)\n\n    def compute_kv(self, key, value):\n        r\"\"\"\n        Applies linear projection on input keys and values, then splits heads\n        (reshape and transpose) to get keys and values from different representation\n        subspaces. 
The results are used as key-values pairs for subsequent multiple\n        parallel attention.\n\n        It is part of calculations in multi-head attention, and is provided as\n        a method to pre-compute and prefetch these results, thus we can use them\n        to construct cache for inference.\n\n        \"\"\"\n        k = self.k_proj(key)\n        v = self.v_proj(value)\n        k = tensor.reshape(x=k, shape=[0, 0, -1, self.head_dim])\n        v = tensor.reshape(x=v, shape=[0, 0, -1, self.head_dim])\n        return k, v\n\n    def gen_cache(self, key, value=None, type=Cache):\n        \"\"\"\n        Generates cache for `forward` usage in inference accroding to arguments.\n        The generated cache is an instance of `MultiHeadAttention.Cache` or an\n        instance of `MultiHeadAttention.StaticCache`.\n        \"\"\"\n        if type == MultiHeadAttention.StaticCache:  # static_kv\n            k, v = self.compute_kv(key, value)\n            return self.StaticCache(k, v)\n        elif value is None:  # incremental_state\n            k = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            v = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            return self.Cache(k, v)\n        else:\n            # incremental_state with initial value, mainly for usage like UniLM\n            return self.Cache(key, value)\n\n    def _flash_attention(self, q, k, v, attn_mask=None):\n        if self.sequence_parallel:\n            perm = [1, 0, 2, 3]\n            q = tensor.transpose(x=q, perm=perm)\n            k = tensor.transpose(x=k, perm=perm)\n            v = tensor.transpose(x=v, perm=perm)\n        out, weights = flash_attention(\n            q,\n            k,\n            
v,\n            self.dropout,\n            causal=True,\n            return_softmax=self.need_weights)\n        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])\n        if self.sequence_parallel:\n            perm = [1, 0, 2]\n            out = tensor.transpose(x=out, perm=perm)\n        return out, weights\n\n    def core_attn(self, q, k, v, attn_mask=None):\n        perm = [1, 2, 0, 3] if self.sequence_parallel else [0, 2, 1, 3]\n        q = tensor.transpose(x=q, perm=perm)\n        k = tensor.transpose(x=k, perm=perm)\n        v = tensor.transpose(x=v, perm=perm)\n\n        # scale dot product attention\n        scale_qk_coeff = self.scale_qk_coeff * self.head_dim**0.5\n        product = paddle.matmul(\n            x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True)\n\n        if self.scale_qk_coeff != 1.0:\n            product = product.scale(self.scale_qk_coeff)\n\n        # softmax_mask_fuse_upper_triangle is not supported sif paddle is not compiled with cuda/rocm\n        if not paddle.is_compiled_with_cuda():\n            attn_mask = get_triangle_upper_mask(product, attn_mask)\n\n        if attn_mask is not None:\n            product = product + attn_mask\n            weights = F.softmax(product)\n        else:\n            weights = incubate.softmax_mask_fuse_upper_triangle(product)\n\n        if self.dropout:\n            with get_rng_state_tracker().rng_state('local_seed'):\n                weights = F.dropout(\n                    weights,\n                    self.dropout,\n                    training=self.training,\n                    mode=\"upscale_in_train\")\n\n        out = paddle.matmul(weights, v)\n\n        # combine heads\n        if self.sequence_parallel:\n            out = tensor.transpose(out, perm=[2, 0, 1, 3])\n        else:\n            out = tensor.transpose(out, perm=[0, 2, 1, 3])\n        # If sequence_parallel is true, out shape is [s, b, h] after reshape\n        # else out shape is [b, s, h]\n        
out = tensor.reshape(x=out, shape=[0, 0, -1])\n\n        return out, weights\n\n    def forward(self,\n                query,\n                key,\n                value,\n                attn_mask=None,\n                use_cache=False,\n                cache=None):\n        r\"\"\"\n        Applies multi-head attention to map queries and a set of key-value pairs\n        to outputs.\n        \"\"\"\n        key = query if key is None else key\n        value = query if value is None else value\n        # if sequence_parallel is true, query, key, value shape are [s, b, h],\n        # else their shape are [b, s, h], n is mp parallelism.\n        # no matter sequence_parallel is true or false,\n        # after reshape, q, k, v shape should be [b, num_heads/n, s, head_dim]\n        # compute q ,k ,v\n        if self.fuse_attn_qkv:\n            q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache)\n        else:\n            q, k, v, cache = self._prepare_qkv(query, key, value, use_cache,\n                                               cache)\n\n        if self.use_flash_attn and attn_mask is None:\n            attn_func = self._flash_attention\n        else:\n            attn_func = self.core_attn\n\n        if self.use_recompute and self.recompute_granularity == \"core_attn\" and self.do_recompute:\n            out, weights = recompute(attn_func, q, k, v, attn_mask)\n        else:\n            out, weights = attn_func(q, k, v, attn_mask=attn_mask)\n\n        # project to output\n        # if sequence_parallel is true, out shape are [s/n, b, h],\n        # else their shape are [b, s, h], n is mp parallelism.\n        out = self.out_proj(out)\n\n        outs = [out]\n        if self.need_weights:\n            outs.append(weights)\n        if use_cache:\n            outs.append(cache)\n        return out if len(outs) == 1 else tuple(outs)\n\n\nclass TransformerDecoder(nn.Layer):\n    \"\"\"\n    TransformerDecoder is a stack of N decoder layers.\n    
\"\"\"\n\n    def __init__(self,\n                 decoder_layers,\n                 num_layers,\n                 norm=None,\n                 hidden_size=None,\n                 use_recompute=False,\n                 recompute_granularity=\"full\",\n                 sequence_parallel=False,\n                 no_recompute_layers=None):\n        super(TransformerDecoder, self).__init__()\n\n        if no_recompute_layers is None:\n            no_recompute_layers = []\n        self.no_recompute_layers = no_recompute_layers\n\n        self.num_layers = num_layers\n        self.layers = decoder_layers\n        self.norm = norm\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n        self.sequence_parallel = sequence_parallel\n        if norm == \"LayerNorm\":\n            self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5)\n            # if sequence parallel is true,\n            # register hook to all_reduce gradient of weight, bias\n            if self.sequence_parallel:\n                mark_as_sequence_parallel_parameter(self.norm.weight)\n                mark_as_sequence_parallel_parameter(self.norm.bias)\n        elif norm is not None:\n            raise ValueError(\"Only support LayerNorm\")\n\n    def forward(self,\n                tgt,\n                memory,\n                tgt_mask=None,\n                memory_mask=None,\n                use_cache=False,\n                cache=None):\n        r\"\"\"\n        Applies a stack of N Transformer decoder layers on inputs. 
If `norm` is\n        provided, also applies layer normalization on the output of last decoder\n        layer.\n        \"\"\"\n        output = tgt\n        new_caches = []\n\n        for i, mod in enumerate(self.layers):\n            if cache is None:\n                if use_cache:\n                    output, new_cache = mod(output,\n                                            memory,\n                                            tgt_mask=tgt_mask,\n                                            use_cache=use_cache,\n                                            cache=cache)\n                    new_caches.append(new_cache)\n                else:\n                    if self.use_recompute and self.recompute_granularity == \"full\" and i not in self.no_recompute_layers:\n                        output = recompute(mod, output, memory, tgt_mask,\n                                           use_cache, cache)\n                    else:\n                        output = mod(output, memory, tgt_mask, use_cache,\n                                     cache)\n\n            else:\n                output, new_cache = mod(output,\n                                        memory,\n                                        tgt_mask=tgt_mask,\n                                        use_cache=use_cache,\n                                        cache=cache[i])\n                new_caches.append(new_cache)\n\n        if self.norm is not None:\n            output = self.norm(output)\n        return output if use_cache is False else (output, new_caches)\n\n    def gen_cache(self, memory, do_zip=False):\n        r\"\"\"\n        Generates cache for `forward` usage. The generated cache is a list, and\n        each element in it is a tuple( :code:`(incremental_cache, static_cache)` )\n        produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache`\n        for more details. 
If `do_zip` is True, apply `zip` on these tuples to get\n        a list with two elements.\n       \"\"\"\n        cache = [layer.gen_cache(memory) for layer in self.layers]\n        if do_zip:\n            cache = list(zip(*cache))\n        return cache\n\n\nclass TransformerDecoderLayer(nn.Layer):\n    \"\"\"\n    The transformer decoder layer.\n\n    It contains multiheadattention and some linear layers.\n    \"\"\"\n\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward,\n                 dropout=0.1,\n                 activation=\"gelu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=True,\n                 weight_attr=None,\n                 output_layer_weight_attr=None,\n                 bias_attr=None,\n                 num_partitions=1,\n                 fused_linear=False,\n                 fuse_attn_qkv=False,\n                 scale_qk_coeff=1.0,\n                 moe_configs=None,\n                 recompute_attn=False,\n                 use_recompute=False,\n                 recompute_granularity=\"full\",\n                 sequence_parallel=False,\n                 do_recompute=True,\n                 skip_quant_tensors=[],\n                 use_flash_attn=False):\n        self._config = locals()\n        self._config.pop(\"self\")\n        self._config.pop(\"__class__\", None)  # py3\n\n        super(TransformerDecoderLayer, self).__init__()\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n        self.sequence_parallel = sequence_parallel\n        self.do_recompute = do_recompute\n\n        self.expert_mode = False\n        # moe config\n        if moe_configs is not None:\n            
self.gate = moe_configs.get('gate', 'gshard')\n            self.top_k = moe_configs.get('top_k', 2)\n            self.num_experts = moe_configs.get('num_experts', 1)\n            self.expert_mode = moe_configs.get('expert_mode', False)\n\n        if sequence_parallel:\n            ColumnParallelLinear = ColumnSequenceParallelLinear\n            RowParallelLinear = RowSequenceParallelLinear\n        else:\n            ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear\n            RowParallelLinear = fleet.meta_parallel.RowParallelLinear\n\n        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)\n        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)\n        output_layer_weight_attrs = _convert_param_attr_to_list(\n            output_layer_weight_attr, 3)\n\n        self.self_attn = MultiHeadAttention(\n            d_model,\n            nhead,\n            dropout=attn_dropout,\n            weight_attr=weight_attrs[0],\n            bias_attr=bias_attrs[0],\n            output_layer_weight_attr=output_layer_weight_attrs[0],\n            num_partitions=num_partitions,\n            fused_linear=fused_linear,\n            fuse_attn_qkv=fuse_attn_qkv,\n            scale_qk_coeff=scale_qk_coeff,\n            use_recompute=use_recompute,\n            recompute_granularity=recompute_granularity,\n            sequence_parallel=sequence_parallel,\n            do_recompute=do_recompute,\n            use_flash_attn=use_flash_attn)\n\n        if self.expert_mode:\n            experts_list = nn.LayerList([\n                ExpertLayer(d_model, dim_feedforward)\n                for e in range(self.num_experts)\n            ])\n\n            hcg = env.get_hcg()\n            moe_group = hcg.get_expert_parallel_group()\n            mp_group = hcg.get_model_parallel_group()\n\n            self.moe_mlp = MoELayer(\n                d_model=d_model,\n                experts=experts_list,\n                gate=self.gate,\n                
top_k=self.top_k,\n                moe_group=moe_group,\n                mp_group=mp_group,\n                recompute_interval=int(self.use_recompute))\n        else:\n            self.linear1 = ColumnParallelLinear(\n                d_model,\n                dim_feedforward,\n                mp_group=env.get_hcg().get_model_parallel_group(),\n                weight_attr=weight_attrs[2],\n                gather_output=False,\n                has_bias=True,\n                fuse_matmul_bias=fused_linear)\n\n            self.linear2 = RowParallelLinear(\n                dim_feedforward,\n                d_model,\n                mp_group=env.get_hcg().get_model_parallel_group(),\n                weight_attr=output_layer_weight_attrs[2],\n                input_is_parallel=True,\n                has_bias=True,\n                fuse_matmul_bias=fused_linear)\n\n            if 'linear1' in skip_quant_tensors:\n                self.linear1.skip_quant = True\n\n            if 'linear2' in skip_quant_tensors:\n                self.linear2.skip_quant = True\n\n        self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)\n        self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)\n        if self.sequence_parallel:\n            # if sequence parallel is true, register hook to all_reduce gradient of bias\n            mark_as_sequence_parallel_parameter(self.norm1.weight)\n            mark_as_sequence_parallel_parameter(self.norm1.bias)\n            mark_as_sequence_parallel_parameter(self.norm2.weight)\n            mark_as_sequence_parallel_parameter(self.norm2.bias)\n        self.dropout1 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = nn.Dropout(act_dropout, mode=\"upscale_in_train\")\n        self.activation = getattr(F, activation)\n\n    def forward(self,\n                tgt,\n                memory=None,\n                tgt_mask=None,\n                use_cache=False,\n                cache=None):\n        residual = tgt\n\n        if 
self.normalize_before:\n            tgt = self.norm1(tgt)\n\n        if use_cache is False:\n            if self.use_recompute and self.recompute_granularity == \"full_attn\" and self.do_recompute:\n                tgt = recompute(self.self_attn, tgt, None, None, tgt_mask,\n                                use_cache, cache)\n            else:\n                tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)\n        else:\n            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,\n                                                    use_cache, cache)\n        # If use sequence_parallel, different input partition in dropout\n        # should use different seed.\n        if self.sequence_parallel:\n            current_seed = 'local_seed'\n        else:\n            current_seed = 'global_seed'\n        with get_rng_state_tracker().rng_state(current_seed):\n            tgt = residual + self.dropout1(tgt)\n\n        if not self.normalize_before:\n            tgt = self.norm1(tgt)\n\n        residual = tgt\n        if self.normalize_before:\n            tgt = self.norm2(tgt)\n\n        if self.expert_mode:\n            tgt = self.moe_mlp(tgt)\n        else:\n            with get_rng_state_tracker().rng_state(current_seed):\n                tgt = self.dropout2(\n                    self.linear2(F.gelu(\n                        self.linear1(tgt), approximate=True)))\n\n        tgt = residual + tgt\n\n        if not self.normalize_before:\n            tgt = self.norm2(tgt)\n\n        return tgt if use_cache is False else (tgt, incremental_cache)\n\n    def gen_cache(self, memory):\n        incremental_cache = self.self_attn.gen_cache(\n            memory, type=self.self_attn.Cache)\n        return incremental_cache\n\n\nclass GPTEmbeddings(nn.Layer):\n    \"\"\"\n    Include embeddings from word and position embeddings.\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 
hidden_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=16,\n                 initializer_range=0.02,\n                 sequence_parallel=False,\n                 freeze_embedding=False):\n        super(GPTEmbeddings, self).__init__()\n\n        self.sequence_parallel = sequence_parallel\n        self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding(\n            vocab_size,\n            hidden_size,\n            mp_group=env.get_hcg().get_model_parallel_group(),\n            weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(\n                mean=0.0, std=initializer_range)))\n\n        self.position_embeddings = nn.Embedding(\n            max_position_embeddings,\n            hidden_size,\n            weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(\n                mean=0.0, std=initializer_range)))\n\n        if freeze_embedding:\n            self.word_embeddings.weight.learning_rate = 0.0\n            self.position_embeddings.weight.learning_rate = 0.0\n\n        self.dropout = nn.Dropout(hidden_dropout_prob)\n\n    def forward(self, input_ids, position_ids=None):\n        if position_ids is None:\n            ones = paddle.ones_like(input_ids, dtype=\"int64\")\n            seq_length = paddle.cumsum(ones, axis=-1)\n            position_ids = seq_length - ones\n\n        input_embedings = self.word_embeddings(input_ids)\n        position_embeddings = self.position_embeddings(position_ids)\n        embeddings = input_embedings + position_embeddings\n        # if sequence parallel is true, change embedding shape [b, s, h] to [s, b, h]\n        # set the sequence dim as first, so the split in sequence dim is data-continuous\n        if self.sequence_parallel:\n            embeddings = paddle.transpose(embeddings, perm=[1, 0, 2])\n            embeddings = ScatterOp.apply(embeddings)\n            with get_rng_state_tracker().rng_state('local_seed'):\n                embeddings 
= self.dropout(embeddings)\n        else:\n            embeddings = self.dropout(embeddings)\n        return embeddings\n\n\nclass GPTModelHybrid(nn.Layer):\n    def __init__(self,\n                 vocab_size=51200,\n                 hidden_size=768,\n                 num_layers=12,\n                 num_attention_heads=12,\n                 ffn_hidden_size=3072,\n                 hidden_dropout_prob=0.1,\n                 attention_probs_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=16,\n                 initializer_range=0.02,\n                 num_partitions=1,\n                 moe_configs=None,\n                 use_recompute=False,\n                 fused_linear=False,\n                 fuse_attn_qkv=False,\n                 scale_qk_by_layer_num=True,\n                 recompute_granularity=\"full\",\n                 sequence_parallel=False,\n                 no_recompute_layers=None,\n                 skip_tensor_map={},\n                 freeze_embedding=False,\n                 use_flash_attn=False,\n                 fused_softmax_with_triangular=False):\n\n        super(GPTModelHybrid, self).__init__()\n\n        if no_recompute_layers is None:\n            no_recompute_layers = []\n        self.initializer_range = initializer_range\n        self.hidden_size = hidden_size\n        self.vocab_size = vocab_size\n        self.fused_softmax_with_triangular = fused_softmax_with_triangular\n\n        if use_flash_attn:\n            if flash_attention:\n                logger.info(\"Flash-attention enabled.\")\n            else:\n                use_flash_attn = False\n                logger.warning(\n                    \"Flash-attention is not support in this Paddle version.\")\n\n        hcg = env.get_hcg()\n        mp_size = hcg.get_model_parallel_world_size()\n        if mp_size <= 1:\n            sequence_parallel = False\n            logging.warning(\n                \"If mp_size <= 1, 
sequence_parallel strategy will be turned off in GPTModelHybrid model.\"\n            )\n\n        self.embeddings = GPTEmbeddings(\n            vocab_size, hidden_size, hidden_dropout_prob,\n            max_position_embeddings, type_vocab_size, self.initializer_range,\n            sequence_parallel, freeze_embedding)\n        self.sequence_parallel = sequence_parallel\n\n        decoder_layers = nn.LayerList()\n        for i in range(num_layers):\n            decoder_layers.append(\n                TransformerDecoderLayer(\n                    d_model=hidden_size,\n                    nhead=num_attention_heads,\n                    dim_feedforward=ffn_hidden_size,\n                    dropout=hidden_dropout_prob,\n                    activation=\"gelu\",\n                    attn_dropout=attention_probs_dropout_prob,\n                    act_dropout=hidden_dropout_prob,\n                    weight_attr=paddle.ParamAttr(\n                        initializer=nn.initializer.Normal(\n                            mean=0.0, std=self.initializer_range)),\n                    output_layer_weight_attr=paddle.ParamAttr(\n                        initializer=nn.initializer.Normal(\n                            mean=0.0,\n                            std=self.initializer_range / math.sqrt(\n                                2.0 * num_layers))),\n                    bias_attr=None,\n                    num_partitions=num_partitions,\n                    fused_linear=fused_linear,\n                    fuse_attn_qkv=fuse_attn_qkv,\n                    scale_qk_coeff=num_layers\n                    if scale_qk_by_layer_num else 1.0,\n                    moe_configs=moe_configs,\n                    use_recompute=use_recompute,\n                    recompute_granularity=recompute_granularity,\n                    sequence_parallel=sequence_parallel,\n                    do_recompute=i not in no_recompute_layers,\n                    
skip_quant_tensors=skip_tensor_map.get('block_{}'.format(\n                        i), []),\n                    use_flash_attn=use_flash_attn))\n\n        self.decoder = TransformerDecoder(\n            decoder_layers,\n            num_layers,\n            norm=\"LayerNorm\",\n            hidden_size=hidden_size,\n            use_recompute=use_recompute,\n            recompute_granularity=recompute_granularity,\n            sequence_parallel=sequence_parallel,\n            no_recompute_layers=no_recompute_layers)\n\n    def forward(self,\n                input_ids,\n                position_ids=None,\n                attention_mask=None,\n                use_cache=False,\n                cache=None):\n\n        if position_ids is None:\n            past_length = 0\n            if cache is not None:\n                past_length = paddle.shape(attention_mask)[-1] - 1\n            position_ids = paddle.arange(\n                past_length,\n                paddle.shape(input_ids)[-1] + past_length,\n                dtype=input_ids.dtype)\n            position_ids = position_ids.unsqueeze(0)\n            # .expand_as(input_ids)\n            position_ids = paddle.expand_as(position_ids, input_ids)\n        # if sequence_parallel is true, embedding_output shape is [s/n, b, h]\n        # else its shape is [b, s, h], n is mp parallelism\n        embedding_output = self.embeddings(\n            input_ids=input_ids, position_ids=position_ids)\n\n        # fused_softmax_with_triangular is only suppported on GPU/DCU.\n        # If on non-GPU devices, we use user defined mask and non-fused softmax.\n        if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda(\n        ):\n            # TODO, use registered buffer\n            causal_mask = paddle.tensor.triu(\n                paddle.ones(\n                    (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1]))\n                * -1e4,\n                diagonal=1)\n            if attention_mask 
is not None:\n                if len(attention_mask.shape) == 2:\n                    attention_mask = attention_mask[:, None, None, :]\n                attention_mask = attention_mask + causal_mask\n            else:\n                attention_mask = causal_mask\n            # The tensor returned by triu not in static graph.\n            attention_mask.stop_gradient = True\n\n        encoder_outputs = self.decoder(\n            embedding_output,\n            memory=None,\n            tgt_mask=None if (self.fused_softmax_with_triangular and\n                              self.training and paddle.is_compiled_with_cuda())\n            else attention_mask,  # use softmax_mask_fuse_upper_triangle\n            use_cache=use_cache,\n            cache=cache)\n\n        if self.sequence_parallel:\n            encoder_outputs = GatherOp.apply(encoder_outputs)\n\n        return encoder_outputs\n\n\nclass GPTForPretrainingHybrid(nn.Layer):\n    \"\"\"\n    GPT Model with pretraining tasks on top.\n\n    Args:\n        gpt (:class:`GPTModel`):\n            An instance of :class:`GPTModel`.\n\n    \"\"\"\n\n    def __init__(self, gpt):\n        super(GPTForPretrainingHybrid, self).__init__()\n        self.gpt = gpt\n        # extra_parameters using for sharding stage3 to register extra_parameters\n        self.extra_parameters = [\n            get_attr(self.gpt.embeddings.word_embeddings, \"weight\")\n        ]\n\n    def forward(self,\n                input_ids,\n                position_ids=None,\n                attention_mask=None,\n                masked_positions=None,\n                use_cache=False,\n                cache=None):\n\n        outputs = self.gpt(input_ids,\n                           position_ids=position_ids,\n                           attention_mask=attention_mask,\n                           use_cache=use_cache,\n                           cache=cache)\n        if use_cache:\n            encoder_outputs, cached_kvs = outputs[:2]\n        else:\n        
    encoder_outputs = outputs\n\n        logits = parallel_matmul(\n            encoder_outputs,\n            get_attr(self.gpt.embeddings.word_embeddings, \"weight\"), True)\n\n        if use_cache:\n            return logits, cached_kvs\n        else:\n            return logits\n\n\nclass GPTPretrainingCriterionHybird(nn.Layer):\n    \"\"\"\n    Criterion for GPT. It calculates the final loss.\n    \"\"\"\n\n    def __init__(self, topo=None, sequence_parallel=False):\n        super(GPTPretrainingCriterionHybird, self).__init__()\n        self.loss_func = paddle.nn.CrossEntropyLoss(reduction=\"none\")\n        self.parallel_loss_func = \\\n            fleet.meta_parallel.ParallelCrossEntropy(mp_group=env.get_hcg().get_model_parallel_group())\n        self.sequence_parallel = sequence_parallel\n\n    def forward(self, prediction_scores, masked_lm_labels, loss_mask):\n        \"\"\"\n        Args:\n            prediction_scores(Tensor):\n                The logits of masked token prediction. Its data type should be float32 and\n                its shape is [batch_size, sequence_length, vocab_size].\n            masked_lm_labels(Tensor):\n                The labels of the masked language modeling, the dimensionality of `masked_lm_labels`\n                is equal to `prediction_scores`. Its data type should be int64 and\n                its shape is [batch_size, sequence_length, 1].\n            loss_mask(Tensor):\n                Mask used for calculating the loss of the masked language modeling to avoid\n                calculating some unwanted tokens.\n                Its data type should be float32 and its shape is [batch_size, sequence_length, 1].\n\n        Returns:\n            Tensor: The pretraining loss. 
Its data type should be float32 and its shape is [1].\n\n        \"\"\"\n        hcg = env.get_hcg()\n        mp_size = hcg.get_model_parallel_world_size()\n        if self.sequence_parallel:\n            masked_lm_labels = masked_lm_labels.transpose([1, 0])\n            loss_mask = loss_mask.transpose([1, 0])\n\n        if mp_size > 1:\n            if paddle.is_compiled_with_cuda() and True:\n                masked_lm_loss = self.parallel_loss_func(\n                    prediction_scores, masked_lm_labels.unsqueeze(2))\n            else:\n                prediction_scores = ConcatSoftmaxInput.apply(\n                    prediction_scores,\n                    group=env.get_hcg().get_model_parallel_group())\n                masked_lm_loss = self.loss_func(prediction_scores,\n                                                masked_lm_labels.unsqueeze(2))\n        else:\n            masked_lm_loss = self.loss_func(prediction_scores,\n                                            masked_lm_labels.unsqueeze(2))\n        loss_mask = loss_mask.reshape([-1])\n        masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask)\n        loss = masked_lm_loss / loss_mask.sum()\n        return loss\n\n\n# these Layers is just for PipelineParallel\n\n\nclass GPTPretrainingCriterionPipe(GPTPretrainingCriterionHybird):\n    \"\"\"Extends GPTPretrainingCriterion to meet the input standard.\"\"\"\n\n    def forward(self, prediction_scores, args):\n        masked_lm_labels = args[0]\n        loss_mask = args[1]\n        loss = super().forward(prediction_scores, masked_lm_labels, loss_mask)\n        return loss\n\n\nclass EmbeddingPipe(GPTEmbeddings):\n    \"\"\"Extends GPTEmbeddings to forward attention_mask through the pipeline.\"\"\"\n\n    @property\n    def embedding_weight(self):\n        return get_attr(self.word_embeddings, \"weight\")\n\n    def forward(self, tensors):\n        input_ids, position_ids = tensors\n        embeddings = super().forward(\n            
input_ids=input_ids, position_ids=position_ids)\n        return embeddings\n\n\nclass LayerNormPipe(nn.Layer):\n    def __init__(self,\n                 normalized_shape,\n                 epsilon=1e-05,\n                 weight_attr=None,\n                 bias_attr=None,\n                 name=None,\n                 sequence_parallel=False,\n                 is_last=False):\n        super(LayerNormPipe, self).__init__()\n        self.sequence_parallel = sequence_parallel\n        self.is_last = is_last\n        self.norm = nn.LayerNorm(\n            normalized_shape=normalized_shape,\n            epsilon=epsilon,\n            weight_attr=weight_attr,\n            bias_attr=bias_attr,\n            name=name)\n        if self.sequence_parallel:\n            mark_as_sequence_parallel_parameter(self.norm.weight)\n            mark_as_sequence_parallel_parameter(self.norm.bias)\n\n    def forward(self, input):\n        output = self.norm(input)\n        if self.sequence_parallel and self.is_last:\n            output = GatherOp.apply(output)\n        return output\n\n\nclass GPTForPretrainingPipe(PipelineLayer):\n    \"\"\"GPTForPretraining adapted for pipeline parallelism.\n\n    The largest change is flattening the GPTModel class so we can express it as a\n    sequence of layers including embedding, transformer layers, and output.\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 num_layers=12,\n                 num_attention_heads=12,\n                 ffn_hidden_size=3072,\n                 hidden_act=\"gelu\",\n                 hidden_dropout_prob=0.1,\n                 attention_probs_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=16,\n                 initializer_range=0.02,\n                 num_partitions=1,\n                 topology=None,\n                 use_recompute=False,\n                 fused_linear=False,\n                 
fuse_attn_qkv=False,\n                 scale_qk_by_layer_num=True,\n                 moe_configs=None,\n                 recompute_granularity=\"full\",\n                 virtual_pp_degree=1,\n                 sequence_parallel=False,\n                 no_recompute_layers=None,\n                 pp_recompute_interval=1,\n                 use_flash_attn=False,\n                 fused_softmax_with_triangular=False):\n\n        # forward desc\n        self.descs = []\n\n        if no_recompute_layers is None:\n            no_recompute_layers = []\n        else:\n            if recompute_granularity == 'full':\n                assert len(no_recompute_layers) == 0, \\\n                    \"for pp with full recompute, no_recompute_layers is not support\"\n\n        if use_flash_attn:\n            if flash_attention:\n                logger.info(\"Flash-attention enabled.\")\n            else:\n                use_flash_attn = False\n                logger.warning(\n                    \"Flash-attention is not support in this Paddle version.\")\n\n        hcg = env.get_hcg()\n        mp_size = hcg.get_model_parallel_world_size()\n        if mp_size <= 1:\n            sequence_parallel = False\n            logging.warning(\n                \"If mp_size <= 1, sequence_parallel strategy will be turned off in GPTForPretrainingPipe model.\"\n            )\n\n        self.descs.append(\n            SharedLayerDesc(\n                'embed',\n                EmbeddingPipe,\n                shared_weight_attr='embedding_weight',\n                vocab_size=vocab_size,\n                hidden_size=hidden_size,\n                hidden_dropout_prob=hidden_dropout_prob,\n                max_position_embeddings=max_position_embeddings,\n                type_vocab_size=type_vocab_size,\n                initializer_range=0.02,\n                sequence_parallel=sequence_parallel))\n\n        for i in range(num_layers):\n            self.descs.append(\n                LayerDesc(\n       
             TransformerDecoderLayer,\n                    d_model=hidden_size,\n                    nhead=num_attention_heads,\n                    dim_feedforward=ffn_hidden_size,\n                    dropout=hidden_dropout_prob,\n                    activation=hidden_act,\n                    attn_dropout=attention_probs_dropout_prob,\n                    act_dropout=hidden_dropout_prob,\n                    weight_attr=paddle.ParamAttr(\n                        initializer=nn.initializer.Normal(\n                            mean=0.0, std=initializer_range)),\n                    output_layer_weight_attr=paddle.\n                    ParamAttr(initializer=nn.initializer.Normal(\n                        mean=0.0,\n                        std=initializer_range / math.sqrt(2.0 * num_layers))),\n                    bias_attr=None,\n                    num_partitions=num_partitions,\n                    moe_configs=moe_configs,\n                    fused_linear=fused_linear,\n                    fuse_attn_qkv=fuse_attn_qkv,\n                    scale_qk_coeff=num_layers\n                    if scale_qk_by_layer_num else 1.0,\n                    use_recompute=use_recompute,\n                    recompute_granularity=recompute_granularity,\n                    sequence_parallel=sequence_parallel,\n                    do_recompute=i not in no_recompute_layers,\n                    use_flash_attn=use_flash_attn))\n\n        self.descs.append(\n            LayerDesc(\n                LayerNormPipe,\n                normalized_shape=hidden_size,\n                sequence_parallel=sequence_parallel,\n                is_last=True))\n\n        def _logits_helper(embedding, output):\n            return parallel_matmul(output, embedding.embedding_weight, True)\n\n        self.descs.append(\n            SharedLayerDesc(\n                'embed',\n                EmbeddingPipe,\n                forward_func=_logits_helper,\n                shared_weight_attr='embedding_weight',\n 
               vocab_size=vocab_size,\n                hidden_size=hidden_size,\n                hidden_dropout_prob=hidden_dropout_prob,\n                max_position_embeddings=max_position_embeddings,\n                type_vocab_size=type_vocab_size,\n                initializer_range=0.02))\n\n        recompute_interval = 0\n        if use_recompute and recompute_granularity == \"full\":\n            assert pp_recompute_interval <= \\\n                   num_layers // (virtual_pp_degree *\n                                  env.get_hcg().topology().get_dim_size(\"pipe\")), \\\n                \"pp recompute interval should smaller than num layers of each pp chunk\"\n            recompute_interval = pp_recompute_interval\n\n        seg_method = \"layer:TransformerDecoderLayer\"\n        if num_layers % env.get_hcg().topology().get_dim_size(\"pipe\") != 0:\n            seg_method = \"uniform\"\n\n        super().__init__(\n            layers=self.descs,\n            loss_fn=GPTPretrainingCriterionPipe(\n                sequence_parallel=sequence_parallel),\n            topology=env.get_hcg().topology(),\n            seg_method=seg_method,\n            recompute_interval=recompute_interval,\n            recompute_ctx={\n                \"mp_group\": env.get_hcg().get_model_parallel_group(),\n                \"offload\": False,\n                \"partition\": False,\n            },\n            num_virtual_pipeline_stages=virtual_pp_degree)\n\n\nclass GPTForGenerationHybrid(nn.Layer):\n    \"\"\"\n    GPT Model with pretraining tasks on top.\n\n    Args:\n        gpt (:class:`GPTModel`):\n            An instance of :class:`GPTModel`.\n\n    \"\"\"\n\n    def __init__(self, gpt, configs):\n        super(GPTForGenerationHybrid, self).__init__()\n        self.gpt = gpt\n        # extra_parameters using for sharding stage3 to register extra_parameters\n        self.extra_parameters = [\n            get_attr(self.gpt.embeddings.word_embeddings, \"weight\")\n        ]\n       
 self.configs = configs\n\n        self.max_length = self.configs.get('max_dec_len', 20)\n        self.min_length = self.configs.get('min_dec_len', 0)\n        self.decode_strategy = self.configs.get('decode_strategy', 'sampling')\n        self.temperature = self.configs.get('temperature', 1.0)\n        self.top_k = self.configs.get('top_k', 0)\n        self.top_p = self.configs.get('top_p', 1.0)\n        self.repetition_penalty = self.configs.get('repetition_penalty', 1.0)\n        self.num_beams = self.configs.get('num_beams', 1)\n        self.num_beam_groups = self.configs.get('num_beam_groups', 1)\n        self.length_penalty = self.configs.get('length_penalty', 0.0)\n        self.early_stopping = self.configs.get('early_stopping', False)\n        self.bos_token_id = self.configs.get('bos_token_id', None)\n        self.eos_token_id = self.configs.get('eos_token_id', None)\n        self.pad_token_id = self.configs.get('pad_token_id', None)\n        self.decoder_start_token_id = self.configs.get(\n            'decoder_start_token_id', None)\n        self.forced_bos_token_id = self.configs.get('forced_bos_token_id',\n                                                    None)\n        self.forced_eos_token_id = self.configs.get('forced_eos_token_id',\n                                                    None)\n        self.num_return_sequences = self.configs.get('num_return_sequences', 1)\n        self.diversity_rate = self.configs.get('diversity_rate', 0.0)\n        self.use_cache = self.configs.get('use_cache', True)\n\n    def prepare_input_ids_for_generation(self,\n                                         bos_token_id,\n                                         encoder_output=None):\n        batch_size = 1\n        if bos_token_id is None:\n            raise ValueError(\"`bos_token_id` should be defined when no \"\n                             \"`input_ids` are provided.\")\n        if encoder_output is not None:\n            batch_size = encoder_output.shape[0]\n 
       return paddle.ones([batch_size, 1], dtype=\"int64\") * bos_token_id\n\n    def prepare_attention_mask_for_generation(self, input_ids, pad_token_id,\n                                              eos_token_id):\n        is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(\n            input_ids == pad_token_id).numpy().item()\n        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (\n            (eos_token_id is not None) and (pad_token_id != eos_token_id))\n        if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id:\n            attention_mask = (input_ids == pad_token_id\n                              ).astype(paddle.get_default_dtype()) * -1e9\n        else:\n            attention_mask = paddle.zeros_like(\n                input_ids, dtype=paddle.get_default_dtype())\n        return paddle.unsqueeze(attention_mask, axis=[1, 2])\n\n    def update_scores_for_generation(self, scores, next_scores, length,\n                                     unfinished_flag):\n        # update scores\n\n        unfinished_scores = (scores * length + next_scores) / (length + 1)\n        scores = paddle.where(unfinished_flag, unfinished_scores, scores)\n        return scores\n\n    def get_logits_processor(self,\n                             min_length=None,\n                             max_length=None,\n                             eos_token_id=None,\n                             forced_bos_token_id=None,\n                             forced_eos_token_id=None,\n                             num_beams=1,\n                             num_beam_groups=1,\n                             diversity_rate=0.0,\n                             repetition_penalty=None):\n        processors = LogitsProcessorList()\n\n        if min_length is not None and eos_token_id is not None and min_length > -1:\n            processors.append(\n                MinLengthLogitsProcessor(min_length, eos_token_id))\n        if num_beam_groups > 1 
and diversity_rate > 0.0:\n            processors.append(\n                HammingDiversityLogitsProcessor(\n                    diversity_rate=diversity_rate,\n                    num_beams=num_beams,\n                    num_beam_groups=num_beam_groups))\n        if repetition_penalty is not None and repetition_penalty != 1.0:\n            processors.append(\n                RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))\n        if forced_bos_token_id is not None:\n            processors.append(\n                ForcedBOSTokenLogitsProcessor(forced_bos_token_id))\n        if forced_eos_token_id is not None:\n            processors.append(\n                ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id))\n        # TODO\n        # Add more pre_processing for distribution\n\n        return processors\n\n    def expand_inputs_for_generation(self,\n                                     input_ids,\n                                     expand_size,\n                                     attention_mask=None,\n                                     **model_kwargs):\n\n        index = paddle.tile(\n            paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1),\n            [1, expand_size]).reshape([-1])\n\n        input_ids = paddle.gather(input_ids, index)\n\n        if attention_mask is not None:\n            model_kwargs[\"attention_mask\"] = paddle.gather(attention_mask,\n                                                           index)\n\n        if \"token_type_ids\" in model_kwargs and model_kwargs[\n                \"token_type_ids\"] is not None:\n            token_type_ids = model_kwargs[\"token_type_ids\"]\n            model_kwargs[\"token_type_ids\"] = paddle.gather(token_type_ids,\n                                                           index)\n\n        if \"position_ids\" in model_kwargs and model_kwargs[\n                \"position_ids\"] is not None:\n            position_ids = model_kwargs[\"position_ids\"]\n          
  model_kwargs[\"position_ids\"] = paddle.gather(position_ids, index)\n\n        if \"seq_len\" in model_kwargs and model_kwargs[\"seq_len\"] is not None:\n            seq_len = model_kwargs[\"seq_len\"]\n            model_kwargs[\"seq_len\"] = paddle.gather(seq_len, index)\n\n        if \"encoder_output\" in model_kwargs and model_kwargs[\n                \"encoder_output\"] is not None:\n            encoder_output = model_kwargs[\"encoder_output\"]\n            model_kwargs[\"encoder_output\"] = paddle.gather(encoder_output,\n                                                           index)\n\n        if \"role_ids\" in model_kwargs and model_kwargs[\"role_ids\"] is not None:\n            role_ids = model_kwargs[\"role_ids\"]\n            model_kwargs[\"role_ids\"] = paddle.gather(role_ids, index)\n\n        return input_ids, model_kwargs\n\n    def prepare_inputs_for_generation(self,\n                                      input_ids,\n                                      use_cache=False,\n                                      cache=None,\n                                      **kwargs):\n        # only last token for inputs_ids if cache is defined in kwargs\n        position_ids = kwargs.get(\"position_ids\", None)\n        attention_mask = kwargs.get(\"attention_mask\", None)\n        if attention_mask is not None:\n            if len(attention_mask.shape) == 4:\n                attention_mask = attention_mask[:, -1, -1, :]\n            if \"int\" in paddle.common_ops_import.convert_dtype(\n                    attention_mask.dtype):\n                attention_mask = (1.0 - attention_mask) * -1e4\n        if cache is not None:\n            input_ids = input_ids[:, -1].unsqueeze(-1)\n            if position_ids is not None:\n                position_ids = position_ids[:, -1].unsqueeze(-1)\n        return {\n            \"input_ids\": input_ids,\n            \"position_ids\": position_ids,\n            \"attention_mask\": attention_mask,\n            \"cache\": 
cache\n        }\n\n    def update_model_kwargs_for_generation(self,\n                                           outputs,\n                                           model_kwargs,\n                                           is_encoder_decoder=False):\n        # Update the model inputs during generation.\n        # Note that If `token_type_ids` and `attention_mask` in `model_kwargs`\n        # and they contain pad value, the result vectors updated by this method\n        # may be different from expected. In this case, you need to rewrite the\n        # method.\n\n        # update cache\n        if isinstance(outputs, tuple):\n            model_kwargs[\"cache\"] = outputs[1]\n\n        # update token_type_ids with last value\n        if \"token_type_ids\" in model_kwargs and model_kwargs[\n                \"token_type_ids\"] is not None:\n            token_type_ids = model_kwargs[\"token_type_ids\"]\n            model_kwargs[\"token_type_ids\"] = paddle.concat(\n                [token_type_ids, token_type_ids[:, -1:]], axis=-1)\n\n        # update position_ids\n        if \"position_ids\" in model_kwargs and model_kwargs[\n                \"position_ids\"] is not None:\n            position_ids = model_kwargs[\"position_ids\"]\n            model_kwargs[\"position_ids\"] = paddle.concat(\n                [position_ids, position_ids[:, -1:] + 1], axis=-1)\n\n        # update attention_mask\n        if not is_encoder_decoder and \"attention_mask\" in model_kwargs:\n            attention_mask = model_kwargs[\"attention_mask\"]\n            # nn.Pad2D don't support the data type `bool`\n            if convert_dtype(attention_mask.dtype) == 'bool':\n                attention_mask = paddle.cast(attention_mask, 'int64')\n            if len(attention_mask.shape) == 4:\n                attention_mask = nn.Pad2D(\n                    [0, 0, 0, 1], mode='replicate')(attention_mask)\n                attention_mask = nn.Pad2D(\n                    [0, 1, 0, 0], 
value=-1e4)(attention_mask)\n                dtype = convert_dtype(attention_mask.dtype)\n                if 'int' in dtype:\n                    attention_mask[:, :, -1, -1] = 1\n                elif 'float' in dtype:\n                    attention_mask[:, :, -1, -1] = 0.0\n                else:\n                    raise ValueError(\n                        'The data type of input `attention_mask` must '\n                        'be bool, int or float')\n            else:\n                attention_mask = paddle.concat(\n                    [\n                        attention_mask, paddle.ones(\n                            [attention_mask.shape[0], 1], dtype=\"int64\")\n                    ],\n                    axis=-1)\n            model_kwargs[\"attention_mask\"] = attention_mask\n\n        # update role_ids\n        if \"role_ids\" in model_kwargs and model_kwargs[\"role_ids\"] is not None:\n            role_ids = model_kwargs[\"role_ids\"]\n            model_kwargs[\"role_ids\"] = paddle.concat(\n                [role_ids, role_ids[:, -1:]], axis=-1)\n\n        return model_kwargs\n\n    def sample(self,\n               input_ids,\n               logits_processors,\n               max_length,\n               pad_token_id,\n               eos_token_id,\n               top_k=None,\n               top_p=None,\n               temperature=None,\n               min_tokens_to_keep=1,\n               **model_kwargs):\n        def TopKProcess(probs, top_k, min_tokens_to_keep):\n            top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1])\n            # Remove all tokens with a probability less than the last token of the top-k\n            topk_probs, _ = paddle.topk(probs, k=top_k)\n            probs = paddle.where(probs >= topk_probs[:, -1:], probs,\n                                 paddle.full_like(probs, 0.0))\n            return probs\n\n        def TopPProcess(probs, top_p, min_tokens_to_keep):\n            sorted_probs = paddle.sort(probs, 
descending=True)\n            sorted_indices = paddle.argsort(probs, descending=True)\n            cumulative_probs = paddle.cumsum(sorted_probs, axis=-1)\n\n            # Remove tokens with cumulative probs above the top_p, But keep at\n            # least min_tokens_to_keep tokens\n            sorted_indices_to_remove = cumulative_probs > top_p\n            if min_tokens_to_keep > 1:\n                # Set 'min_tokens_to_keep - 1' because the first token is kept\n                sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0\n            # Keep the first token\n            sorted_indices_to_remove = paddle.cast(\n                sorted_indices_to_remove, dtype='int64')\n            sorted_indices_to_remove[:, 1:] = (\n                sorted_indices_to_remove[:, :-1].clone())\n            sorted_indices_to_remove[:, 0] = 0\n\n            # Scatter sorted tensors to original indexing\n            sorted_indices = sorted_indices + paddle.arange(probs.shape[\n                0]).unsqueeze(-1) * probs.shape[-1]\n            condition = paddle.scatter(sorted_indices_to_remove.flatten(),\n                                       sorted_indices.flatten(),\n                                       sorted_indices_to_remove.flatten())\n            condition = paddle.cast(condition, 'bool').reshape(probs.shape)\n            probs = paddle.where(condition,\n                                 paddle.full_like(probs, 0.0), probs)\n            return probs\n\n        batch_size, cur_len = input_ids.shape\n        origin_len = input_ids.shape[1]\n        unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool')\n        scores = paddle.full(\n            [batch_size, 1], 0.0, dtype=paddle.get_default_dtype())\n\n        # use_cache is immutable, we split it off other mutable kwargs.\n        assert 'use_cache' in model_kwargs\n        immutable = {'use_cache': model_kwargs['use_cache']}\n        del model_kwargs['use_cache']\n\n        def _forward_(**args):\n        
    model_inputs = self.prepare_inputs_for_generation(\n                input_ids, **args, **immutable)\n            return self.gpt(**model_inputs, **immutable)\n\n        def _post_process_(outputs, input_ids, cur_len, origin_len, scores,\n                           unfinished_flag, model_kwargs):\n\n            logits = outputs[0] if isinstance(outputs, tuple) else outputs\n\n            logits = parallel_matmul(\n                logits,\n                get_attr(self.gpt.embeddings.word_embeddings, \"weight\"), False)\n\n            # [batch_size, vocab_size]\n            logits = logits[:, -1, :]\n\n            # pre-process distribution\n            logits = logits_processors(input_ids, logits)\n\n            # sample\n            origin_probs = F.softmax(logits)\n            origin_probs = paddle.log(origin_probs)\n            if temperature is not None and temperature != 1.0:\n                logits = logits / temperature\n            probs = F.softmax(logits)\n            if top_k is not None and top_k != 0:\n                probs = TopKProcess(probs, top_k, min_tokens_to_keep)\n            if top_p is not None and top_p < 1.0:\n                probs = TopPProcess(probs, top_p, min_tokens_to_keep)\n            next_tokens = paddle.multinomial(probs)\n\n            next_scores = paddle.index_sample(origin_probs, next_tokens)\n\n            if eos_token_id is not None:\n                next_tokens = paddle.where(\n                    unfinished_flag, next_tokens,\n                    paddle.full_like(next_tokens, pad_token_id))\n\n            scores = self.update_scores_for_generation(\n                scores, next_scores, cur_len - origin_len, unfinished_flag)\n\n            input_ids = paddle.concat([input_ids, next_tokens], axis=1)\n\n            if eos_token_id is not None:\n                unfinished_flag = paddle.logical_and(\n                    unfinished_flag, next_tokens != eos_token_id)\n\n            model_kwargs = 
self.update_model_kwargs_for_generation(\n                outputs,\n                model_kwargs,\n                is_encoder_decoder=self.is_encoder_decoder)\n\n            return input_ids, scores, unfinished_flag, model_kwargs\n\n        # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement\n        # the value in model_kwargs should be tensor before while loop\n        outputs = _forward_(**model_kwargs)\n\n        input_ids, scores, unfinished_flag, model_kwargs = _post_process_(\n            outputs, input_ids, cur_len, origin_len, scores, unfinished_flag,\n            model_kwargs)\n        cur_len += 1\n\n        attn_mask = model_kwargs['attention_mask']\n        # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static.\n        model_kwargs['attention_mask'] = paddle.reshape(\n            attn_mask, paddle.shape(attn_mask))\n        model_kwargs['cache'] = outputs[1] if isinstance(outputs,\n                                                         tuple) else None\n        while cur_len < max_length:\n            # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs)\n            # and change it to pass directly to _post_process_ to avoid\n            # closed-loop problem of dynamic-to-static model\n            input_ids, scores, unfinished_flag, model_kwargs = _post_process_(\n                _forward_(**model_kwargs), input_ids, cur_len, origin_len,\n                scores, unfinished_flag, model_kwargs)\n            cur_len += 1\n\n            if not paddle.any(unfinished_flag):\n                break\n\n        return input_ids[:, origin_len:], scores\n\n    def forward(self, input_ids=None, **model_kwargs):\n\n        max_length = self.max_length\n        min_length = self.min_length\n        decode_strategy = self.decode_strategy\n        temperature = self.temperature\n        top_k = self.top_k\n        top_p = self.top_p\n        repetition_penalty = self.repetition_penalty\n        num_beams = 
self.num_beams\n        num_beam_groups = self.num_beam_groups\n        length_penalty = self.length_penalty\n        early_stopping = self.early_stopping\n        bos_token_id = self.bos_token_id\n        eos_token_id = self.eos_token_id\n        pad_token_id = self.pad_token_id\n        decoder_start_token_id = self.decoder_start_token_id\n        forced_bos_token_id = self.forced_bos_token_id\n        forced_eos_token_id = self.forced_eos_token_id\n        num_return_sequences = self.num_return_sequences\n        diversity_rate = self.diversity_rate\n        use_cache = self.use_cache\n\n        assert (\n            decode_strategy in [\"greedy_search\", \"sampling\", \"beam_search\"]\n        ), \"`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.\".format(\n            decode_strategy)\n\n        bos_token_id = bos_token_id if bos_token_id is not None else getattr(\n            self.gpt, 'bos_token_id', None)\n        eos_token_id = eos_token_id if eos_token_id is not None else getattr(\n            self.gpt, 'eos_token_id', None)\n        pad_token_id = pad_token_id if pad_token_id is not None else getattr(\n            self.gpt, 'pad_token_id', None)\n        forced_bos_token_id = forced_bos_token_id if forced_bos_token_id is not None else getattr(\n            self.gpt, 'forced_bos_token_id', None)\n        forced_eos_token_id = forced_eos_token_id if forced_eos_token_id is not None else getattr(\n            self.gpt, 'forced_eos_token_id', None)\n        decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else getattr(\n            self.gpt, 'decoder_start_token_id', None)\n\n        # params check\n        if input_ids is None:\n            # Init `input_ids` with bos_token_id\n            input_ids = self.prepare_input_ids_for_generation(bos_token_id)\n\n        if model_kwargs.get(\"attention_mask\", None) is None:\n            # TODO\n            # Init `attention_mask` 
depending on `pad_token_id`\n            model_kwargs[\n                \"attention_mask\"] = self.prepare_attention_mask_for_generation(\n                    input_ids, pad_token_id, eos_token_id)\n        self.is_encoder_decoder = False\n\n        model_kwargs[\"use_cache\"] = use_cache\n\n        max_length += input_ids.shape[-1]\n        min_length += input_ids.shape[-1]\n\n        logits_processors = self.get_logits_processor(\n            min_length=min_length,\n            max_length=max_length,\n            eos_token_id=eos_token_id,\n            forced_bos_token_id=forced_bos_token_id,\n            forced_eos_token_id=forced_eos_token_id,\n            num_beams=num_beams,\n            num_beam_groups=num_beam_groups,\n            diversity_rate=diversity_rate,\n            repetition_penalty=repetition_penalty)\n\n        if decode_strategy == 'sampling':\n            if num_return_sequences > 1:\n                input_ids, model_kwargs = self.expand_inputs_for_generation(\n                    input_ids,\n                    expand_size=num_return_sequences,\n                    **model_kwargs)\n\n            ret = self.sample(input_ids, logits_processors, max_length,\n                              pad_token_id, eos_token_id, top_k, top_p,\n                              temperature, **model_kwargs)\n        else:\n            raise ValueError(f'Not support {decode_strategy} strategy yet!')\n        return ret\n\n\ndef get_triangle_upper_mask(x, mask):\n    if mask is not None:\n        return mask\n    mask = paddle.full_like(x, -np.inf)\n    mask.stop_gradient = True\n    mask = paddle.triu(mask, diagonal=1)\n    mask.stop_gradient = True\n    return mask\n\n\nclass ConcatSoftmaxInput(PyLayer):\n    @staticmethod\n    def forward(ctx, inp, group=None):\n        inputs = []\n        paddle.distributed.all_gather(inputs, inp, group=group)\n        with paddle.no_grad():\n            cat = paddle.concat(inputs, axis=-1)\n        ctx.cat_args = group\n      
  return cat\n\n    @staticmethod\n    def backward(ctx, grad):\n        group = ctx.cat_args\n        with paddle.no_grad():\n            grads = paddle.split(\n                grad, paddle.distributed.get_world_size(group), axis=-1)\n        grad = grads[paddle.distributed.get_rank(group)]\n        return grad\n"
  },
  {
    "path": "ppfleetx/models/language_model/gpt/dygraph/processor.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\nimport inspect\nfrom abc import ABC\n\nimport paddle\n\n\nclass LogitsProcessorList(List):\n    def __call__(self, input_ids, logits, **kwargs):\n        for processor in self:\n            processor_args = inspect.signature(processor.__call__).parameters\n            if len(processor_args) > 2:\n                assert all(\n                    arg in kwargs for arg in list(processor_args.keys())[2:]\n                ), f\"The parameters don't match for {processor.__class__}\"\n                logits = processor(input_ids, logits, **kwargs)\n            else:\n                logits = processor(input_ids, logits)\n        return logits\n\n\nclass LogitsProcessor(ABC):\n    \"\"\"\n    Abstract base class for all logit processors that can be applied during \n    generation.\n    \"\"\"\n\n    def __call__(self, input_ids, logits):\n        raise NotImplementedError(\n            f\"{self.__class__} is an abstract class. 
\"\n            \"Only classes inheriting this class can be called.\")\n\n\nclass MinLengthLogitsProcessor(LogitsProcessor):\n    r\"\"\"\n    Enforcing a min-length by setting EOS probability to 0.\n    Args:\n        min_length (int): The minimum length of generation sequence.\n        eos_token_id (int): The id of the `end-of-sequence` token.\n    \"\"\"\n\n    def __init__(self, min_length, eos_token_id):\n        if not isinstance(min_length, int) or min_length < 0:\n            raise ValueError(\n                \"`min_length` should be a positive integer, but get {}\".format(\n                    min_length))\n\n        if not isinstance(eos_token_id, int) or eos_token_id < 0:\n            raise ValueError(\n                \"`eos_token_id` should be a positive integer, but get {}\".\n                format(eos_token_id))\n\n        self.min_length = min_length\n        self.eos_token_id = eos_token_id\n\n    def __call__(self, input_ids, logits):\n        cur_len = input_ids.shape[-1]\n        if cur_len < self.min_length:\n            logits[:, self.eos_token_id] = -float(\"inf\")\n        return logits\n\n\nclass RepetitionPenaltyLogitsProcessor(LogitsProcessor):\n    r\"\"\"\n    Enforcing an exponential penalty on repeated sequences.\n    Args:\n        repetition_penalty (float):\n            The parameter for repetition penalty. 1.0 means no penalty. 
See `this paper\n            <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.\n    \"\"\"\n\n    def __init__(self, penalty: float):\n        if not isinstance(penalty, float) or not (penalty > 0):\n            raise ValueError(\n                f\"`penalty` has to be a strictly positive float, but is {penalty}\"\n            )\n\n        self.penalty = penalty\n\n    def __call__(self, input_ids, logits):\n        score = paddle.index_sample(logits, input_ids)\n        score = paddle.where(score < 0, score * self.penalty,\n                             score / self.penalty)\n        input_ids = input_ids + paddle.arange(logits.shape[0]).unsqueeze(\n            -1) * logits.shape[-1]\n        outputs = paddle.scatter(logits.flatten(),\n                                 input_ids.flatten(),\n                                 score.flatten()).reshape(logits.shape)\n        return outputs\n\n\nclass HammingDiversityLogitsProcessor(LogitsProcessor):\n    \"\"\"\n    This `LogitsProcessor` enforces diverse beam search. Note that this logits\n    processor is only effective for `group_beam_search`. See \n    `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details.\n    Args:\n        diversity_rate (float): This value is subtracted from a beam's score if \n            it generates a token same as any beam from other group at a particular \n            time. \n        num_beams (int): Number of beams used for group beam search. \n        num_beam_groups (int): Number of groups to divide `num_beams` into in order \n            to ensure diversity among different groups of beams. 
\n    \"\"\"\n\n    def __init__(self, diversity_rate, num_beams, num_beam_groups):\n        if not isinstance(diversity_rate, float) or (not diversity_rate > 0.0):\n            raise ValueError(\n                \"`diversity_rate` should be a float strictly larger than 0.\")\n        self._diversity_rate = diversity_rate\n        if not isinstance(num_beams, int) or num_beams < 2:\n            raise ValueError(\n                \"`num_beams` should be an integer strictly larger than 1.\")\n        self._num_beams = num_beams\n        if not isinstance(num_beam_groups, int) or num_beam_groups < 2:\n            raise ValueError(\n                \"`num_beam_groups` should be an integer strictly larger than 1.\"\n            )\n        self._num_sub_beams = num_beams // num_beam_groups\n\n    def __call__(self, input_ids, scores, current_tokens, beam_group_idx):\n        batch_size = current_tokens.shape[0] // self._num_beams\n        group_start_idx = beam_group_idx * self._num_sub_beams\n        group_end_idx = min(group_start_idx + self._num_sub_beams,\n                            self._num_beams)\n        group_size = group_end_idx - group_start_idx\n        vocab_size = scores.shape[-1]\n\n        if group_start_idx == 0:\n            return scores\n\n        for batch_idx in range(batch_size):\n            previous_group_tokens = current_tokens[\n                batch_idx * self._num_beams:batch_idx * self._num_beams +\n                group_start_idx]\n            token_frequency = paddle.bincount(\n                previous_group_tokens, minlength=vocab_size)\n            scores[batch_idx * group_size:(batch_idx + 1) *\n                   group_size] -= self._diversity_rate * token_frequency\n\n        return scores\n\n\nclass ForcedBOSTokenLogitsProcessor(LogitsProcessor):\n    \"\"\"\n    This `LogitsProcessor` enforces the first generated token to be the selected `forced_bos_token`.\n    Args:\n        forced_bos_token_id (:obj:`int`):\n            The id 
of the token to to be generated as the first token.\n    \"\"\"\n\n    def __init__(self, forced_bos_token_id):\n        self.forced_bos_token_id = forced_bos_token_id\n\n    def __call__(self, input_ids, scores):\n        cur_len = input_ids.shape[-1]\n        if cur_len == 1:\n            num_tokens = scores.shape[1]\n            scores[:, [\n                i for i in range(num_tokens) if i != self.forced_bos_token_id\n            ]] = -float(\"inf\")\n            scores[:, self.forced_bos_token_id] = 0\n        return scores\n\n\nclass ForcedEOSTokenLogitsProcessor(LogitsProcessor):\n    \"\"\"\n    This `LogitsProcessor` enforces the last generated token to be the selected `forced_eos_token`.\n    Args:\n        max_length (int): The maximum length of the sequence to be generated.\n        forced_eos_token_id (int): The id of the token to to be generated as the last token.\n    \"\"\"\n\n    def __init__(self, max_length, forced_eos_token_id):\n        self.max_length = max_length\n        self.forced_eos_token_id = forced_eos_token_id\n\n    def __call__(self, input_ids, scores):\n        cur_len = input_ids.shape[-1]\n        if cur_len == self.max_length - 1:\n            num_tokens = scores.shape[1]\n            scores[:, [\n                i for i in range(num_tokens) if i != self.forced_eos_token_id\n            ]] = -1e9  #TODO change back to -inf after paddle.topk is fixed\n            scores[:, self.forced_eos_token_id] = 0\n        return scores\n"
  },
  {
    "path": "ppfleetx/models/language_model/gpt/dygraph/sequence_parallel_utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport paddle\nfrom paddle import framework\nfrom paddle import distributed as dist\nfrom paddle.nn import functional as F\nfrom paddle.autograd import PyLayer\nfrom paddle.fluid import core\nfrom paddle.nn.layer.layers import Layer\nfrom paddle.distributed import fleet\nfrom paddle.distributed.fleet.base import topology as tp\nfrom paddle.distributed.fleet.meta_parallel import get_rng_state_tracker\nfrom paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients_with_group\n\nfrom ppfleetx.distributed.apis import env\n\nimport numpy as np\n\n####################################################\n#                                                  #\n#        Distributed Communication Operator        #\n#                                                  #\n####################################################\n\n\ndef scatter(input):\n    hcg = env.get_hcg()\n    group = hcg.get_model_parallel_group()\n    parallelism = group.nranks\n    rank = group.rank\n    seq_len = input.shape[0]\n    assert seq_len % parallelism == 0, \"Input sequence length {} can't be divided exactly by sequence parallelism {}\".format(\n        seq_len, parallelism)\n    interval = 
seq_len // parallelism\n    input = paddle.slice(\n        input,\n        axes=[0],\n        starts=[interval * rank],\n        ends=[interval * (rank + 1)])\n    return input\n\n\ndef all_gather(input):\n    hcg = env.get_hcg()\n    group = hcg.get_model_parallel_group()\n    parallelism = group.nranks\n    output_shape = input.shape\n    output_shape[0] = output_shape[0] * parallelism\n    output = paddle.empty(shape=output_shape, dtype=input.dtype)\n    group.process_group.all_gather(input, output).wait()\n    return output\n\n\ndef reduce_scatter(input):\n    hcg = env.get_hcg()\n    group = hcg.get_model_parallel_group()\n    parallelism = group.nranks\n    output_shape = input.shape\n    assert input.shape[\n        0] % parallelism == 0, \"Input sequence length {0} can't be divided exactly by sequence parallelism {1}\".format(\n            input.shape[0], parallelism)\n    output_shape[0] = output_shape[0] // parallelism\n    output = paddle.empty(shape=output_shape, dtype=input.dtype)\n    dist.stream.reduce_scatter(\n        output, input, op=dist.ReduceOp.SUM, group=group, sync_op=True)\n    return output\n\n\nclass ScatterOp(PyLayer):\n    # input shape: [s, b, h], n is mp parallelism\n    # after forward shape: [s/n, b, h]\n    @staticmethod\n    def forward(ctx, input):\n        return scatter(input)\n\n    @staticmethod\n    def backward(ctx, grad):\n        return all_gather(grad)\n\n\nclass GatherOp(PyLayer):\n    # input shape: [s/n, b, h], n is mp parallelism\n    # after forward shape: [s, b, h]\n    @staticmethod\n    def forward(ctx, input):\n        return all_gather(input)\n\n    @staticmethod\n    def backward(ctx, grad):\n        return scatter(grad)\n\n\n# All gather along the first dim during forward pass\n# All reduce and scatter along the first dim during backward pass\nclass AllGatherOp(PyLayer):\n    # input shape: [s/n, b, h], n is mp parallelism\n    # after forward shape: [s, b, h]\n    @staticmethod\n    def forward(ctx, 
input):\n        return all_gather(input)\n\n    # grad shape: [s, b, h], n is mp parallelism\n    # after forward shape: [s/n, b, h]\n    @staticmethod\n    def backward(ctx, grad):\n        return reduce_scatter(grad)\n\n\n# All reduce and scatter along the first dim during forward pass\n# All gather along the first dim during backward pass\nclass ReduceScatterOp(PyLayer):\n    # input shape: [s, b, h], n is mp parallelism\n    # after forward shape: [s/n, b, h]\n    @staticmethod\n    def forward(ctx, input):\n        return reduce_scatter(input)\n\n    # grad shape: [s/n, b, h], n is mp parallelism\n    # after forward shape: [s, b, h]\n    @staticmethod\n    def backward(ctx, grad):\n        return all_gather(grad)\n\n\n###################################################\n#                                                 #\n#        Modified Parallel Linear Operator        #\n#                                                 #\n###################################################\n\n\ndef mark_as_sequence_parallel_parameter(parameter):\n    setattr(parameter, 'sequence_parallel', True)\n\n\ndef is_sequence_parallel_parameter(parameter):\n    return getattr(parameter, 'sequence_parallel', False)\n\n\ndef create_fused_allreduce_gradient_hook(parameter_list, accumulation_steps):\n    hcg = env.get_hcg()\n    group = hcg.get_model_parallel_group()\n\n    step = [0]\n    accumulation_steps *= len(parameter_list)\n\n    def __impl__(grad):\n        step[0] += 1\n        if step[0] == accumulation_steps:\n            step[0] = 0\n            fused_allreduce_gradients_with_group(\n                parameter_list, group=group, scale=1.0)\n        return grad\n\n    return __impl__\n\n\ndef create_non_fused_allreduce_gradient_hook(param, accumulation_steps):\n    hcg = env.get_hcg()\n    pg = hcg.get_model_parallel_group().process_group\n    step = [0]\n\n    @paddle.autograd.no_grad()\n    def __impl__():\n        step[0] += 1\n        if (step[0] % accumulation_steps) 
== 0:\n            if hasattr(param, \"main_grad\"):\n                pg.allreduce(param.main_grad).wait()\n            else:\n                pg.allreduce(param.grad).wait()\n\n    return __impl__\n\n\ndef register_sequence_parallel_allreduce_hooks(\n        model, accumulation_steps, fuse_sequence_parallel_allreduce):\n    if accumulation_steps <= 0 or not paddle.distributed.is_initialized():\n        return\n\n    mp_group = env.get_hcg().get_model_parallel_group()\n    if mp_group.nranks <= 1:\n        return\n\n    params = []\n    for p in model.parameters():\n        if is_sequence_parallel_parameter(p):\n            params.append(p)\n\n    if fuse_sequence_parallel_allreduce:\n        hook = create_fused_allreduce_gradient_hook(params, accumulation_steps)\n        for p in params:\n            p._register_backward_hook(hook)\n    else:\n        for p in params:\n            hook = create_non_fused_allreduce_gradient_hook(p,\n                                                            accumulation_steps)\n            p._register_backward_hook(hook)\n\n\ndef is_fused_matmul_bias_supported():\n    if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():\n        return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue')\n    else:\n        return False\n\n\nclass ColumnSequenceParallelLinear(Layer):\n    def __init__(self,\n                 in_features,\n                 out_features,\n                 weight_attr=None,\n                 has_bias=None,\n                 gather_output=True,\n                 fuse_matmul_bias=False,\n                 mp_group=None,\n                 name=None):\n        super(ColumnSequenceParallelLinear, self).__init__()\n\n        hcg = env.get_hcg()\n        self.model_parallel_group = hcg.get_model_parallel_group(\n        ) if mp_group is None else mp_group\n        self.world_size = hcg.get_model_parallel_group(\n        ).nranks if mp_group is None else mp_group.nranks\n        self._name = name\n       
 self.is_mp = (self.world_size > 1)\n\n        assert gather_output is False, \"If sequence_parallel is True, \\\n                                        gather_output is False\"\n\n        self.gather_output = gather_output\n        assert out_features % self.world_size == 0, (\n            \"Number of column of the weight for linear ({}) must be\"\n            \" divisible by model parallel size ({})\".format(out_features,\n                                                            self.world_size))\n        self.output_size_per_partition = out_features // self.world_size\n\n        self._weight_attr = weight_attr\n        self._dtype = self._helper.get_default_dtype()\n\n        if self.is_mp and paddle.in_dynamic_mode():\n            with get_rng_state_tracker().rng_state():\n                self.weight = self.create_parameter(\n                    shape=[in_features, self.output_size_per_partition],\n                    attr=self._weight_attr,\n                    dtype=self._dtype,\n                    is_bias=False)\n        else:\n            self.weight = self.create_parameter(\n                shape=[in_features, self.output_size_per_partition],\n                attr=self._weight_attr,\n                dtype=self._dtype,\n                is_bias=False)\n\n        self.weight.is_distributed = True if self.is_mp else False\n\n        if has_bias:\n            # initialize bias to zero like Megatron\n            self.bias = self.create_parameter(\n                shape=[self.output_size_per_partition],\n                attr=paddle.nn.initializer.Constant(value=0.0),\n                dtype=self._dtype,\n                is_bias=True)\n            self.bias.is_distributed = True if self.is_mp else False\n        else:\n            self.bias = None\n\n        self.linear = F.linear\n\n        if fuse_matmul_bias:\n            if not is_fused_matmul_bias_supported():\n                raise NotImplementedError(\n                    \"You set fuse_matmul_bias=True 
in ColumnSequenceParallelLinear, \"\n                    \"however, the paddle you are using not support this operation. \"\n                    \"Please set fuse_matmul_bias=False or use paddle compiled \"\n                    \"with cuda 11.6 or higher.\")\n            from paddle.incubate.nn.functional import fused_linear\n            self.linear = fused_linear\n\n    def forward(self, x):\n        # sequence parallelism is same as model parallelism\n        # if sequence parallel is true, input shape is [s, b, h]\n        # else input shape is [b, s, h]\n        if self.is_mp:\n            input_parallel = AllGatherOp.apply(x)\n        else:\n            input_parallel = x\n        output = self.linear(\n            input_parallel, self.weight, self.bias, name=self._name)\n        return output\n\n\nclass RowSequenceParallelLinear(Layer):\n    def __init__(self,\n                 in_features,\n                 out_features,\n                 weight_attr=None,\n                 has_bias=True,\n                 input_is_parallel=False,\n                 fuse_matmul_bias=False,\n                 mp_group=None,\n                 name=None):\n        super(RowSequenceParallelLinear, self).__init__()\n\n        self.in_features = in_features\n        self.out_features = out_features\n        assert input_is_parallel is True, \"If sequence_parallel is True, \\\n                                           input_is_parallel should be true.\"\n\n        self.input_is_parallel = input_is_parallel\n        self._weight_attr = weight_attr\n        self._dtype = self._helper.get_default_dtype()\n        self._name = name\n\n        hcg = env.get_hcg()\n        self.model_parallel_group = hcg.get_model_parallel_group(\n        ) if mp_group is None else mp_group\n        self.world_size = hcg.get_model_parallel_group(\n        ).nranks if mp_group is None else mp_group.nranks\n        self.rank = hcg.get_model_parallel_group(\n        ).rank if mp_group is None else 
mp_group.rank\n\n        self.is_mp = (self.world_size > 1)\n        assert in_features % self.world_size == 0, (\n            \"Number of row of the weight for linear ({}) must be\"\n            \" divisible by model parallel size ({})\".format(in_features,\n                                                            self.world_size))\n\n        self.input_size_per_partition = in_features // self.world_size\n\n        if self.is_mp and paddle.in_dynamic_mode():\n            with get_rng_state_tracker().rng_state():\n                self.weight = self.create_parameter(\n                    shape=[self.input_size_per_partition, self.out_features],\n                    attr=self._weight_attr,\n                    dtype=self._dtype,\n                    is_bias=False)\n        else:\n            self.weight = self.create_parameter(\n                shape=[self.input_size_per_partition, self.out_features],\n                attr=self._weight_attr,\n                dtype=self._dtype,\n                is_bias=False)\n\n        self.weight.is_distributed = True if self.is_mp else False\n\n        # if sequence parallel is true,\n        # register hook to all_reduce gradient of weight and bias\n        if has_bias:\n            self.bias = self.create_parameter(\n                shape=[self.out_features],\n                attr=paddle.nn.initializer.Constant(value=0.0),\n                dtype=self._dtype,\n                is_bias=True)\n            if self.is_mp:\n                mark_as_sequence_parallel_parameter(self.bias)\n        else:\n            self.bias = None\n\n        self.linear = F.linear\n\n        if fuse_matmul_bias:\n            if not is_fused_matmul_bias_supported():\n                raise NotImplementedError(\n                    \"You set fuse_matmul_bias=True in RowParallelLinear, \"\n                    \"however, the paddle you are using not support this operation. 
\"\n                    \"Please set fuse_matmul_bias=False or use paddle compiled \"\n                    \"with cuda 11.6 or higher.\")\n            from paddle.incubate.nn.functional import fused_linear\n            self.linear = fused_linear\n\n    def forward(self, x):\n        input_parallel = x\n        if self.is_mp:\n            output_parallel = self.linear(\n                input_parallel, self.weight, name=self._name)\n            output_ = ReduceScatterOp.apply(output_parallel)\n            # if self.bias is not none, sequence parallel will use\n            # register_hook to all_reduce self.bias\n            output = output_ + self.bias if self.bias is not None else output_\n        else:\n            output = self.linear(\n                input_parallel, self.weight, self.bias, name=self._name)\n        return output\n"
  },
  {
    "path": "ppfleetx/models/language_model/gpt/dygraph/single_model.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.\n# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport collections\nimport logging\nfrom distutils.util import strtobool\nimport os\nimport numpy as np\nimport math\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nimport paddle.tensor as tensor\nfrom paddle.fluid import layers\nfrom paddle.nn.layer.transformer import _convert_param_attr_to_list\nfrom paddle.common_ops_import import convert_dtype\nimport paddle.incubate as incubate\nfrom paddle.distributed.fleet.utils import recompute\nfrom paddle.incubate.nn import FusedLinear\nfrom .processor import (\n    LogitsProcessorList, MinLengthLogitsProcessor,\n    HammingDiversityLogitsProcessor, RepetitionPenaltyLogitsProcessor,\n    ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor)\n\nfrom ppfleetx.models.language_model.moe import MoELayer\nfrom ppfleetx.models.language_model.moe_exp.layer import MoE\n\nfrom ppfleetx.utils.log import logger\ntry:\n    from paddle.nn.functional.flash_attention import flash_attention\nexcept:\n    flash_attention = None\n\n\ndef get_attr(layer, name):\n    if getattr(layer, name, None) is not None:\n        return getattr(layer, name, None)\n    else:\n        return get_attr(layer._layer, name)\n\n\nclass ExpertLayer(nn.Layer):\n   
 def __init__(self, d_model, d_hidden, name=None):\n        super(ExpertLayer, self).__init__()\n\n        self.htoh4 = nn.Linear(\n            d_model,\n            d_hidden,\n            weight_attr=nn.initializer.KaimingUniform(),\n            bias_attr=nn.initializer.Constant(value=0.0))\n        self.h4toh = nn.Linear(\n            d_hidden,\n            d_model,\n            weight_attr=nn.initializer.KaimingUniform(),\n            bias_attr=nn.initializer.Constant(value=0.0))\n\n        self.htoh4.weight.name = \"expert_\" + self.htoh4.weight.name\n        self.h4toh.weight.name = \"expert_\" + self.h4toh.weight.name\n        self.htoh4.bias.name = \"expert_\" + self.htoh4.bias.name\n        self.h4toh.bias.name = \"expert_\" + self.h4toh.bias.name\n\n    def forward(self, x):\n        x = self.htoh4(x)\n        x = F.gelu(x, approximate=True)\n        x = self.h4toh(x)\n        return x\n\n\nclass MultiHeadAttention(nn.Layer):\n    \"\"\"\n    Attention mapps queries and a set of key-value pairs to outputs, and\n    Multi-Head Attention performs multiple parallel attention to jointly attending\n    to information from different representation subspaces.\n\n    \"\"\"\n\n    Cache = collections.namedtuple(\"Cache\", [\"k\", \"v\"])\n    StaticCache = collections.namedtuple(\"StaticCache\", [\"k\", \"v\"])\n\n    def __init__(self,\n                 embed_dim,\n                 num_heads,\n                 dropout=0.,\n                 kdim=None,\n                 vdim=None,\n                 need_weights=False,\n                 weight_attr=None,\n                 bias_attr=None,\n                 output_layer_weight_attr=None,\n                 fuse_attn_qkv=False,\n                 scale_qk_coeff=1.0,\n                 fused_linear=False,\n                 use_recompute=False,\n                 recompute_granularity=\"full\",\n                 do_recompute=True,\n                 use_flash_attn=False):\n        super(MultiHeadAttention, self).__init__()\n  
      self.embed_dim = embed_dim\n        self.kdim = kdim if kdim is not None else embed_dim\n        self.vdim = vdim if vdim is not None else embed_dim\n        self.num_heads = num_heads\n        self.dropout = dropout\n        self.need_weights = need_weights\n        self.fuse_attn_qkv = fuse_attn_qkv\n        self.scale_qk_coeff = scale_qk_coeff\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n        self.do_recompute = do_recompute\n        self.use_flash_attn = use_flash_attn if flash_attention else None\n\n        self.head_dim = embed_dim // num_heads\n        assert self.head_dim * \\\n            num_heads == self.embed_dim, \"embed_dim must be divisible by num_heads\"\n\n        Linear = FusedLinear if fused_linear else nn.Linear\n\n        if self.fuse_attn_qkv:\n            assert self.kdim == embed_dim\n            assert self.vdim == embed_dim\n            self.qkv_proj = Linear(\n                embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr)\n        else:\n            self.q_proj = Linear(\n                embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)\n            self.k_proj = Linear(\n                self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)\n            self.v_proj = Linear(\n                self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)\n\n        self.out_proj = Linear(\n            embed_dim,\n            embed_dim,\n            output_layer_weight_attr,\n            bias_attr=bias_attr)\n\n    def _fuse_prepare_qkv(self, query, use_cache=False, cache=None):\n        mix_layer = self.qkv_proj(query)\n        mix_layer = paddle.reshape_(mix_layer, [0, 0, -1, 3 * self.head_dim])\n        q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1)\n\n        assert not isinstance(\n            cache, self.StaticCache\n        ), \"cache currently does not support the StaticCache type\"\n\n        if isinstance(cache, self.Cache):\n         
   # for decoder self-attention in inference\n            k = tensor.concat([cache.k, k], axis=1)\n            v = tensor.concat([cache.v, v], axis=1)\n        if use_cache is True:\n            cache = self.Cache(k, v)\n\n        return (q, k, v, cache) if use_cache else (q, k, v, None)\n\n    def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):\n        r\"\"\"\n        Prapares linear projected queries, keys and values for usage of subsequnt\n        multiple parallel attention. If `cache` is not None, using cached results\n        to reduce redundant calculations.\n\n        \"\"\"\n        q = self.q_proj(query)\n        q = tensor.reshape(x=q, shape=[0, 0, -1, self.head_dim])\n\n        if isinstance(cache, self.StaticCache):\n            # for encoder-decoder attention in inference and has cached\n            k, v = cache.k, cache.v\n        else:\n            k, v = self.compute_kv(key, value)\n\n        if isinstance(cache, self.Cache):\n            # for decoder self-attention in inference\n            k = tensor.concat([cache.k, k], axis=1)\n            v = tensor.concat([cache.v, v], axis=1)\n        if use_cache is True:\n            cache = self.Cache(k, v)\n\n        return (q, k, v, cache) if use_cache else (q, k, v, None)\n\n    def compute_kv(self, key, value):\n        r\"\"\"\n        Applies linear projection on input keys and values, then splits heads\n        (reshape and transpose) to get keys and values from different representation\n        subspaces. 
The results are used as key-values pairs for subsequent multiple\n        parallel attention.\n\n        It is part of calculations in multi-head attention, and is provided as\n        a method to pre-compute and prefetch these results, thus we can use them\n        to construct cache for inference.\n\n        \"\"\"\n        k = self.k_proj(key)\n        v = self.v_proj(value)\n        k = tensor.reshape(x=k, shape=[0, 0, -1, self.head_dim])\n        v = tensor.reshape(x=v, shape=[0, 0, -1, self.head_dim])\n        return k, v\n\n    def gen_cache(self, key, value=None, type=Cache):\n        \"\"\"\n        Generates cache for `forward` usage in inference accroding to arguments.\n        The generated cache is an instance of `MultiHeadAttention.Cache` or an\n        instance of `MultiHeadAttention.StaticCache`.\n        \"\"\"\n        if type == MultiHeadAttention.StaticCache:  # static_kv\n            k, v = self.compute_kv(key, value)\n            return self.StaticCache(k, v)\n        elif value is None:  # incremental_state\n            k = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            v = layers.fill_constant_batch_size_like(\n                input=key,\n                shape=[-1, self.num_heads, 0, self.head_dim],\n                dtype=key.dtype,\n                value=0)\n            return self.Cache(k, v)\n        else:\n            # incremental_state with initial value, mainly for usage like UniLM\n            return self.Cache(key, value)\n\n    def _flash_attention(self, q, k, v, attn_mask=None):\n        out, weights = flash_attention(\n            q,\n            k,\n            v,\n            self.dropout,\n            causal=True,\n            return_softmax=self.need_weights)\n        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])\n        return out, weights\n\n    
def core_attn(self, q, k, v, attn_mask=None):\n        perm = [0, 2, 1, 3]\n        q = tensor.transpose(x=q, perm=perm)\n        k = tensor.transpose(x=k, perm=perm)\n        v = tensor.transpose(x=v, perm=perm)\n\n        # scale dot product attention\n        scale_qk_coeff = self.scale_qk_coeff * self.head_dim**0.5\n        product = paddle.matmul(\n            x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True)\n\n        if self.scale_qk_coeff != 1.0:\n            product = product.scale(self.scale_qk_coeff)\n\n        if attn_mask is not None:\n            product = product + attn_mask\n            weights = F.softmax(product)\n        else:\n            weights = incubate.softmax_mask_fuse_upper_triangle(product)\n\n        if self.dropout:\n            weights = F.dropout(\n                weights,\n                self.dropout,\n                training=self.training,\n                mode=\"upscale_in_train\")\n\n        out = paddle.matmul(weights, v)\n\n        # combine heads\n        out = tensor.transpose(out, perm=[0, 2, 1, 3])\n        out = tensor.reshape(x=out, shape=[0, 0, -1])\n\n        return out, weights\n\n    def forward(self,\n                query,\n                key,\n                value,\n                attn_mask=None,\n                use_cache=False,\n                cache=None):\n        r\"\"\"\n        Applies multi-head attention to map queries and a set of key-value pairs\n        to outputs.\n        \"\"\"\n        key = query if key is None else key\n        value = query if value is None else value\n        # compute q ,k ,v\n        if self.fuse_attn_qkv:\n            q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache)\n        else:\n            q, k, v, cache = self._prepare_qkv(query, key, value, use_cache,\n                                               cache)\n\n        if self.use_recompute and self.recompute_granularity == \"core_attn\" and self.do_recompute:\n            out, weights = 
recompute(self.core_attn, q, k, v, attn_mask)\n        elif self.use_flash_attn and attn_mask is None:\n            out, weights = self._flash_attention(q, k, v)\n        else:\n            out, weights = self.core_attn(q, k, v, attn_mask=attn_mask)\n\n        # project to output\n        out = self.out_proj(out)\n\n        outs = [out]\n        if self.need_weights:\n            outs.append(weights)\n        if use_cache:\n            outs.append(cache)\n        return out if len(outs) == 1 else tuple(outs)\n\n\nclass TransformerDecoder(nn.Layer):\n    \"\"\"\n    TransformerDecoder is a stack of N decoder layers.\n    \"\"\"\n\n    def __init__(self,\n                 decoder_layers,\n                 num_layers,\n                 norm=None,\n                 hidden_size=None,\n                 use_recompute=False,\n                 recompute_granularity=\"full\",\n                 no_recompute_layers=None):\n        super(TransformerDecoder, self).__init__()\n\n        if no_recompute_layers is None:\n            no_recompute_layers = []\n        self.no_recompute_layers = no_recompute_layers\n\n        self.num_layers = num_layers\n        self.layers = decoder_layers\n        self.norm = norm\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n        if norm == \"LayerNorm\":\n            self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5)\n        elif norm is not None:\n            raise ValueError(\"Only support LayerNorm\")\n\n    def forward(self,\n                tgt,\n                memory,\n                tgt_mask=None,\n                memory_mask=None,\n                use_cache=False,\n                cache=None):\n        r\"\"\"\n        Applies a stack of N Transformer decoder layers on inputs. 
If `norm` is\n        provided, also applies layer normalization on the output of last decoder\n        layer.\n        \"\"\"\n        output = tgt\n        new_caches = []\n\n        for i, mod in enumerate(self.layers):\n            if cache is None:\n                if use_cache:\n                    output, new_cache = mod(output,\n                                            memory,\n                                            tgt_mask=tgt_mask,\n                                            use_cache=use_cache,\n                                            cache=cache)\n                    new_caches.append(new_cache)\n                else:\n                    if self.use_recompute and self.recompute_granularity == \"full\" and i not in self.no_recompute_layers:\n                        output = recompute(mod, output, memory, tgt_mask,\n                                           use_cache, cache)\n                    else:\n                        output = mod(output, memory, tgt_mask, use_cache,\n                                     cache)\n            else:\n                output, new_cache = mod(output,\n                                        memory,\n                                        tgt_mask=tgt_mask,\n                                        use_cache=use_cache,\n                                        cache=cache[i])\n                new_caches.append(new_cache)\n\n        if self.norm is not None:\n            output = self.norm(output)\n        return output if use_cache is False else (output, new_caches)\n\n    def gen_cache(self, memory, do_zip=False):\n        r\"\"\"\n        Generates cache for `forward` usage. The generated cache is a list, and\n        each element in it is a tuple( :code:`(incremental_cache, static_cache)` )\n        produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache`\n        for more details. 
If `do_zip` is True, apply `zip` on these tuples to get\n        a list with two elements.\n       \"\"\"\n        cache = [layer.gen_cache(memory) for layer in self.layers]\n        if do_zip:\n            cache = list(zip(*cache))\n        return cache\n\n\nclass TransformerDecoderLayer(nn.Layer):\n    \"\"\"\n    The transformer decoder layer.\n\n    It contains multiheadattention and some linear layers.\n    \"\"\"\n\n    def __init__(self,\n                 d_model,\n                 nhead,\n                 dim_feedforward,\n                 num_experts=1,\n                 dropout=0.1,\n                 activation=\"gelu\",\n                 attn_dropout=None,\n                 act_dropout=None,\n                 normalize_before=True,\n                 topk=1,\n                 moe_use_residual=False,\n                 moe_train_capacity_factor=1.0,\n                 moe_eval_capacity_factor=1.0,\n                 moe_min_capacity=4,\n                 moe_token_dropping=True,\n                 enable_expert_tensor_parallelism=False,\n                 weight_attr=None,\n                 bias_attr=None,\n                 output_layer_weight_attr=None,\n                 fused_linear=False,\n                 fuse_attn_qkv=False,\n                 scale_qk_coeff=1.0,\n                 use_recompute=False,\n                 recompute_granularity=\"full\",\n                 do_recompute=True,\n                 skip_quant_tensors=[],\n                 use_flash_attn=False):\n        self._config = locals()\n        self._config.pop(\"self\")\n        self._config.pop(\"__class__\", None)  # py3\n\n        super(TransformerDecoderLayer, self).__init__()\n        attn_dropout = dropout if attn_dropout is None else attn_dropout\n        act_dropout = dropout if act_dropout is None else act_dropout\n        self.normalize_before = normalize_before\n        self.use_recompute = use_recompute\n        self.recompute_granularity = recompute_granularity\n        
self.do_recompute = do_recompute\n\n        self.num_experts = num_experts\n\n        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)\n        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)\n        output_layer_weight_attrs = _convert_param_attr_to_list(\n            output_layer_weight_attr, 3)\n\n        Linear = FusedLinear if fused_linear else nn.Linear\n\n        self.self_attn = MultiHeadAttention(\n            d_model,\n            nhead,\n            dropout=attn_dropout,\n            weight_attr=weight_attrs[0],\n            bias_attr=bias_attrs[0],\n            output_layer_weight_attr=output_layer_weight_attrs[0],\n            fused_linear=fused_linear,\n            fuse_attn_qkv=fuse_attn_qkv,\n            scale_qk_coeff=scale_qk_coeff,\n            use_recompute=use_recompute,\n            recompute_granularity=recompute_granularity,\n            do_recompute=do_recompute,\n            use_flash_attn=use_flash_attn)\n\n        self.moe_mlp = None\n        if self.num_experts > 1:\n            assert (topk == 1, \"Only support topk=1 currently.\")\n            self.moe_mlp = MoE(\n                d_model,\n                ExpertLayer(d_model, dim_feedforward),\n                self.num_experts,\n                ep_size=1,\n                k=topk,\n                use_residual=moe_use_residual,\n                capacity_factor=moe_train_capacity_factor,\n                eval_capacity_factor=moe_eval_capacity_factor,\n                min_capacity=moe_min_capacity,\n                drop_tokens=moe_token_dropping,\n                enable_expert_tensor_parallelism=enable_expert_tensor_parallelism\n            )\n        else:\n            self.linear1 = Linear(\n                d_model,\n                dim_feedforward,\n                weight_attrs[2],\n                bias_attr=bias_attrs[2])\n            self.linear2 = Linear(\n                dim_feedforward,\n                d_model,\n                output_layer_weight_attrs[2],\n   
             bias_attr=bias_attrs[2])\n\n            if 'linear1' in skip_quant_tensors:\n                self.linear1.skip_quant = True\n\n            if 'linear2' in skip_quant_tensors:\n                self.linear2.skip_quant = True\n\n        self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)\n        self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)\n        self.dropout1 = nn.Dropout(dropout, mode=\"upscale_in_train\")\n        self.dropout2 = nn.Dropout(act_dropout, mode=\"upscale_in_train\")\n        if activation == 'gelu':\n            self.activation = nn.GELU(approximate=True)\n        else:\n            self.activation = getattr(F, activation)\n\n    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):\n        residual = tgt\n\n        if self.normalize_before:\n            tgt = self.norm1(tgt)\n\n        if use_cache is False:\n            if self.use_recompute and self.recompute_granularity == \"full_attn\" and self.do_recompute:\n                tgt = recompute(self.self_attn, tgt, None, None, tgt_mask,\n                                use_cache, cache)\n            else:\n                tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)\n        else:\n            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,\n                                                    use_cache, cache)\n        tgt = residual + self.dropout1(tgt)\n        if not self.normalize_before:\n            tgt = self.norm1(tgt)\n        residual = tgt\n        if self.normalize_before:\n            tgt = self.norm2(tgt)\n\n        # if self.expert_mode:\n        #     tgt = self.moe_mlp(tgt)\n        if self.num_experts > 1:\n            tgt = self.moe_mlp(tgt)\n        else:\n            tgt = self.dropout2(\n                self.linear2(self.activation(self.linear1(tgt))))\n\n        tgt = residual + tgt\n\n        if not self.normalize_before:\n            tgt = self.norm2(tgt)\n\n        return tgt if use_cache is False 
else (tgt, incremental_cache)\n\n    def gen_cache(self, memory):\n        incremental_cache = self.self_attn.gen_cache(\n            memory, type=self.self_attn.Cache)\n        return incremental_cache\n\n\nclass GPTEmbeddings(nn.Layer):\n    \"\"\"\n    Include embeddings from word and position embeddings.\n    \"\"\"\n\n    def __init__(self,\n                 vocab_size,\n                 hidden_size=768,\n                 hidden_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=16,\n                 initializer_range=0.02,\n                 freeze_embedding=False):\n        super(GPTEmbeddings, self).__init__()\n        self.word_embeddings = nn.Embedding(\n            vocab_size,\n            hidden_size,\n            weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(\n                mean=0.0, std=initializer_range)))\n\n        self.position_embeddings = nn.Embedding(\n            max_position_embeddings,\n            hidden_size,\n            weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(\n                mean=0.0, std=initializer_range)))\n\n        if freeze_embedding:\n            self.word_embeddings.weight.learning_rate = 0.0\n            self.position_embeddings.weight.learning_rate = 0.0\n\n        self.dropout = nn.Dropout(hidden_dropout_prob)\n\n    def forward(self, input_ids, position_ids=None):\n        if position_ids is None:\n            ones = paddle.ones_like(input_ids, dtype=\"int64\")\n            seq_length = paddle.cumsum(ones, axis=-1)\n            position_ids = seq_length - ones\n\n        input_embedings = self.word_embeddings(input_ids)\n        position_embeddings = self.position_embeddings(position_ids)\n        embeddings = input_embedings + position_embeddings\n        embeddings = self.dropout(embeddings)\n        return embeddings\n\n\nclass GPTModel(nn.Layer):\n    def __init__(self,\n                 vocab_size=51200,\n                 
hidden_size=768,\n                 num_layers=12,\n                 num_attention_heads=12,\n                 ffn_hidden_size=3072,\n                 hidden_dropout_prob=0.1,\n                 attention_probs_dropout_prob=0.1,\n                 max_position_embeddings=512,\n                 type_vocab_size=16,\n                 use_recompute=False,\n                 initializer_range=0.02,\n                 num_experts=[1],\n                 expert_interval=2,\n                 topk=1,\n                 moe_use_residual=False,\n                 moe_train_capacity_factor=1.0,\n                 moe_eval_capacity_factor=1.0,\n                 moe_min_capacity=4,\n                 moe_token_dropping=True,\n                 enable_expert_tensor_parallelism=False,\n                 fused_linear=False,\n                 fuse_attn_qkv=False,\n                 scale_qk_by_layer_num=True,\n                 recompute_granularity=\"full\",\n                 sequence_parallel=False,\n                 no_recompute_layers=None,\n                 skip_tensor_map={},\n                 freeze_embedding=False,\n                 use_flash_attn=False,\n                 fused_softmax_with_triangular=False):\n\n        super(GPTModel, self).__init__()\n\n        if no_recompute_layers is None:\n            no_recompute_layers = []\n        self.initializer_range = initializer_range\n        self.hidden_size = hidden_size\n        self.vocab_size = vocab_size\n        self.fused_softmax_with_triangular = fused_softmax_with_triangular\n\n        if use_flash_attn:\n            if flash_attention:\n                logger.info(\"Flash-attention enabled.\")\n            else:\n                use_flash_attn = False\n                logger.warning(\n                    \"Flash-attention is not support in this Paddle version.\")\n\n        self.embeddings = GPTEmbeddings(\n            vocab_size, hidden_size, hidden_dropout_prob,\n            max_position_embeddings, type_vocab_size, 
self.initializer_range,\n            freeze_embedding)\n\n        assert len(num_experts) == 1 or len(num_experts) == num_layers // expert_interval, \\\n            'num_experts must be either a single value or a list of the same length as the number of MoE layers'\n\n        # Expand the list of MoE experts num to MoE layers num\n        if len(num_experts) == 1:\n            num_experts = num_experts * (num_layers // expert_interval)\n\n        decoder_layers = nn.LayerList()\n        for i in range(num_layers):\n            # TODO: original layer_num = i + 1 + offset here\n            layer_num = i + 1\n            if layer_num % expert_interval == 0:\n                n_e = num_experts[(layer_num - 1) // expert_interval]\n            else:\n                n_e = 1\n            decoder_layers.append(\n                TransformerDecoderLayer(\n                    d_model=hidden_size,\n                    nhead=num_attention_heads,\n                    dim_feedforward=ffn_hidden_size,\n                    num_experts=n_e,\n                    dropout=hidden_dropout_prob,\n                    activation=\"gelu\",\n                    attn_dropout=attention_probs_dropout_prob,\n                    act_dropout=hidden_dropout_prob,\n                    topk=topk,\n                    moe_use_residual=moe_use_residual,\n                    moe_train_capacity_factor=moe_train_capacity_factor,\n                    moe_eval_capacity_factor=moe_eval_capacity_factor,\n                    moe_min_capacity=moe_min_capacity,\n                    moe_token_dropping=moe_token_dropping,\n                    enable_expert_tensor_parallelism=enable_expert_tensor_parallelism,\n                    weight_attr=paddle.ParamAttr(\n                        initializer=nn.initializer.Normal(\n                            mean=0.0, std=self.initializer_range)),\n                    output_layer_weight_attr=paddle.ParamAttr(\n                        initializer=nn.initializer.Normal(\n         
                   mean=0.0,\n                            std=self.initializer_range / math.sqrt(\n                                2.0 * num_layers))),\n                    bias_attr=None,\n                    fused_linear=fused_linear,\n                    fuse_attn_qkv=fuse_attn_qkv,\n                    scale_qk_coeff=num_layers\n                    if scale_qk_by_layer_num else 1.0,\n                    use_recompute=use_recompute,\n                    recompute_granularity=recompute_granularity,\n                    do_recompute=i not in no_recompute_layers,\n                    skip_quant_tensors=skip_tensor_map.get('block_{}'.format(\n                        i), []),\n                    use_flash_attn=use_flash_attn))\n\n        self.decoder = TransformerDecoder(\n            decoder_layers,\n            num_layers,\n            norm=\"LayerNorm\",\n            hidden_size=hidden_size,\n            use_recompute=use_recompute,\n            recompute_granularity=recompute_granularity,\n            no_recompute_layers=no_recompute_layers)\n\n    def forward(self,\n                input_ids,\n                position_ids=None,\n                attention_mask=None,\n                use_cache=False,\n                cache=None):\n\n        if position_ids is None:\n            past_length = 0\n            if cache is not None:\n                past_length = paddle.shape(attention_mask)[-1] - 1\n            position_ids = paddle.arange(\n                past_length,\n                paddle.shape(input_ids)[-1] + past_length,\n                dtype=input_ids.dtype)\n            position_ids = position_ids.unsqueeze(0)\n            # .expand_as(input_ids)\n            position_ids = paddle.expand_as(position_ids, input_ids)\n\n        embedding_output = self.embeddings(\n            input_ids=input_ids, position_ids=position_ids)\n\n        # fused_softmax_with_triangular is only suppported on GPU/DCU.\n        # If on non-GPU devices, we use user defined mask and 
non-fused softmax.\n        if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda(\n        ):\n            # TODO, use registered buffer\n            causal_mask = paddle.tensor.triu(\n                paddle.ones(\n                    (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1]))\n                * -1e4,\n                diagonal=1)\n            if attention_mask is not None:\n                if len(attention_mask.shape) == 2:\n                    attention_mask = attention_mask[:, None, None, :]\n                attention_mask = attention_mask + causal_mask\n            else:\n                attention_mask = causal_mask\n            # The tensor returned by triu not in static graph.\n            attention_mask.stop_gradient = True\n\n        encoder_outputs = self.decoder(\n            embedding_output,\n            memory=None,\n            tgt_mask=None if (self.fused_softmax_with_triangular and\n                              self.training and paddle.is_compiled_with_cuda())\n            else attention_mask,  # use softmax_mask_fuse_upper_triangle\n            use_cache=use_cache,\n            cache=cache)\n\n        return encoder_outputs\n\n\nclass GPTForPretraining(nn.Layer):\n    \"\"\"\n    GPT Model with pretraining tasks on top.\n\n    Args:\n        gpt (:class:`GPTModel`):\n            An instance of :class:`GPTModel`.\n\n    \"\"\"\n\n    def __init__(self, gpt):\n        super(GPTForPretraining, self).__init__()\n        self.gpt = gpt\n\n    def forward(self,\n                input_ids,\n                position_ids=None,\n                attention_mask=None,\n                masked_positions=None,\n                use_cache=False,\n                cache=None):\n\n        outputs = self.gpt(input_ids,\n                           position_ids=position_ids,\n                           attention_mask=attention_mask,\n                           use_cache=use_cache,\n                           cache=cache)\n        
if use_cache:\n            encoder_outputs, cached_kvs = outputs[:2]\n        else:\n            encoder_outputs = outputs\n        logits = paddle.matmul(\n            encoder_outputs,\n            get_attr(self.gpt.embeddings.word_embeddings, \"weight\"),\n            transpose_y=True)\n\n        if use_cache:\n            return logits, cached_kvs\n        else:\n            return logits\n\n\nclass GPTPretrainingCriterion(nn.Layer):\n    \"\"\"\n    Criterion for GPT. It calculates the final loss.\n    \"\"\"\n\n    def __init__(self, topo=None):\n        super(GPTPretrainingCriterion, self).__init__()\n        self.loss_func = paddle.nn.CrossEntropyLoss(reduction=\"none\")\n\n    def forward(self, prediction_scores, masked_lm_labels, loss_mask):\n        \"\"\"\n        Args:\n            prediction_scores(Tensor):\n                The logits of masked token prediction. Its data type should be float32 and\n                its shape is [batch_size, sequence_length, vocab_size].\n            masked_lm_labels(Tensor):\n                The labels of the masked language modeling, the dimensionality of `masked_lm_labels`\n                is equal to `prediction_scores`. Its data type should be int64 and\n                its shape is [batch_size, sequence_length, 1].\n            loss_mask(Tensor):\n                Mask used for calculating the loss of the masked language modeling to avoid\n                calculating some unwanted tokens.\n                Its data type should be float32 and its shape is [batch_size, sequence_length, 1].\n\n        Returns:\n            Tensor: The pretraining loss. 
Its data type should be float32 and its shape is [1].\n\n        \"\"\"\n        masked_lm_loss = self.loss_func(prediction_scores,\n                                        masked_lm_labels.unsqueeze(2))\n\n        loss_mask = loss_mask.reshape([-1])\n        masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask)\n        loss = masked_lm_loss / loss_mask.sum()\n        return loss\n\n\nclass GPTForSequenceClassification(nn.Layer):\n    \"\"\"\n    GPT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.\n    for GLUE tasks.\n    Args:\n        gpt (:class:`GPTModel`):\n            An instance of GPTModel.\n        num_classes (int, optional):\n            The number of classes. Defaults to `2`.\n\n    \"\"\"\n\n    def __init__(self, gpt, num_classes=2):\n        super(GPTForSequenceClassification, self).__init__()\n        self.gpt = gpt\n        self.score = nn.Linear(\n            self.gpt.hidden_size, num_classes, bias_attr=False)\n\n        from paddle.nn.initializer import Normal\n        normal_ = Normal(std=self.gpt.initializer_range)\n        normal_(self.score.weight)\n\n    def forward(self, input_ids, position_ids=None, attention_mask=None):\n\n        output = self.gpt(input_ids,\n                          position_ids=position_ids,\n                          attention_mask=attention_mask)\n\n        logits = self.score(output)\n        # padding index maybe 0\n        eos_token_id = 0\n        # sequence_lengths shape [bs,]\n        sequence_lengths = (input_ids != eos_token_id).astype(\"int64\").sum(\n            axis=-1) - 1\n\n        pooled_logits = logits.gather_nd(\n            paddle.stack(\n                [paddle.arange(output.shape[0]), sequence_lengths], axis=-1))\n\n        return pooled_logits\n\n\nclass GPTForGeneration(nn.Layer):\n    \"\"\"\n    GPT Model with pretraining tasks on top.\n\n    Args:\n        gpt (:class:`GPTModel`):\n            An instance of 
:class:`GPTModel`.\n\n    \"\"\"\n\n    def __init__(self, gpt, configs):\n        super(GPTForGeneration, self).__init__()\n        self.gpt = gpt\n        self.configs = configs\n\n        self.max_length = self.configs.get('max_dec_len', 20)\n        self.min_length = self.configs.get('min_dec_len', 0)\n        self.decode_strategy = self.configs.get('decode_strategy', 'sampling')\n        self.temperature = self.configs.get('temperature', 1.0)\n        self.top_k = self.configs.get('top_k', 0)\n        self.top_p = self.configs.get('top_p', 1.0)\n        self.use_topp_sampling = self.configs.get('use_topp_sampling', False)\n        self.inference = self.configs.get('inference', False)\n        self.repetition_penalty = self.configs.get('repetition_penalty', 1.0)\n        self.num_beams = self.configs.get('num_beams', 1)\n        self.num_beam_groups = self.configs.get('num_beam_groups', 1)\n        self.length_penalty = self.configs.get('length_penalty', 0.0)\n        self.early_stopping = self.configs.get('early_stopping', False)\n        self.bos_token_id = self.configs.get('bos_token_id', None)\n        self.eos_token_id = self.configs.get('eos_token_id', None)\n        self.pad_token_id = self.configs.get('pad_token_id', None)\n        self.decoder_start_token_id = self.configs.get(\n            'decoder_start_token_id', None)\n        self.forced_bos_token_id = self.configs.get('forced_bos_token_id',\n                                                    None)\n        self.forced_eos_token_id = self.configs.get('forced_eos_token_id',\n                                                    None)\n        self.num_return_sequences = self.configs.get('num_return_sequences', 1)\n        self.diversity_rate = self.configs.get('diversity_rate', 0.0)\n        self.use_cache = self.configs.get('use_cache', True)\n\n    def prepare_input_ids_for_generation(self,\n                                         bos_token_id,\n                                         
encoder_output=None):\n        batch_size = 1\n        if bos_token_id is None:\n            raise ValueError(\"`bos_token_id` should be defined when no \"\n                             \"`input_ids` are provided.\")\n        if encoder_output is not None:\n            batch_size = encoder_output.shape[0]\n        return paddle.ones([batch_size, 1], dtype=\"int64\") * bos_token_id\n\n    def prepare_attention_mask_for_generation(self, input_ids, pad_token_id,\n                                              eos_token_id):\n        is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(\n            input_ids == pad_token_id).numpy().item()\n        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (\n            (eos_token_id is not None) and (pad_token_id != eos_token_id))\n        if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id:\n            attention_mask = (input_ids == pad_token_id\n                              ).astype(paddle.get_default_dtype()) * -1e9\n        else:\n            attention_mask = paddle.zeros_like(\n                input_ids, dtype=paddle.get_default_dtype())\n        return paddle.unsqueeze(attention_mask, axis=[1, 2])\n\n    def update_scores_for_generation(self, scores, next_scores, length,\n                                     unfinished_flag):\n        # update scores\n\n        unfinished_scores = (scores * length + next_scores) / (length + 1)\n        scores = paddle.where(unfinished_flag, unfinished_scores, scores)\n        return scores\n\n    def get_logits_processor(self,\n                             min_length=None,\n                             max_length=None,\n                             eos_token_id=None,\n                             forced_bos_token_id=None,\n                             forced_eos_token_id=None,\n                             num_beams=1,\n                             num_beam_groups=1,\n                             diversity_rate=0.0,\n             
                repetition_penalty=None):\n        processors = LogitsProcessorList()\n\n        if min_length is not None and eos_token_id is not None and min_length > -1:\n            processors.append(\n                MinLengthLogitsProcessor(min_length, eos_token_id))\n        if num_beam_groups > 1 and diversity_rate > 0.0:\n            processors.append(\n                HammingDiversityLogitsProcessor(\n                    diversity_rate=diversity_rate,\n                    num_beams=num_beams,\n                    num_beam_groups=num_beam_groups))\n        if repetition_penalty is not None and repetition_penalty != 1.0:\n            processors.append(\n                RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))\n        if forced_bos_token_id is not None:\n            processors.append(\n                ForcedBOSTokenLogitsProcessor(forced_bos_token_id))\n        if forced_eos_token_id is not None:\n            processors.append(\n                ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id))\n        # TODO\n        # Add more pre_processing for distribution\n\n        return processors\n\n    def expand_inputs_for_generation(self,\n                                     input_ids,\n                                     expand_size,\n                                     attention_mask=None,\n                                     **model_kwargs):\n\n        index = paddle.tile(\n            paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1),\n            [1, expand_size]).reshape([-1])\n\n        input_ids = paddle.gather(input_ids, index)\n\n        if attention_mask is not None:\n            model_kwargs[\"attention_mask\"] = paddle.gather(attention_mask,\n                                                           index)\n\n        if \"token_type_ids\" in model_kwargs and model_kwargs[\n                \"token_type_ids\"] is not None:\n            token_type_ids = model_kwargs[\"token_type_ids\"]\n            
model_kwargs[\"token_type_ids\"] = paddle.gather(token_type_ids,\n                                                           index)\n\n        if \"position_ids\" in model_kwargs and model_kwargs[\n                \"position_ids\"] is not None:\n            position_ids = model_kwargs[\"position_ids\"]\n            model_kwargs[\"position_ids\"] = paddle.gather(position_ids, index)\n\n        if \"seq_len\" in model_kwargs and model_kwargs[\"seq_len\"] is not None:\n            seq_len = model_kwargs[\"seq_len\"]\n            model_kwargs[\"seq_len\"] = paddle.gather(seq_len, index)\n\n        if \"encoder_output\" in model_kwargs and model_kwargs[\n                \"encoder_output\"] is not None:\n            encoder_output = model_kwargs[\"encoder_output\"]\n            model_kwargs[\"encoder_output\"] = paddle.gather(encoder_output,\n                                                           index)\n\n        if \"role_ids\" in model_kwargs and model_kwargs[\"role_ids\"] is not None:\n            role_ids = model_kwargs[\"role_ids\"]\n            model_kwargs[\"role_ids\"] = paddle.gather(role_ids, index)\n\n        return input_ids, model_kwargs\n\n    def prepare_inputs_for_generation(self,\n                                      input_ids,\n                                      use_cache=False,\n                                      cache=None,\n                                      **kwargs):\n        # only last token for inputs_ids if cache is defined in kwargs\n        position_ids = kwargs.get(\"position_ids\", None)\n        attention_mask = kwargs.get(\"attention_mask\", None)\n        if attention_mask is not None:\n            if len(attention_mask.shape) == 4:\n                attention_mask = attention_mask[:, -1, -1, :]\n            if \"int\" in paddle.common_ops_import.convert_dtype(\n                    attention_mask.dtype):\n                attention_mask = (1.0 - attention_mask) * -1e4\n        return {\n            \"input_ids\": 
input_ids,\n            \"position_ids\": position_ids,\n            \"attention_mask\": attention_mask,\n            \"cache\": cache\n        }\n\n    def update_model_kwargs_for_generation(self,\n                                           next_tokens,\n                                           outputs,\n                                           model_kwargs,\n                                           is_encoder_decoder=False):\n        # Update the model inputs during generation.\n        # Note that If `token_type_ids` and `attention_mask` in `model_kwargs`\n        # and they contain pad value, the result vectors updated by this method\n        # may be different from expected. In this case, you need to rewrite the\n        # method.\n\n        # update cache\n        if isinstance(outputs, tuple):\n            model_kwargs[\"cache\"] = outputs[1]\n\n        # update token_type_ids with last value\n        if \"token_type_ids\" in model_kwargs and model_kwargs[\n                \"token_type_ids\"] is not None:\n            token_type_ids = model_kwargs[\"token_type_ids\"]\n            model_kwargs[\"token_type_ids\"] = paddle.concat(\n                [token_type_ids, token_type_ids[:, -1:]], axis=-1)\n\n        # update position_ids\n        if \"position_ids\" in model_kwargs and model_kwargs[\n                \"position_ids\"] is not None:\n            position_ids = model_kwargs[\"position_ids\"]\n            model_kwargs[\"position_ids\"] = position_ids[:, -1:] + 1\n\n        # update attention_mask\n        if not is_encoder_decoder and \"attention_mask\" in model_kwargs:\n            attention_mask = model_kwargs[\"attention_mask\"]\n            # nn.Pad2D don't support the data type `bool`\n            if convert_dtype(attention_mask.dtype) == 'bool':\n                attention_mask = paddle.cast(attention_mask, 'int64')\n            if len(attention_mask.shape) == 4:\n                attention_mask = nn.Pad2D(\n                    [0, 0, 0, 1], 
mode='replicate')(attention_mask)\n                attention_mask = nn.Pad2D(\n                    [0, 1, 0, 0], value=-1e4)(attention_mask)\n                dtype = convert_dtype(attention_mask.dtype)\n                if 'int' in dtype:\n                    attention_mask[:, :, -1, -1] = 1\n                elif 'float' in dtype:\n                    attention_mask[:, :, -1, -1] = 0.0\n                else:\n                    raise ValueError(\n                        'The data type of input `attention_mask` must '\n                        'be bool, int or float')\n            else:\n                attention_mask = paddle.concat(\n                    [\n                        attention_mask, paddle.ones(\n                            [attention_mask.shape[0], 1], dtype=\"int64\")\n                    ],\n                    axis=-1)\n            model_kwargs[\"attention_mask\"] = attention_mask\n\n        # update role_ids\n        if \"role_ids\" in model_kwargs and model_kwargs[\"role_ids\"] is not None:\n            role_ids = model_kwargs[\"role_ids\"]\n            model_kwargs[\"role_ids\"] = paddle.concat(\n                [role_ids, role_ids[:, -1:]], axis=-1)\n\n        model_kwargs['res'] = paddle.concat(\n            [model_kwargs['res'], next_tokens], axis=1)\n\n        return model_kwargs\n\n    def sample(self,\n               input_ids,\n               logits_processors,\n               max_length,\n               pad_token_id,\n               eos_token_id,\n               top_k=None,\n               top_p=None,\n               temperature=None,\n               min_tokens_to_keep=1,\n               **model_kwargs):\n        def TopKProcess(probs, top_k, min_tokens_to_keep):\n            top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1])\n            # Remove all tokens with a probability less than the last token of the top-k\n            topk_probs, _ = paddle.topk(probs, k=top_k)\n            probs = paddle.where(probs >= topk_probs[:, 
-1:], probs,\n                                 paddle.full_like(probs, 0.0))\n            return probs\n\n        def TopPProcess(probs, top_p, min_tokens_to_keep):\n            sorted_probs = paddle.sort(probs, descending=True)\n            sorted_indices = paddle.argsort(probs, descending=True)\n            cumulative_probs = paddle.cumsum(sorted_probs, axis=-1)\n\n            # Remove tokens with cumulative probs above the top_p, But keep at\n            # least min_tokens_to_keep tokens\n            sorted_indices_to_remove = cumulative_probs > top_p\n            if min_tokens_to_keep > 1:\n                # Set 'min_tokens_to_keep - 1' because the first token is kept\n                sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0\n            # Keep the first token\n            sorted_indices_to_remove = paddle.cast(\n                sorted_indices_to_remove, dtype='int64')\n            sorted_indices_to_remove[:, 1:] = (\n                sorted_indices_to_remove[:, :-1].clone())\n            sorted_indices_to_remove[:, 0] = 0\n\n            # Scatter sorted tensors to original indexing\n            sorted_indices = sorted_indices + paddle.arange(probs.shape[\n                0]).unsqueeze(-1) * probs.shape[-1]\n            condition = paddle.scatter(sorted_indices_to_remove.flatten(),\n                                       sorted_indices.flatten(),\n                                       sorted_indices_to_remove.flatten())\n            condition = paddle.cast(condition, 'bool').reshape(probs.shape)\n            probs = paddle.where(condition,\n                                 paddle.full_like(probs, 0.0), probs)\n            return probs\n\n        batch_size, cur_len = input_ids.shape\n        # used for compute on gpu, avoid memcpy D2H\n        cur_len_gpu = paddle.full([1], cur_len, dtype='int64')\n\n        origin_len = input_ids.shape[1]\n        # used for compute on gpu, avoid memcpy D2H\n        origin_len_gpu = paddle.full([1], origin_len, 
dtype='int64')\n\n        unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool')\n        scores = paddle.full(\n            [batch_size, 1], 0.0, dtype=paddle.get_default_dtype())\n\n        res = paddle.assign(input_ids)\n        model_kwargs['res'] = res\n\n        # use_cache is immutable, we split it off other mutable kwargs.\n        assert 'use_cache' in model_kwargs\n        immutable = {'use_cache': model_kwargs['use_cache']}\n        del model_kwargs['use_cache']\n\n        def _forward_(**args):\n            model_inputs = self.prepare_inputs_for_generation(\n                input_ids, **args, **immutable)\n            return self.gpt(**model_inputs, **immutable)\n\n        def _post_process_(outputs, input_ids, cur_len, origin_len, scores,\n                           unfinished_flag, model_kwargs):\n\n            logits = outputs[0] if isinstance(outputs, tuple) else outputs\n\n            logits = paddle.matmul(\n                logits,\n                self.gpt.embeddings.word_embeddings.weight,\n                transpose_y=True)\n\n            # [batch_size, vocab_size]\n            logits = logits[:, -1, :]\n\n            # pre-process distribution\n            logits = logits_processors(input_ids, logits)\n\n            # sample\n            origin_probs = F.softmax(logits)\n            if temperature is None or temperature == 1.0:\n                probs = paddle.assign(origin_probs)\n                origin_probs = paddle.log(origin_probs)\n            else:\n                origin_probs = paddle.log(origin_probs)\n                logits = logits / temperature\n                probs = F.softmax(logits)\n            if top_k is not None and top_k != 0:\n                probs = TopKProcess(probs, top_k, min_tokens_to_keep)\n            if top_p is not None and top_p < 1.0:\n                if self.use_topp_sampling:\n                    try:\n                        from ppfleetx_ops import topp_sampling\n                    except 
ImportError:\n                        raise ImportError(\n                            \"please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!\"\n                        )\n                    top_ps_tensor = paddle.full(\n                        shape=[paddle.shape(probs)[0]],\n                        fill_value=top_p,\n                        dtype=probs.dtype)\n                    _, next_tokens = topp_sampling(\n                        probs, top_ps_tensor, random_seed=100)\n                else:\n                    probs = TopPProcess(probs, top_p, min_tokens_to_keep)\n\n            if not self.use_topp_sampling:\n                next_tokens = paddle.multinomial(probs)\n\n            next_scores = paddle.index_sample(origin_probs, next_tokens)\n\n            if eos_token_id is not None:\n                next_tokens = paddle.where(\n                    unfinished_flag, next_tokens,\n                    paddle.full_like(next_tokens, pad_token_id))\n\n            scores = self.update_scores_for_generation(\n                scores, next_scores, cur_len - origin_len, unfinished_flag)\n\n            input_ids = next_tokens\n\n            if eos_token_id is not None:\n                unfinished_flag = paddle.logical_and(\n                    unfinished_flag, next_tokens != eos_token_id)\n\n            model_kwargs = self.update_model_kwargs_for_generation(\n                next_tokens,\n                outputs,\n                model_kwargs,\n                is_encoder_decoder=self.is_encoder_decoder)\n\n            return input_ids, scores, unfinished_flag, model_kwargs\n\n        # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement\n        # the value in model_kwargs should be tensor before while loop\n        outputs = _forward_(**model_kwargs)\n\n        input_ids, scores, unfinished_flag, model_kwargs = _post_process_(\n            outputs, input_ids, cur_len_gpu, origin_len_gpu, scores,\n            
unfinished_flag, model_kwargs)\n        if not self.inference:\n            cur_len += 1\n        else:\n            # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static\n            paddle.increment(cur_len)\n        paddle.increment(cur_len_gpu)\n\n        attn_mask = model_kwargs['attention_mask']\n        # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static.\n        model_kwargs['attention_mask'] = paddle.reshape(\n            attn_mask, paddle.shape(attn_mask))\n        model_kwargs['cache'] = outputs[1] if isinstance(outputs,\n                                                         tuple) else None\n        while cur_len < max_length:\n            # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs)\n            # and change it to pass directly to _post_process_ to avoid\n            # closed-loop problem of dynamic-to-static model\n            input_ids, scores, unfinished_flag, model_kwargs = _post_process_(\n                _forward_(**model_kwargs), input_ids, cur_len_gpu,\n                origin_len_gpu, scores, unfinished_flag, model_kwargs)\n            if not self.inference:\n                cur_len += 1\n            else:\n                # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static\n                paddle.increment(cur_len)\n            paddle.increment(cur_len_gpu)\n\n            if not paddle.any(unfinished_flag):\n                break\n\n        return model_kwargs['res'][:, origin_len:], scores\n\n    def forward(self, input_ids=None, **model_kwargs):\n\n        max_length = self.max_length\n        min_length = self.min_length\n        decode_strategy = self.decode_strategy\n        temperature = self.temperature\n        top_k = self.top_k\n        top_p = self.top_p\n        repetition_penalty = self.repetition_penalty\n        num_beams = self.num_beams\n        num_beam_groups = self.num_beam_groups\n        length_penalty = self.length_penalty\n        
early_stopping = self.early_stopping\n        bos_token_id = self.bos_token_id\n        eos_token_id = self.eos_token_id\n        pad_token_id = self.pad_token_id\n        decoder_start_token_id = self.decoder_start_token_id\n        forced_bos_token_id = self.forced_bos_token_id\n        forced_eos_token_id = self.forced_eos_token_id\n        num_return_sequences = self.num_return_sequences\n        diversity_rate = self.diversity_rate\n        use_cache = self.use_cache\n\n        assert (\n            decode_strategy in [\"greedy_search\", \"sampling\", \"beam_search\"]\n        ), \"`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.\".format(\n            decode_strategy)\n\n        bos_token_id = bos_token_id if bos_token_id is not None else getattr(\n            self.gpt, 'bos_token_id', None)\n        eos_token_id = eos_token_id if eos_token_id is not None else getattr(\n            self.gpt, 'eos_token_id', None)\n        pad_token_id = pad_token_id if pad_token_id is not None else getattr(\n            self.gpt, 'pad_token_id', None)\n        forced_bos_token_id = forced_bos_token_id if forced_bos_token_id is not None else getattr(\n            self.gpt, 'forced_bos_token_id', None)\n        forced_eos_token_id = forced_eos_token_id if forced_eos_token_id is not None else getattr(\n            self.gpt, 'forced_eos_token_id', None)\n        decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else getattr(\n            self.gpt, 'decoder_start_token_id', None)\n\n        # params check\n        if input_ids is None:\n            # Init `input_ids` with bos_token_id\n            input_ids = self.prepare_input_ids_for_generation(bos_token_id)\n\n        if model_kwargs.get(\"attention_mask\", None) is None:\n            # TODO\n            # Init `attention_mask` depending on `pad_token_id`\n            model_kwargs[\n                \"attention_mask\"] = 
self.prepare_attention_mask_for_generation(\n                    input_ids, pad_token_id, eos_token_id)\n\n        if model_kwargs.get(\"position_ids\", None) is None:\n            model_kwargs['position_ids'] = paddle.arange(\n                0,\n                paddle.shape(model_kwargs['attention_mask'])[-1],\n                dtype=input_ids.dtype).unsqueeze(0)\n\n        self.is_encoder_decoder = False\n\n        model_kwargs[\"use_cache\"] = use_cache\n\n        if self.inference:\n            # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static\n            min_len = input_ids.shape[-1]\n            max_len = input_ids.shape[-1]\n            paddle.increment(min_len, min_length)\n            paddle.increment(max_len, max_length)\n        else:\n            input_len = input_ids.shape[-1]\n            max_len = max_length + input_len\n            min_len = min_length + input_len\n\n        logits_processors = self.get_logits_processor(\n            min_length=min_len,\n            max_length=max_len,\n            eos_token_id=eos_token_id,\n            forced_bos_token_id=forced_bos_token_id,\n            forced_eos_token_id=forced_eos_token_id,\n            num_beams=num_beams,\n            num_beam_groups=num_beam_groups,\n            diversity_rate=diversity_rate,\n            repetition_penalty=repetition_penalty)\n\n        if decode_strategy == 'sampling':\n            if num_return_sequences > 1:\n                input_ids, model_kwargs = self.expand_inputs_for_generation(\n                    input_ids,\n                    expand_size=num_return_sequences,\n                    **model_kwargs)\n\n            ret = self.sample(input_ids, logits_processors, max_len,\n                              pad_token_id, eos_token_id, top_k, top_p,\n                              temperature, **model_kwargs)\n        else:\n            raise ValueError(f'Not support {decode_strategy} strategy yet!')\n        return ret\n"
  },
  {
    "path": "ppfleetx/models/language_model/language_module.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport logging\nimport os\nimport sys\nimport copy\nimport math\nimport numpy as np\nimport types\n\nimport paddle\nfrom paddle.static import InputSpec\nimport paddle.distributed.fleet as fleet\n\nfrom ppfleetx.core.module.basic_module import BasicModule\nimport ppfleetx.models.language_model.gpt as gpt\nfrom ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import register_sequence_parallel_allreduce_hooks\nfrom ppfleetx.distributed.apis import env\nfrom ppfleetx.utils.log import logger\nfrom .utils import process_configs\nfrom ppfleetx.data.tokenizers import GPTTokenizer\nfrom .metrics import *\n\n# TODO(haohongxiang): to solve the problem of cross-reference\nimport paddlenlp\nfrom paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer\n\nMODEL_CLASSES = {\n    \"GPT\": (GPTTokenizer, \"gpt2\"),\n    \"MoE\": (GPTTokenizer, \"gpt2\"),\n    \"GPT-cn\": (GPTChineseTokenizer, \"gpt-cpm-large-cn\"),\n}\n\n\ndef get_model_size(l, h, v, s):\n    P = 0\n    # embedding\n    P += (v + s) * h\n    # attention\n    P += (4 * h * h + 4 * h) * l\n    # layer_norm of decoder\n    P += (2 * (2 * h)) * l\n    # FFN Layer\n    P += (8 * h * h + 5 * h) * l\n    # layer_norm of transformer\n    P += 2 * h\n    logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0))\n\n\ndef 
vocab_size_with_padding(vocab_size, div_unit, mp_degree):\n    padded_size = vocab_size\n    multiple = div_unit * mp_degree\n    while (padded_size % multiple) != 0:\n        padded_size += 1\n    logging.warning(' > padded vocab (size: {}) with {} dummy tokens '\n                    '(new size: {})'.format(vocab_size, padded_size -\n                                            vocab_size, padded_size))\n    return padded_size\n\n\nclass LanguageModule(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        self.data_world_size = env.get_data_world_size()\n        super(LanguageModule, self).__init__(configs)\n\n        self.loss_fn = self.get_loss_fn()\n\n    def process_configs(self, configs):\n        configs = process_configs(configs)\n        return configs\n\n    def forward(self, tokens, ids):\n        return self.model(tokens, ids)\n\n    def training_step(self, batch):\n        tokens, position_ids, labels, loss_mask = batch\n\n        loss_mask.stop_gradient = True\n        labels.stop_gradient = True\n        position_ids.stop_gradient = True\n\n        preds = self(tokens, position_ids)\n        loss = self.loss_fn(preds, labels, loss_mask)\n\n        return loss\n\n    def training_step_end(self, log_dict):\n        speed = 1. 
/ log_dict['train_cost']\n        default_global_tokens_num = self.configs.Global.global_batch_size * \\\n            self.configs.Data.Train.dataset.max_seq_len\n\n        loss_scale_str = \"loss_scale: %.9f,\" % (\n            log_dict['loss_scale']) if log_dict.get('loss_scale',\n                                                    None) is not None else \"\"\n        logger.info(\n            \"[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, \" \\\n            \"ips_total: %.0f tokens/s, ips: %.0f tokens/s, %s learning rate: %.5e, found_inf: %.0f\"\n            % (log_dict['epoch'], log_dict['total_epoch'], log_dict['batch'], log_dict['total_step'], log_dict['loss'],\n               log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size, \\\n               loss_scale_str, log_dict['lr'], log_dict['found_inf']))\n\n    def validation_step(self, batch):\n        tokens, position_ids, labels, loss_mask = batch\n        preds = self(tokens, position_ids)\n        preds = paddle.cast(preds, dtype=\"float32\")\n        loss = self.loss_fn(preds, labels, loss_mask)\n        return loss\n\n    def validation_step_end(self, log_dict):\n        speed = 1. / log_dict['eval_cost']\n        logger.info(\n            \"[eval] epoch: %d, batch: %d/%d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'],\n               log_dict['loss'], log_dict['eval_cost'], speed))\n\n    def test_step(self, batch):\n        tokens, position_ids, labels, loss_mask = batch\n        preds = self(tokens, position_ids)\n        preds = paddle.cast(preds, dtype=\"float32\")\n        loss = self.loss_fn(preds, labels, loss_mask)\n        return loss\n\n    def test_step_end(self, log_dict):\n        speed = 1. 
/ log_dict['test_cost']\n        logger.info(\n            \"[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'],\n               log_dict['test_cost'], speed))\n\n    def training_epoch_end(self, log_dict):\n        logger.info(\"[Training] epoch: %d, total time: %.5f sec\" %\n                    (log_dict['epoch'], log_dict['train_cost']))\n\n\nclass GPTModule(LanguageModule):\n    def __init__(self, configs):\n        super(GPTModule, self).__init__(configs)\n        if configs.Model.sequence_parallel:\n            register_sequence_parallel_allreduce_hooks(\n                self, configs.Engine.accumulate_steps,\n                configs.Distributed.fuse_sequence_parallel_allreduce)\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        if 'Compress' in self.configs and 'Quantization' in self.configs.Compress:\n            quant_setting = copy.deepcopy(self.configs.Compress.Quantization)\n            skip_tensor_map = quant_setting.get('skip_tensor_map', {})\n            freeze_embedding = quant_setting.get('freeze_embedding', False)\n            model_setting['skip_tensor_map'] = skip_tensor_map\n            model_setting['freeze_embedding'] = freeze_embedding\n        model_setting.pop(\"module\")\n\n        model_name = model_setting.pop(\"name\")\n        tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]\n        self.tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n        model_setting['vocab_size'] = vocab_size_with_padding(\n            model_setting.get('vocab_size', self.tokenizer.vocab_size),\n            model_setting.pop('vocab_size_divisible_unit', 128),\n            self.configs.Distributed.get('mp_degree', 1))\n\n        l = model_setting['num_layers']\n        h = model_setting['hidden_size']\n        v = model_setting['vocab_size']\n        s = 
self.configs.Data.Train.dataset.max_seq_len\n        get_model_size(l, h, v, s)\n\n        if self.nranks == 1:\n            model_setting.pop(\"sequence_parallel\")\n            model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting))\n        else:\n            model_setting[\n                'num_partitions'] = self.configs.Distributed.mp_degree\n            if self.configs.Distributed.pp_degree == 1:\n                model_setting.pop(\"virtual_pp_degree\", None)\n                model = gpt.GPTForPretrainingHybrid(\n                    gpt.GPTModelHybrid(**model_setting))\n            else:\n                model = gpt.GPTForPretrainingPipe(**model_setting)\n\n        return model\n\n    def get_loss_fn(self):\n        if self.nranks == 1:\n            loss_fn = gpt.GPTPretrainingCriterion()\n        else:\n            loss_fn = gpt.GPTPretrainingCriterionHybird(\n                sequence_parallel=self.configs.Model.sequence_parallel)\n        return loss_fn\n\n    def pretreating_batch(self, batch):\n        if self.configs.Distributed.pp_degree > 1:\n            tokens, position_ids, labels, loss_mask = batch\n            data = [(tokens, position_ids), (labels, loss_mask)]\n            return data\n        else:\n            return batch\n\n    def input_spec(self):\n        return [\n            InputSpec(\n                shape=[None, None], name=\"tokens\", dtype='int64'), InputSpec(\n                    shape=[None, None], name=\"ids\", dtype='int64')\n        ]\n\n    def inference_end(self, outputs):\n        for k, v in outputs.items():\n            for i in range(v.shape[0]):\n                out_ids = [int(x) for x in v[i]]\n                ret_str = self.tokenizer.decode(out_ids)\n                # ret_str = text[i] + ret_str\n                print(ret_str)\n\n\nclass GPTFinetuneModule(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        self.data_world_size = 
env.get_data_world_size()\n        super(GPTFinetuneModule, self).__init__(configs)\n\n        # self.loss_config will be init in super class by get_model()\n        assert self.loss_config is not None\n        assert 'train' in self.loss_config\n        assert 'eval' in self.loss_config\n\n        train_loss = copy.deepcopy(self.loss_config.train)\n        train_loss_cls = train_loss.pop('name')\n        self.loss_fn = eval(f'paddle.nn.loss.{train_loss_cls}')(**train_loss)\n\n        eval_loss = copy.deepcopy(self.loss_config.eval)\n        eval_loss_cls = eval_loss.pop('name')\n        self.eval_loss_fn = eval(f'paddle.nn.loss.{eval_loss_cls}')(\n            **eval_loss)\n\n        # self.metric_config will be init in super class by get_model()\n        assert self.metric_config is not None\n        assert 'eval' in self.metric_config\n\n        if 'train' in self.metric_config:\n            train_metric = copy.deepcopy(self.metric_config.train)\n            train_metric_cls = train_metric.pop('name')\n            self.train_metric = eval(f'{train_metric_cls}')(**train_metric)\n\n        eval_metric = copy.deepcopy(self.metric_config.eval)\n        eval_metric_cls = eval_metric.pop('name')\n        self.eval_metric = eval(f'{eval_metric_cls}')(**eval_metric)\n\n        self.best_metric = 0.0\n\n    def process_configs(self, configs):\n        return configs\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        model_setting.pop(\"module\")\n\n        self.metric_config = model_setting.pop(\"metric\", None)\n        self.loss_config = model_setting.pop(\"loss\", None)\n\n        pretrained = model_setting.pop(\"pretrained\")\n        num_classes = model_setting.pop(\"num_classes\", 2)\n        assert pretrained is not None\n\n        model_name = model_setting.pop(\"name\")\n        tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]\n        self.tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n    
    model_setting['vocab_size'] = vocab_size_with_padding(\n            model_setting.get('vocab_size', self.tokenizer.vocab_size),\n            model_setting.pop('vocab_size_divisible_unit', 128),\n            self.configs.Distributed.get('mp_degree', 1))\n\n        l = model_setting['num_layers']\n        h = model_setting['hidden_size']\n        v = model_setting['vocab_size']\n        num_heads = model_setting['num_attention_heads']\n        s = self.configs.Data.Train.dataset.max_length\n        get_model_size(l, h, v, s)\n\n        if self.nranks == 1:\n            model = gpt.GPTForSequenceClassification(\n                gpt.GPTModel(**model_setting), num_classes)\n        else:\n            raise NotImplementedError\n\n        pretrained_path = pretrained + \".pdparams\"\n        assert os.path.exists(\n            pretrained_path), f'{pretrained_path} is not exists!'\n        model_dict = paddle.load(pretrained_path)\n\n        # Note(GuoxiaWang): Guess whether to convert fused vs non-fused parameters.\n        # 'q_proj' vs 'qkv_proj'\n        def is_fused(model_state):\n            for key in model_state:\n                if 'qkv_proj' in key:\n                    return True\n            return False\n\n        def split_params(model_state, num_layers):\n            for idx in range(num_layers):\n                qkv_b = model_state.pop(\n                    f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias')\n                qkv_w = model_state.pop(\n                    f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight')\n\n                qkv_b = qkv_b.reshape((num_heads, 3, -1))\n                qkv_w = qkv_w.reshape((h, num_heads, 3, -1))\n\n                q_w, k_w, v_w = np.split(qkv_w, 3, axis=2)\n                q_w = q_w.reshape((h, -1))\n                k_w = k_w.reshape((h, -1))\n                v_w = v_w.reshape((h, -1))\n\n                q_b, k_b, v_b = np.split(qkv_b, 3, axis=1)\n                q_b = q_b.reshape((-1))\n               
 k_b = k_b.reshape((-1))\n                v_b = v_b.reshape((-1))\n\n                model_state[\n                    f'gpt.decoder.layers.{idx}.self_attn.q_proj.bias'] = q_b\n                model_state[\n                    f'gpt.decoder.layers.{idx}.self_attn.q_proj.weight'] = q_w\n\n                model_state[\n                    f'gpt.decoder.layers.{idx}.self_attn.k_proj.bias'] = k_b\n                model_state[\n                    f'gpt.decoder.layers.{idx}.self_attn.k_proj.weight'] = k_w\n\n                model_state[\n                    f'gpt.decoder.layers.{idx}.self_attn.v_proj.bias'] = v_b\n                model_state[\n                    f'gpt.decoder.layers.{idx}.self_attn.v_proj.weight'] = v_w\n\n            return model_state\n\n        def fuse_params(model_state, num_layers):\n            for idx in range(num_layers):\n                q_b = model_state.pop(\n                    f'gpt.decoder.layers.{idx}.self_attn.q_proj.bias')\n                q_w = model_state.pop(\n                    f'gpt.decoder.layers.{idx}.self_attn.q_proj.weight')\n\n                k_b = model_state.pop(\n                    f'gpt.decoder.layers.{idx}.self_attn.k_proj.bias')\n                k_w = model_state.pop(\n                    f'gpt.decoder.layers.{idx}.self_attn.k_proj.weight')\n\n                v_b = model_state.pop(\n                    f'gpt.decoder.layers.{idx}.self_attn.v_proj.bias')\n                v_w = model_state.pop(\n                    f'gpt.decoder.layers.{idx}.self_attn.v_proj.weight')\n\n                q_w = q_w.reshape((h, num_heads, -1))\n                k_w = k_w.reshape((h, num_heads, -1))\n                v_w = v_w.reshape((h, num_heads, -1))\n\n                qkv_w = np.stack([q_w, k_w, v_w], axis=2)\n                qkv_w = qkv_w.reshape((h, -1))\n\n                q_b = q_b.reshape((num_heads, -1))\n                k_b = k_b.reshape((num_heads, -1))\n                v_b = v_b.reshape((num_heads, -1))\n                qkv_b = 
np.stack([q_b, k_b, v_b], axis=1)\n                qkv_b = qkv_b.reshape((-1))\n\n                model_state[\n                    f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight'] = qkv_w\n                model_state[\n                    f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias'] = qkv_b\n            return model_state\n\n        fused = is_fused(model.state_dict())\n        load_fused = is_fused(model_dict)\n\n        if fused is True and load_fused is False:\n            model_dict = fuse_params(model_dict, l)\n        elif fused is False and load_fused is True:\n            model_dict = split_params(model_dict, l)\n\n        for name, param in model.state_dict().items():\n            if name in model_dict and param.dtype != model_dict[name].dtype:\n                model_dict[name] = model_dict[name].cast(param.dtype)\n\n        model.set_state_dict(model_dict)\n        logger.info(f'Load pretrained weight from {pretrained_path}')\n\n        return model\n\n    def forward(self, tokens):\n        return self.model(tokens)\n\n    def training_step(self, batch):\n        input_ids, labels = batch\n\n        input_ids.stop_gradient = True\n        labels.stop_gradient = True\n\n        logits = self(input_ids)\n        loss = self.loss_fn(logits, labels)\n\n        return loss\n\n    def training_step_end(self, log_dict):\n        speed = 1. 
/ log_dict['train_cost']\n        default_global_tokens_num = self.configs.Global.global_batch_size * \\\n            self.configs.Data.Train.dataset.max_length\n\n        logger.info(\n            \"[train] epoch: [%d/%d], step: [%d/%d], learning rate: %.7f, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, \" \\\n            \"ips_total: %.0f tokens/s, ips: %.0f tokens/s\"\n            % (log_dict['epoch'], log_dict['total_epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['lr'], log_dict['loss'], log_dict['train_cost'], speed,\n               speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size))\n\n    def validation_step(self, batch):\n        input_ids, labels = batch\n\n        input_ids.stop_gradient = True\n        labels.stop_gradient = True\n\n        logits = self(input_ids)\n        loss = self.eval_loss_fn(logits, labels)\n        correct = self.eval_metric.compute(logits, labels)\n        self.eval_metric.update(correct)\n        return loss\n\n    def validation_step_end(self, log_dict):\n        speed = 1. / log_dict['eval_cost']\n        logger.info(\n            \"[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'],\n               log_dict['eval_cost'], speed))\n\n    def test_step(self, batch):\n        tokens, position_ids, labels, loss_mask = batch\n        preds = self(tokens, position_ids)\n        preds = paddle.cast(preds, dtype=\"float32\")\n        loss = self.eval_loss_fn(preds, labels, loss_mask)\n        return loss\n\n    def test_step_end(self, log_dict):\n        speed = 1. 
/ log_dict['test_cost']\n        logger.info(\n            \"[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'],\n               log_dict['test_cost'], speed))\n\n    def training_epoch_end(self, log_dict):\n        logger.info(\"[Training] epoch: %d, total time: %.5f sec\" %\n                    (log_dict['epoch'], log_dict['train_cost']))\n\n    def validation_epoch_end(self, log_dict):\n        res = self.eval_metric.accumulate()\n        self.eval_metric.reset()\n        if isinstance(self.eval_metric, AccuracyAndF1):\n            msg = \"acc: %.5f, precision: %.5f, recall: %.5f, f1: %.5f, acc and f1: %.5f\" % (\n                res[0], res[1], res[2], res[3], res[4])\n            metric = res[4]\n        elif isinstance(self.eval_metric, Mcc):\n            msg = \"mcc: %.5f\" % (res[0])\n            metric = res[0]\n        elif isinstance(self.eval_metric, PearsonAndSpearman):\n            msg = \"pearson: %.5f, spearman: %.5f, pearson and spearman: %.5f\" % (\n                res[0], res[1], res[2])\n            metric = res[2]\n        else:\n            msg = \"acc: %.5f\" % (res)\n            metric = res\n\n        if metric > self.best_metric:\n            self.best_metric = metric\n\n        logger.info(\n            \"[Eval] epoch: %d, total time: %.5f sec, %s, best_metric: %.5f\" %\n            (log_dict['epoch'], log_dict['eval_cost'], msg, self.best_metric))\n\n\nclass GPTGenerationModule(BasicModule):\n    def __init__(self, configs):\n        self.configs = configs\n        self.generation_cfgs = configs.Generation\n        self.nranks = paddle.distributed.get_world_size()\n\n        super().__init__(configs)\n\n    def process_configs(self, configs):\n        configs = process_configs(configs)\n        return configs\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        if 'Compress' in self.configs 
and 'Quantization' in self.configs.Compress:\n            quant_setting = copy.deepcopy(self.configs.Compress.Quantization)\n            skip_tensor_map = quant_setting.get('skip_tensor_map', {})\n            freeze_embedding = quant_setting.get('freeze_embedding', False)\n            model_setting['skip_tensor_map'] = skip_tensor_map\n            model_setting['freeze_embedding'] = freeze_embedding\n        model_setting.pop(\"module\")\n\n        model_name = model_setting.pop(\"name\")\n        tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]\n        self.tokenizer = tokenizer_class.from_pretrained(pretrained_name)\n\n        model_setting['vocab_size'] = vocab_size_with_padding(\n            model_setting.get('vocab_size', self.tokenizer.vocab_size),\n            model_setting.pop('vocab_size_divisible_unit', 128),\n            self.configs.Distributed.get('mp_degree', 1))\n\n        if self.nranks == 1:\n            model = gpt.GPTForGeneration(\n                gpt.GPTModel(**model_setting), self.generation_cfgs)\n        else:\n            assert self.nranks == self.configs.Distributed.dp_degree, \\\n                \"only support single card and data parallel in generation task.\"\n            model = gpt.GPTForGenerationHybrid(\n                gpt.GPTModelHybrid(**model_setting), self.generation_cfgs)\n\n        self.generation_cfgs['max_dec_len'] = self.adjust_length_to_model(\n            self.generation_cfgs['max_dec_len'], 512)\n\n        self.generation_cfgs['bos_token_id'] = self.tokenizer.eos_token_id\n        self.generation_cfgs['eos_token_id'] = self.tokenizer.eos_token_id\n        self.generation_cfgs['pad_token_id'] = self.tokenizer.eos_token_id\n\n        return model\n\n    def adjust_length_to_model(self, length, max_sequence_length):\n        if length < 0 or length > max_sequence_length:\n            length = max_sequence_length\n        return length\n\n    def left_padding(self, inputs, pad_id, padding=\"longest\"):\n       
 assert \"input_ids\" in inputs, \"input_ids should be in inputs!\"\n        max_length = 0\n        for ids in inputs[\"input_ids\"]:\n            max_length = max(max_length, len(ids))\n\n        def extend_max_lenth(value, max_length, to_pad_id):\n            return [to_pad_id] * (max_length - len(value)) + value\n\n        def extend_filed(name, max_length, to_pad_id):\n            values = inputs[name]\n            res = []\n            for index, value in enumerate(values):\n                res.append(extend_max_lenth(value, max_length, to_pad_id))\n            inputs[name] = res\n\n        extend_filed(\"input_ids\", max_length, pad_id)\n        if \"attention_mask\" in inputs:\n            extend_filed(\"attention_mask\", max_length, 0)\n        if \"position_ids\" in inputs:\n            extend_filed(\"position_ids\", max_length, 0)\n\n        return inputs\n\n    def generate(self, input_text):\n        return self(input_text)\n\n    def forward(self, input_text):\n        input_ids = self.tokenizer.encode(input_text)\n        inputs = {'input_ids': [input_ids]}\n\n        inputs = self.left_padding(inputs, self.tokenizer.eos_token_id)\n        input_ids = inputs['input_ids']\n\n        if len(input_ids) == 0:\n            input_ids = None\n        else:\n            # [1, seq_len]\n            input_ids = paddle.to_tensor(input_ids, dtype='int64')\n\n        ids, scores = self.model(input_ids=input_ids)\n\n        generated_sequences = []\n        for i, generated_ids in enumerate(ids):\n            generated_ids = generated_ids.numpy().tolist()\n            # Decode text\n            text = self.tokenizer.convert_ids_to_string(generated_ids)\n            sequence = input_text + text\n            generated_sequences.append(sequence)\n\n        return generated_sequences\n\n    def input_spec(self):\n        return [InputSpec(shape=[None, None], name=\"input_ids\", dtype='int64')]\n\n\nclass GPTEvalModule(LanguageModule):\n    def __init__(self, 
configs):\n        self.eval_cfgs = configs.Offline_Eval\n\n        super().__init__(configs)\n\n        self.post_process_configs()\n\n        self.first_step = True\n        self.total_score = 0\n        self.score_name = \"loss\" if not self.eval_cfgs.cloze_eval else \"number correct\"\n\n    def post_process_configs(self):\n        self.configs.pop(\"Optimizer\", None)\n        self.configs.pop(\"Inference\", None)\n\n        self.configs.Data.pop(\"Train\", None)\n        self.configs.Data.pop(\"Test\", None)\n        self.configs.Data.Eval.pop(\"sampler\", None)\n        self.configs.Data.Eval.loader.collate_fn = \"gpt_collate_fn\"\n        self.configs.Data.Eval.loader.batch_size = self.eval_cfgs.batch_size\n        self.configs.Data.Eval.dataset.input_dir = self.eval_cfgs.eval_path\n        self.configs.Data.Eval.dataset.max_seq_len = self.eval_cfgs.max_seq_len\n\n        self.configs.Engine.logging_freq = self.eval_cfgs.logging_freq\n\n        if not self.eval_cfgs.cloze_eval:\n            self.configs.Data.Eval.dataset.name = \"LM_Eval_Dataset\"\n            self.configs.Data.Eval.dataset.overlapping_eval = self.eval_cfgs.overlapping_eval\n        else:\n            self.configs.Data.Eval.dataset.name = \"Lambada_Eval_Dataset\"\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        if 'Compress' in self.configs and 'Quantization' in self.configs.Compress:\n            quant_setting = copy.deepcopy(self.configs.Compress.Quantization)\n            skip_tensor_map = quant_setting.get('skip_tensor_map', {})\n            freeze_embedding = quant_setting.get('freeze_embedding', False)\n            model_setting['skip_tensor_map'] = skip_tensor_map\n            model_setting['freeze_embedding'] = freeze_embedding\n        model_setting.pop(\"module\")\n\n        model_name = model_setting.pop(\"name\")\n        tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]\n        self.tokenizer = 
tokenizer_class.from_pretrained(pretrained_name)\n\n        model_setting['vocab_size'] = vocab_size_with_padding(\n            model_setting.get('vocab_size', self.tokenizer.vocab_size),\n            model_setting.pop('vocab_size_divisible_unit', 128),\n            self.configs.Distributed.get('mp_degree', 1))\n\n        if self.nranks == 1:\n            model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting))\n        else:\n            raise RuntimeError(\n                \"Only single-card offline eval is supported in GPTModel now.\")\n\n        return model\n\n    def forward(self, tokens, ids, mask):\n        return self.model(tokens, ids, mask)\n\n    def validation_step(self, batch):\n        tokens, loss_mask, attention_mask, position_ids, labels, info = batch\n\n        preds = self(tokens, position_ids, attention_mask)\n\n        if not self.eval_cfgs.cloze_eval:\n            if self.first_step:\n                self.num_original_tokens = info.numpy()[0][0]\n                self.num_tokenized_tokens = info.numpy()[0][1]\n\n            masked_lm_loss = paddle.nn.functional.cross_entropy(\n                preds, labels, reduction=\"none\")\n            loss = paddle.sum(masked_lm_loss * loss_mask)\n            return loss\n        else:\n            if self.first_step:\n                self.num_examples = info.numpy()[0][0]\n\n            outputs = paddle.argmax(preds, -1)\n            acc = paddle.cast(outputs == labels, 'float32')\n            acc = paddle.where(\n                paddle.cast(loss_mask, 'bool'), acc, paddle.ones_like(acc))\n            acc = paddle.sum(paddle.prod(acc, -1))\n            return acc\n\n        self.first_step = False\n\n    def validation_step_end(self, log_dict):\n        speed = 1. 
/ log_dict['eval_cost']\n\n        if not self.eval_cfgs.cloze_eval:\n            self.total_score += log_dict[\n                'loss'] * self.configs.Engine.logging_freq / (\n                    self.num_tokenized_tokens - 1)\n        else:\n            self.total_score += log_dict[\n                'loss'] * self.configs.Engine.logging_freq\n\n        logger.info(\"[eval] epoch: %d, batch: %d, %s: %.9f, speed: %.2f step/s\"\n                    % (log_dict['epoch'], log_dict['batch'], self.score_name,\n                       self.total_score, speed))\n\n    def validation_epoch_end(self, log_dict):\n        if not self.eval_cfgs.cloze_eval:\n            total_loss = float(self.total_score)\n            ppl = math.exp(min(20, total_loss))\n            token_ratio = (self.num_tokenized_tokens - 1) / (\n                self.num_original_tokens - 1)\n            adjusted_ppl = math.exp(min(20, total_loss * token_ratio))\n            string = ' validation results on {} | '.format(\n                self.eval_cfgs.eval_path)\n            string += 'avg loss: {:.4E} | '.format(total_loss)\n            string += 'ppl: {:.4E} | '.format(ppl)\n            string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)\n            string += 'token ratio: {} |'.format(token_ratio)\n        else:\n            num_correct = float(self.total_score)\n            acc = float(num_correct / self.num_examples)\n            string = ' validation results on {} | '.format(\n                self.eval_cfgs.eval_path)\n            string += 'number correct: {:.4E} | '.format(num_correct)\n            string += 'total examples: {:.4E} | '.format(self.num_examples)\n            string += 'avg accuracy: {:.4E}'.format(acc)\n\n        logger.info(string)\n\n    def input_spec(self):\n        return [\n            InputSpec(\n                shape=[None, None], name=\"tokens\", dtype='int64'), InputSpec(\n                    shape=[None, None], name=\"ids\", dtype='int64')\n        ]\n\n\nclass 
MoEModule(LanguageModule):\n    def __init__(self, configs):\n        super(MoEModule, self).__init__(configs)\n\n        assert self.nranks == configs.Distributed.dp_degree, \\\n            \"only support single card or data parallel in MoE model.\"\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        model_setting.pop(\"module\")\n        model_setting.pop(\"name\")\n\n        l = model_setting['num_layers']\n        h = model_setting['hidden_size']\n        v = model_setting['vocab_size']\n        s = self.configs.Data.Train.dataset.max_seq_len\n        get_model_size(l, h, v, s)\n\n        if self.nranks == 1:\n            model_setting.pop(\"sequence_parallel\")\n            model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting))\n        else:\n            model_setting[\n                'num_partitions'] = self.configs.Distributed.mp_degree\n            if self.configs.Distributed.pp_degree == 1:\n                model_setting.pop(\"virtual_pp_degree\", None)\n                model = gpt.GPTForPretrainingHybrid(\n                    gpt.GPTModelHybrid(**model_setting))\n            else:\n                model = gpt.GPTForPretrainingPipe(**model_setting)\n\n        return model\n\n    def get_loss_fn(self):\n        if self.nranks == 1:\n            loss_fn = gpt.GPTPretrainingCriterion()\n        else:\n            loss_fn = gpt.GPTPretrainingCriterionHybird()\n        return loss_fn\n\n    def training_step(self, batch):\n        tokens, position_ids, labels, loss_mask = batch\n\n        loss_mask.stop_gradient = True\n        labels.stop_gradient = True\n        position_ids.stop_gradient = True\n\n        preds = self(tokens, position_ids)\n        loss = self.loss_fn(preds, labels, loss_mask)\n\n        with paddle.amp.auto_cast(enable=False):\n            if self.configs.Model.gate != \"naive\" and \\\n                self.configs.Model.balance_loss_weight:\n\n                gpt_layer = 
self.model._layers.gpt if isinstance(\n                    self.model, paddle.DataParallel) else self.model.gpt\n\n                aux_loss_list = [\n                    l.moe_mlp.gate.get_loss(clear=False)\n                    for l in gpt_layer.decoder.layers\n                    if hasattr(l.moe_mlp, \"gate\")\n                ]\n                bal_loss = paddle.concat(aux_loss_list)\n                if bal_loss.dtype == paddle.float16:\n                    bal_loss = paddle.cast(bal_loss, dtype=paddle.float32)\n                bal_loss = bal_loss.mean()\n                loss += bal_loss * self.configs.Engine.balance_loss_weight\n\n        return loss\n\n    def initialize_mp_dp_parameters(self):\n        hcg = env.get_hcg()\n        mp_group = hcg.get_model_parallel_group()\n        mp_src_rank = hcg.get_model_parallel_group_src_rank()\n\n        dp_group = hcg.get_data_parallel_group()\n        dp_src_rank = hcg.get_data_parallel_group_src_rank()\n\n        for param in self.model.parameters():\n            if \"expert_\" in param.name:\n                setattr(param, \"no_sync\", True)\n                continue\n\n            if not param.is_distributed:\n                paddle.distributed.broadcast(\n                    param.detach(),\n                    src=mp_src_rank,\n                    group=mp_group,\n                    use_calc_stream=True)\n\n            paddle.distributed.broadcast(\n                param.detach(),\n                src=dp_src_rank,\n                group=dp_group,\n                use_calc_stream=True)\n"
  },
  {
    "path": "ppfleetx/models/language_model/metrics.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport math\nimport warnings\nfrom functools import partial\n\nimport numpy as np\nimport paddle\nfrom paddle.metric import Metric, Accuracy, Precision, Recall\n\n__all__ = [\n    'Accuracy', 'AccuracyAndF1', 'Mcc', 'PearsonAndSpearman',\n    'MultiLabelsMetric'\n]\n\n\nclass AccuracyAndF1(Metric):\n    \"\"\"\n    This class encapsulates Accuracy, Precision, Recall and F1 metric logic,\n    and `accumulate` function returns accuracy, precision, recall and f1.\n    The overview of all metrics could be seen at the document of `paddle.metric\n    <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/metric/Overview_cn.html>`_\n    for details.\n\n    Args:\n        topk (int or tuple(int), optional):\n            Number of top elements to look at for computing accuracy.\n            Defaults to (1,).\n        pos_label (int, optional): The positive label for calculating precision\n            and recall.\n            Defaults to 1.\n        name (str, optional):\n            String name of the metric instance. Defaults to 'acc_and_f1'.\n\n    Example:\n\n        .. 
code-block::\n\n            import paddle\n            from paddlenlp.metrics import AccuracyAndF1\n\n            x = paddle.to_tensor([[0.1, 0.9], [0.5, 0.5], [0.6, 0.4], [0.7, 0.3]])\n            y = paddle.to_tensor([[1], [0], [1], [1]])\n\n            m = AccuracyAndF1()\n            correct = m.compute(x, y)\n            m.update(correct)\n            res = m.accumulate()\n            print(res) # (0.5, 0.5, 0.3333333333333333, 0.4, 0.45)\n\n    \"\"\"\n\n    def __init__(self,\n                 topk=(1, ),\n                 pos_label=1,\n                 name='acc_and_f1',\n                 *args,\n                 **kwargs):\n        super(AccuracyAndF1, self).__init__(*args, **kwargs)\n        self.topk = topk\n        self.pos_label = pos_label\n        self._name = name\n        self.acc = Accuracy(self.topk, *args, **kwargs)\n        self.precision = Precision(*args, **kwargs)\n        self.recall = Recall(*args, **kwargs)\n        self.reset()\n\n    def compute(self, pred, label, *args):\n        \"\"\"\n        Accepts network's output and the labels, and calculates the top-k\n        (maximum value in topk) indices for accuracy.\n\n        Args:\n            pred (Tensor):\n                Predicted tensor, and its dtype is float32 or float64, and\n                has a shape of [batch_size, num_classes].\n            label (Tensor):\n                The ground truth tensor, and its dtype is is int64, and has a\n                shape of [batch_size, 1] or [batch_size, num_classes] in one\n                hot representation.\n\n        Returns:\n            Tensor: Correct mask, each element indicates whether the prediction\n            equals to the label. 
Its' a tensor with a data type of float32 and\n            has a shape of [batch_size, topk].\n\n        \"\"\"\n        self.label = label\n        self.preds_pos = paddle.nn.functional.softmax(pred)[:, self.pos_label]\n        return self.acc.compute(pred, label)\n\n    def update(self, correct, *args):\n        \"\"\"\n        Updates the metrics states (accuracy, precision and recall), in order to\n        calculate accumulated accuracy, precision and recall of all instances.\n\n        Args:\n            correct (Tensor):\n                Correct mask for calculating accuracy, and it's a tensor with\n                shape [batch_size, topk] and has a dtype of\n                float32.\n\n        \"\"\"\n        self.acc.update(correct)\n        self.precision.update(self.preds_pos, self.label)\n        self.recall.update(self.preds_pos, self.label)\n\n    def accumulate(self):\n        \"\"\"\n        Calculates and returns the accumulated metric.\n\n        Returns:\n            tuple: The accumulated metric. 
A tuple of shape (acc, precision,\n            recall, f1, average_of_acc_and_f1)\n\n            With the fields:\n\n            - `acc` (numpy.float64):\n                The accumulated accuracy.\n            - `precision` (numpy.float64):\n                The accumulated precision.\n            - `recall` (numpy.float64):\n                The accumulated recall.\n            - `f1` (numpy.float64):\n                The accumulated f1.\n            - `average_of_acc_and_f1` (numpy.float64):\n                The average of accumulated accuracy and f1.\n\n        \"\"\"\n        acc = self.acc.accumulate()\n        precision = self.precision.accumulate()\n        recall = self.recall.accumulate()\n        if precision == 0.0 or recall == 0.0:\n            f1 = 0.0\n        else:\n            # 1/f1 = 1/2 * (1/precision + 1/recall)\n            f1 = (2 * precision * recall) / (precision + recall)\n        return (\n            acc,\n            precision,\n            recall,\n            f1,\n            (acc + f1) / 2, )\n\n    def reset(self):\n        \"\"\"\n        Resets all metric states.\n        \"\"\"\n        self.acc.reset()\n        self.precision.reset()\n        self.recall.reset()\n        self.label = None\n        self.preds_pos = None\n\n    def name(self):\n        \"\"\"\n        Returns name of the metric instance.\n\n        Returns:\n           str: The name of the metric instance.\n\n        \"\"\"\n        return self._name\n\n\nclass Mcc(Metric):\n    \"\"\"\n    This class calculates `Matthews correlation coefficient <https://en.wikipedia.org/wiki/Matthews_correlation_coefficient>`_ .\n\n    Args:\n        name (str, optional):\n            String name of the metric instance. Defaults to 'mcc'.\n\n    Example:\n\n        .. 
code-block::\n\n            import paddle\n            from paddlenlp.metrics import Mcc\n\n            x = paddle.to_tensor([[-0.1, 0.12], [-0.23, 0.23], [-0.32, 0.21], [-0.13, 0.23]])\n            y = paddle.to_tensor([[1], [0], [1], [1]])\n\n            m = Mcc()\n            (preds, label) = m.compute(x, y)\n            m.update((preds, label))\n            res = m.accumulate()\n            print(res) # (0.0,)\n\n    \"\"\"\n\n    def __init__(self, name='mcc', *args, **kwargs):\n        super(Mcc, self).__init__(*args, **kwargs)\n        self._name = name\n        self.tp = 0  # true positive\n        self.fp = 0  # false positive\n        self.tn = 0  # true negative\n        self.fn = 0  # false negative\n\n    def compute(self, pred, label, *args):\n        \"\"\"\n        Processes the pred tensor, and returns the indices of the maximum of each\n        sample.\n\n        Args:\n            pred (Tensor):\n                The predicted value is a Tensor with dtype float32 or float64.\n                Shape is [batch_size, 1].\n            label (Tensor):\n                The ground truth value is Tensor with dtype int64, and its\n                shape is [batch_size, 1].\n\n        Returns:\n            tuple: A tuple of preds and label. Each shape is\n            [batch_size, 1], with dtype float32 or float64.\n\n        \"\"\"\n        preds = paddle.argsort(pred, descending=True)[:, :1]\n        return (preds, label)\n\n    def update(self, preds_and_labels):\n        \"\"\"\n        Calculates states, i.e. the number of true positive, false positive,\n        true negative and false negative samples.\n\n        Args:\n            preds_and_labels (tuple[Tensor]):\n                Tuple of predicted value and the ground truth label, with dtype\n                float32 or float64. 
Each shape is [batch_size, 1].\n\n        \"\"\"\n        preds = preds_and_labels[0]\n        labels = preds_and_labels[1]\n        if isinstance(preds, paddle.Tensor):\n            preds = preds.numpy()\n        if isinstance(labels, paddle.Tensor):\n            labels = labels.numpy().reshape(-1, 1)\n        sample_num = labels.shape[0]\n        for i in range(sample_num):\n            pred = preds[i]\n            label = labels[i]\n            if pred == 1:\n                if pred == label:\n                    self.tp += 1\n                else:\n                    self.fp += 1\n            else:\n                if pred == label:\n                    self.tn += 1\n                else:\n                    self.fn += 1\n\n    def accumulate(self):\n        \"\"\"\n        Calculates and returns the accumulated metric.\n\n        Returns:\n            tuple: Returns the accumulated metric, a tuple of shape (mcc,), `mcc` is the accumulated mcc and its data\n            type is float64.\n\n        \"\"\"\n        if self.tp == 0 or self.fp == 0 or self.tn == 0 or self.fn == 0:\n            mcc = 0.0\n        else:\n            # mcc = (tp*tn-fp*fn)/ sqrt(tp+fp)(tp+fn)(tn+fp)(tn+fn))\n            mcc = (self.tp * self.tn - self.fp * self.fn) / math.sqrt(\n                (self.tp + self.fp) * (self.tp + self.fn) *\n                (self.tn + self.fp) * (self.tn + self.fn))\n        return (mcc, )\n\n    def reset(self):\n        \"\"\"\n        Resets all metric states.\n        \"\"\"\n        self.tp = 0  # true positive\n        self.fp = 0  # false positive\n        self.tn = 0  # true negative\n        self.fn = 0  # false negative\n\n    def name(self):\n        \"\"\"\n        Returns name of the metric instance.\n\n        Returns:\n            str: The name of the metric instance.\n\n        \"\"\"\n        return self._name\n\n\nclass PearsonAndSpearman(Metric):\n    \"\"\"\n    The class calculates `Pearson correlation coefficient 
<https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_\n    and `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ .\n\n\n    Args:\n        name (str, optional):\n            String name of the metric instance. Defaults to 'pearson_and_spearman'.\n\n    Example:\n\n        .. code-block::\n\n            import paddle\n            from paddlenlp.metrics import PearsonAndSpearman\n\n            x = paddle.to_tensor([[0.1], [1.0], [2.4], [0.9]])\n            y = paddle.to_tensor([[0.0], [1.0], [2.9], [1.0]])\n\n            m = PearsonAndSpearman()\n            m.update((x, y))\n            res = m.accumulate()\n            print(res) # (0.9985229081857804, 1.0, 0.9992614540928901)\n\n    \"\"\"\n\n    def __init__(self, name='pearson_and_spearman', *args, **kwargs):\n        super(PearsonAndSpearman, self).__init__(*args, **kwargs)\n        self._name = name\n        self.preds = []\n        self.labels = []\n\n    def update(self, preds_and_labels):\n        \"\"\"\n        Ensures the type of preds and labels is numpy.ndarray and reshapes them\n        into [-1, 1].\n\n        Args:\n            preds_and_labels (tuple[Tensor] or list[Tensor]):\n                Tuple or list of predicted value and the ground truth label.\n                Its data type should be float32 or float64 and its shape is [batch_size, d0, ..., dN].\n\n        \"\"\"\n        preds = preds_and_labels[0]\n        labels = preds_and_labels[1]\n        if isinstance(preds, paddle.Tensor):\n            preds = preds.numpy()\n        if isinstance(labels, paddle.Tensor):\n            labels = labels.numpy()\n        preds = np.squeeze(preds.reshape(-1, 1)).tolist()\n        labels = np.squeeze(labels.reshape(-1, 1)).tolist()\n        self.preds.append(preds)\n        self.labels.append(labels)\n\n    def accumulate(self):\n        \"\"\"\n        Calculates and returns the accumulated metric.\n\n        Returns:\n        
    tuple: Returns the accumulated metric, a tuple of (pearson, spearman,\n            the_average_of_pearson_and_spearman).\n\n            With the fields:\n\n            - `pearson` (numpy.float64):\n                The accumulated pearson.\n\n            - `spearman` (numpy.float64):\n                The accumulated spearman.\n\n            - `the_average_of_pearson_and_spearman` (numpy.float64):\n                The average of accumulated pearson and spearman correlation\n                coefficient.\n\n        \"\"\"\n        preds = [item for sublist in self.preds for item in sublist]\n        labels = [item for sublist in self.labels for item in sublist]\n        pearson = self.pearson(preds, labels)\n        spearman = self.spearman(preds, labels)\n        return (\n            pearson,\n            spearman,\n            (pearson + spearman) / 2, )\n\n    def pearson(self, preds, labels):\n        n = len(preds)\n        # simple sums\n        sum1 = sum(float(preds[i]) for i in range(n))\n        sum2 = sum(float(labels[i]) for i in range(n))\n        # sum up the squares\n        sum1_pow = sum([pow(v, 2.0) for v in preds])\n        sum2_pow = sum([pow(v, 2.0) for v in labels])\n        # sum up the products\n        p_sum = sum([preds[i] * labels[i] for i in range(n)])\n\n        numerator = p_sum - (sum1 * sum2 / n)\n        denominator = math.sqrt(\n            (sum1_pow - pow(sum1, 2) / n) * (sum2_pow - pow(sum2, 2) / n))\n        if denominator == 0:\n            return 0.0\n        return numerator / denominator\n\n    def spearman(self, preds, labels):\n        preds_rank = self.get_rank(preds)\n        labels_rank = self.get_rank(labels)\n\n        total = 0\n        n = len(preds)\n        for i in range(n):\n            total += pow((preds_rank[i] - labels_rank[i]), 2)\n        spearman = 1 - float(6 * total) / (n * (pow(n, 2) - 1))\n        return spearman\n\n    def get_rank(self, raw_list):\n        x = np.array(raw_list)\n        r_x = 
np.empty(x.shape, dtype=int)\n        y = np.argsort(-x)\n        for i, k in enumerate(y):\n            r_x[k] = i + 1\n        return r_x\n\n    def reset(self):\n        \"\"\"\n        Resets all metric states.\n        \"\"\"\n        self.preds = []\n        self.labels = []\n\n    def name(self):\n        \"\"\"\n        Returns name of the metric instance.\n\n        Returns:\n           str: The name of the metric instance.\n\n        \"\"\"\n        return self._name\n\n\nclass MultiLabelsMetric(Metric):\n    \"\"\"\n    This class encapsulates Accuracy, Precision, Recall and F1 metric logic in\n    multi-labels setting (also the binary setting).\n    Some codes are taken and modified from sklearn.metrics .\n\n    Args:\n        num_labels (int)\n            The total number of labels which is usually the number of classes\n        name (str, optional):\n            String name of the metric instance. Defaults to 'multi_labels_metric'.\n\n    Example:\n\n        .. code-block::\n\n            import paddle\n            from paddlenlp.metrics import MultiLabelsMetric\n\n            x = paddle.to_tensor([[0.1, 0.2, 0.9], [0.5, 0.8, 0.5], [0.6, 1.5, 0.4], [2.8, 0.7, 0.3]])\n            y = paddle.to_tensor([[2], [1], [2], [1]])\n\n            m = MultiLabelsMetric(num_labels=3)\n            args = m.compute(x, y)\n            m.update(args)\n\n            result1 = m.accumulate(average=None)\n            # (array([0.0, 0.5, 1.0]), array([0.0, 0.5, 0.5]), array([0.0, 0.5, 0.66666667]))\n            result2 = m.accumulate(average='binary', pos_label=0)\n            # (0.0, 0.0, 0.0)\n            result3 = m.accumulate(average='binary', pos_label=1)\n            # (0.5, 0.5, 0.5)\n            result4 = m.accumulate(average='binary', pos_label=2)\n            # (1.0, 0.5, 0.6666666666666666)\n            result5 = m.accumulate(average='micro')\n            # (0.5, 0.5, 0.5)\n            result6 = m.accumulate(average='macro')\n            # (0.5, 
0.3333333333333333, 0.38888888888888884)\n            result7 = m.accumulate(average='weighted')\n            # (0.75, 0.5, 0.5833333333333333)\n\n    Note: When zero_division is encountered (details as followed), the corresponding metrics will be set to 0.0\n        precision is zero_division if there are no positive predictions\n        recall is zero_division if there are no positive labels\n        fscore is zero_division if all labels AND predictions are negative\n    \"\"\"\n\n    def __init__(self, num_labels, name='multi_labels_metric'):\n        super(MultiLabelsMetric, self).__init__()\n        if num_labels <= 1:\n            raise ValueError(\n                f\"The num_labels is {num_labels}, which must be greater than 1.\"\n            )\n        self.num_labels = num_labels\n        self._name = name\n        self._confusion_matrix = np.zeros((num_labels, 2, 2), dtype=int)\n\n    def update(self, args):\n        \"\"\"\n        Updates the metrics states (accuracy, precision and recall), in order to\n        calculate accumulated accuracy, precision and recall of all instances.\n\n        Args:\n            args (tuple of Tensor):\n                the tuple returned from `compute` function\n        \"\"\"\n        pred = args[0].numpy()\n        label = args[1].numpy()\n        tmp_confusion_matrix = self._multi_labels_confusion_matrix(pred, label)\n        self._confusion_matrix += tmp_confusion_matrix\n\n    def accumulate(self, average=None, pos_label=1):\n        \"\"\"\n        Calculates and returns the accumulated metric.\n\n        Args:\n            average (str in {‘binary’, ‘micro’, ‘macro’, ’weighted’} or None, optional):\n            Defaults to `None`. 
If `None`, the scores for each class are returned.\n            Otherwise, this determines the type of averaging performed on the data:\n\n            - `binary` :\n                Only report results for the class specified by pos_label.\n\n            - `micro` :\n                Calculate metrics globally by counting the total true positives,\n                false negatives and false positives.\n\n            - `macro` :\n                Calculate metrics for each label, and find their unweighted mean.\n                This does not take label imbalance into account.\n\n            - `weighted` :\n                Calculate metrics for each label, and find their average weighted\n                by support (the number of true instances for each label). This\n                alters `macro` to account for label imbalance; it can result in\n                an F-score that is not between precision and recall.\n\n            pos_label (int, optional):\n                The positive label for calculating precision and recall in binary settings.\n                Noted: Only when `average='binary'`, this arguments will be used. Otherwise,\n                it will be ignored.\n                Defaults to 1.\n\n        Returns:\n            tuple: The accumulated metric. 
A tuple of shape (precision, recall, f1)\n                With the fields:\n\n                - `precision` (numpy.float64 or numpy.ndarray if average=None):\n                    The accumulated precision.\n                - `recall` (numpy.float64 or numpy.ndarray if average=None):\n                    The accumulated recall.\n                - `f1` (numpy.float64 or numpy.ndarray if average=None):\n                    The accumulated f1.\n\n        \"\"\"\n        if average not in {'binary', 'micro', 'macro', 'weighted', None}:\n            raise ValueError(f\"The average is {average}, which is unknown.\")\n        if average == 'binary':\n            if pos_label >= self.num_labels:\n                raise ValueError(\n                    f\"The pos_label is {pos_label}, num_labels is {self.num_labels}. \"\n                    f\"The num_labels must be greater than pos_label.\")\n\n        confusion_matrix = None  # [*, 2, 2]\n        if average == 'binary':\n            confusion_matrix = np.expand_dims(\n                self._confusion_matrix[pos_label], axis=0)\n        elif average == 'micro':\n            confusion_matrix = self._confusion_matrix.sum(axis=0,\n                                                          keepdims=True)\n        #  if average is 'macro' or 'weighted' or None\n        else:\n            confusion_matrix = self._confusion_matrix\n\n        tp = confusion_matrix[:, 1, 1]  # [*,]\n        pred = tp + confusion_matrix[:, 0, 1]  # [*,]\n        true = tp + confusion_matrix[:, 1, 0]  # [*,]\n\n        def _robust_divide(numerator, denominator, metric_name):\n            mask = denominator == 0.0\n            denominator = denominator.copy()\n            denominator[mask] = 1  # avoid zero division\n            result = numerator / denominator\n\n            if not np.any(mask):\n                return result\n\n            # precision is zero_division if there are no positive predictions\n            # recall is zero_division if there 
are no positive labels\n            # fscore is zero_division if all labels AND predictions are negative\n            warnings.warn(f'Zero division when calculating {metric_name}.',\n                          UserWarning)\n            result[mask] = 0.0\n            return result\n\n        precision = _robust_divide(tp, pred, 'precision')\n        recall = _robust_divide(tp, true, 'recall')\n        f1 = _robust_divide(2 * (precision * recall), (precision + recall),\n                            'f1')\n\n        weights = None  # [num_labels]\n        if average == 'weighted':\n            weights = true\n            if weights.sum() == 0:\n                zero_division_value = np.float64(0.0)\n                if pred.sum() == 0:\n                    return (zero_division_value, zero_division_value,\n                            zero_division_value)\n                else:\n                    return (np.float64(0.0), zero_division_value,\n                            np.float64(0.0))\n        elif average == 'macro':\n            weights = np.ones((self.num_labels), dtype=float)\n        if average is not None:\n            precision = np.average(precision, weights=weights)\n            recall = np.average(recall, weights=weights)\n            f1 = np.average(f1, weights=weights)\n\n        return precision, recall, f1\n\n    def compute(self, pred, label):\n        \"\"\"\n        Accepts network's output and the labels, and calculates the top-k\n        (maximum value in topk) indices for accuracy.\n\n        Args:\n            pred (Tensor):\n                Predicted tensor, and its dtype is float32 or float64, and\n                has a shape of [batch_size, *, num_labels].\n            label (Tensor):\n                The ground truth tensor, and its dtype is is int64, and has a\n                shape of [batch_size, *] or [batch_size, *, num_labels] in one\n                hot representation.\n\n        Returns:\n            tuple of Tensor: it contains two 
Tensor of shape [*, 1].\n            The tuple should be passed to `update` function.\n        \"\"\"\n        if not (paddle.is_tensor(pred) and paddle.is_tensor(label)):\n            raise ValueError('pred and label must be paddle tensor')\n\n        if pred.shape[-1] != self.num_labels:\n            raise ValueError(f'The last dim of pred is {pred.shape[-1]}, '\n                             f'which should be num_labels')\n        pred = paddle.reshape(pred, [-1, self.num_labels])\n        pred = paddle.argmax(pred, axis=-1)\n\n        if label.shape[-1] == self.num_labels:\n            label = paddle.reshape(label, [-1, self.num_labels])\n            label = paddle.argmax(label, axis=-1)\n        else:\n            label = paddle.reshape(label, [-1])\n            if paddle.max(label) >= self.num_labels:\n                raise ValueError(\n                    f\"Tensor label has value {paddle.max(label)}, \"\n                    f\"which is no less than num_labels\")\n\n        if pred.shape[0] != label.shape[0]:\n            raise ValueError(\n                f\"The length of pred is not equal to the length of label\")\n\n        return pred, label\n\n    def _multi_labels_confusion_matrix(self, pred, label):\n        tp_bins = label[pred == label]\n        tp = np.bincount(tp_bins, minlength=self.num_labels)  # [num_labels,]\n        tp_plus_fp = np.bincount(\n            pred, minlength=self.num_labels)  # [num_labels,]\n        tp_plus_fn = np.bincount(\n            label, minlength=self.num_labels)  # [num_labels,]\n        fp = tp_plus_fp - tp  # [num_labels,]\n        fn = tp_plus_fn - tp  # [num_labels,]\n        tn = pred.shape[0] - tp - fp - fn  # [num_labels,]\n        return np.array([tn, fp, fn, tp]).T.reshape(-1, 2,\n                                                    2)  # [num_labels, 2, 2]\n\n    def reset(self):\n        self._confusion_matrix = np.zeros((self.num_labels, 2, 2), dtype=int)\n\n    def name(self):\n        \"\"\"\n        Returns 
name of the metric instance.\n\n        Returns:\n           str: The name of the metric instance.\n\n        \"\"\"\n        return self._name\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .gate import GShardGate, BaseGate, SwitchGate, NaiveGate\nfrom .moe_layer import MoELayer\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/comm/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/comm_ops.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# The file has been adapted from the file:\n#     https://github.com/laekov/fastmoe/blob/master/fmoe/functions.py\n#     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4\n# We retain the following license from the original files:\n#     Copyright 2021, Jiaao He. All rights reserved.\n#   Licensed under the Apache License, Version 2.0 (the \"License\").\n\nimport paddle\nfrom paddle.autograd import PyLayer\nfrom paddle.distributed.utils.moe_utils import global_scatter, global_gather\nfrom .utils import _local_scatter, _local_gather, _all_gather\n\n\nclass MoEScatter(PyLayer):\n    r\"\"\"\n    Scatter input samples from [batch x sequences] to contiguous alone experts.\n    If `world_size` is greater than 1, the samples will first be locally\n    scattered, and then exchanged across workers.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx,\n                inp,\n                pos,\n                local_expert_count,\n                global_expert_count,\n                fwd_batch_size,\n                world_size,\n                group=None):\n        local_input_buf = _local_scatter(inp, pos)\n        if world_size > 1:\n            global_input_buf = global_scatter(\n                local_input_buf,\n                local_expert_count,\n                global_expert_count,\n                group=group)\n        
else:\n            global_input_buf = local_input_buf\n\n        ctx.moe_args = inp.shape[0], world_size, group\n\n        variables = (pos, local_expert_count, global_expert_count)\n        ctx.save_for_backward(*variables)\n        return global_input_buf\n\n    @staticmethod\n    def backward(ctx, grad):\n        (pos, local_expert_count, global_expert_count) = ctx.saved_tensor()\n        (inp_batch_size, world_size, group) = ctx.moe_args\n\n        if world_size > 1:\n            local_grad_in = global_gather(\n                grad, local_expert_count, global_expert_count, group=group)\n        else:\n            local_grad_in = grad\n        grad_in = _local_gather(local_grad_in, pos, inp_batch_size)\n        return grad_in, None, None, None\n\n\nclass MoEGather(PyLayer):\n    r\"\"\"\n    Gather output samples from contiguous alone experts back to [batch x\n    sequences]. Works symmetrically with MoEScatter.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx,\n                global_output_buf,\n                pos,\n                local_expert_count,\n                global_expert_count,\n                local_batch_size,\n                world_size,\n                group=None):\n        if world_size > 1:\n            local_output_buf = global_gather(\n                global_output_buf,\n                local_expert_count,\n                global_expert_count,\n                group=group)\n        else:\n            local_output_buf = global_output_buf\n        output = _local_gather(\n            local_output_buf, pos, local_batch_size, maybe_overlap=False)\n\n        ctx.moe_args = (global_output_buf.shape[0], world_size, group)\n        variables = (pos, local_expert_count, global_expert_count)\n        ctx.save_for_backward(*variables)\n        return output\n\n    @staticmethod\n    def backward(ctx, grad_out):\n        pos, local_expert_count, global_expert_count = ctx.saved_tensor()\n        fwd_batch_size, world_size, group = ctx.moe_args\n   
     grad_out_buf = _local_scatter(grad_out, pos)\n        if world_size > 1:\n            global_grad_out_buf = global_scatter(\n                grad_out_buf,\n                local_expert_count,\n                global_expert_count,\n                group=group)\n        else:\n            global_grad_out_buf = grad_out_buf\n        return global_grad_out_buf, None, None, None\n\n\nclass AllGather(PyLayer):\n    r\"\"\"\n    A wrapper for the All-Gather function to support auto-differentiation.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx, inp, rank, world_size, group):\n        tensor_list = []\n        paddle.distributed.all_gather(tensor_list, inp, group=group)\n        output = paddle.concat(tensor_list, axis=0)\n        ctx.args = rank, inp.shape[0]\n        return output\n\n    @staticmethod\n    def backward(ctx, grad_out):\n        rank, dim0 = ctx.args\n        return paddle.slice(\n            grad_out, axes=[0], starts=[rank * dim0], ends=[(rank + 1) * dim0])\n\n\nclass Slice(PyLayer):\n    r\"\"\"\n    A wrapper for the Slice function to support auto-differentiation.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx, inp, rank, world_size, group):\n        B = inp.shape[0]\n        local_batch_size = B // world_size\n        batch_start = local_batch_size * rank\n        batch_end = min(batch_start + local_batch_size, B)\n        inp = paddle.slice(\n            inp, axes=[0], starts=[batch_start], ends=[batch_end])\n        ctx.args = world_size, group\n        return inp\n\n    @staticmethod\n    def backward(ctx, grad_out):\n        world_size, group = ctx.args\n        return _all_gather(grad_out, group=group)\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/gate/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .gshard_gate import GShardGate\nfrom .switch_gate import SwitchGate\nfrom .naive_gate import NaiveGate\nfrom .base_gate import BaseGate\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/gate/base_gate.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# The file has been adapted from the file:\n#     https://github.com/laekov/fastmoe/blob/master/fmoe/gates/base_gate.py\n#     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4\n# We retain the following license from the original files:\n#     Copyright 2021, Jiaao He. All rights reserved.\n#   Licensed under the Apache License, Version 2.0 (the \"License\").\n\nimport paddle.nn as nn\n\n\nclass BaseGate(nn.Layer):\n    def __init__(self, num_expert, group=None):\n        super().__init__()\n        self.world_size = group.nranks if group is not None else 1\n        self.num_expert = num_expert\n        self.tot_expert = self.world_size * num_expert\n        self.loss = None\n\n    def forward(self, x):\n        raise NotImplementedError(\"Please implement the forward function.\")\n\n    def set_loss(self, loss):\n        self.loss = loss\n\n    def get_loss(self, clear=True):\n        loss = self.loss\n        if clear:\n            self.loss = None\n        return loss\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/gate/gshard_gate.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# The file has been adapted from the file:\n#     https://github.com/laekov/fastmoe/blob/master/fmoe/gates/gshard_gate.py\n#     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4\n# We retain the following license from the original files:\n#     Copyright 2021, Jiaao He. All rights reserved.\n#   Licensed under the Apache License, Version 2.0 (the \"License\").\n\nimport math\nimport paddle\nimport paddle.nn.functional as F\nfrom .naive_gate import NaiveGate\nfrom ..utils import limit_by_capacity\n\n\nclass GShardGate(NaiveGate):\n    def __init__(self,\n                 d_model,\n                 num_expert,\n                 topk=2,\n                 capacity=(1.2, 2.4),\n                 random_routing=True,\n                 group=None):\n        assert topk == 2, \"topk should be 2 in gshard\"\n        super().__init__(d_model, num_expert, group)\n        self.capacity = capacity\n        self.random_routing = random_routing\n        self.group = group\n\n    def forward(self, x):\n        topk_val, topk_idx, gate_score = super().forward(\n            x, return_all_scores=True)\n        s = gate_score.shape[0]\n        top1_idx = topk_idx.flatten()\n        c_e = paddle.scatter(\n            paddle.zeros(shape=[self.tot_expert]),\n            top1_idx,\n            paddle.ones_like(\n                top1_idx, 
dtype=\"float32\"),\n            overwrite=False) / s\n        m_e = paddle.mean(F.softmax(gate_score, axis=1), axis=0)\n        loss = paddle.mean(c_e * m_e) * (self.num_expert**2)\n        self.set_loss(loss)\n\n        cap_rate = self.capacity[0 if self.training else 1]\n        capacity = math.ceil(cap_rate * x.shape[0])\n        _new_lec, _new_gec, topk_idx = limit_by_capacity(\n            topk_idx,\n            self.num_expert,\n            self.world_size,\n            capacity,\n            group=self.group)\n\n        if self.random_routing:\n            rand_routing_prob = paddle.rand(\n                shape=[gate_score.shape[0]], dtype=\"float32\")\n            topk_idx = paddle.distributed.models.moe.utils._random_routing(\n                topk_idx, topk_val, rand_routing_prob)\n        return topk_val, topk_idx\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/gate/naive_gate.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# The file has been adapted from the file:\n#     https://github.com/laekov/fastmoe/blob/master/fmoe/gates/naive_gate.py\n#     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4\n# We retain the following license from the original files:\n#     Copyright 2021, Jiaao He. All rights reserved.\n#   Licensed under the Apache License, Version 2.0 (the \"License\").\n\nfrom .base_gate import BaseGate\n\nimport paddle\nimport paddle.nn as nn\n\n\nclass NaiveGate(BaseGate):\n    def __init__(self, d_model, num_expert, group=None, topk=2):\n        super().__init__(num_expert, group)\n        self.gate = nn.Linear(d_model, self.tot_expert)\n        self.gate.weight.name = \"gate_\" + self.gate.weight.name\n        self.gate.bias.name = \"gate_\" + self.gate.bias.name\n        self.top_k = topk\n\n    def forward(self, inp, return_all_scores=False):\n        gate = self.gate(inp)\n        gate_top_k_val, gate_top_k_idx = paddle.topk(\n            gate, k=self.top_k, axis=-1, largest=True, sorted=False)\n\n        if return_all_scores:\n            return gate_top_k_val, gate_top_k_idx, gate\n        return gate_top_k_val, gate_top_k_idx\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/gate/switch_gate.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# The file has been adapted from the file:\n#     https://github.com/laekov/fastmoe/blob/master/fmoe/gates/switch_gate.py\n#     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4\n# We retain the following license from the original files:\n#     Copyright 2021, Jiaao He. All rights reserved.\n#   Licensed under the Apache License, Version 2.0 (the \"License\").\n\nimport math\nimport paddle\nimport paddle.nn.functional as F\nfrom .naive_gate import NaiveGate\nfrom ..utils import limit_by_capacity\n\n\nclass SwitchGate(NaiveGate):\n    def __init__(self,\n                 d_model,\n                 num_expert,\n                 topk=1,\n                 switch_eps=.1,\n                 capacity=(1.2, 2.4),\n                 group=None):\n        assert topk == 1, \"topk should be 1 in switch\"\n        super().__init__(d_model, num_expert, group, topk=1)\n        self.switch_eps = switch_eps\n        self.capacity = capacity\n        self.group = group\n\n    def forward(self, inp):\n        score = self.gate(inp)\n\n        if self.training:\n            noise = paddle.rand(shape=score.shape)\n            noise = noise * 2 * self.switch_eps + 1.0 - self.switch_eps\n            score += noise\n\n        score = F.softmax(score, axis=-1)\n        top1_score, top1_idx = paddle.topk(score, k=1, axis=-1, largest=True)\n\n        
cap_rate = self.capacity[0 if self.training else 1]\n        capacity = math.ceil(cap_rate * inp.shape[0])\n        _new_lec, _new_gec, top1_idx = limit_by_capacity(\n            top1_idx,\n            self.num_expert,\n            self.world_size,\n            capacity,\n            group=self.group)\n        valid_idx = top1_idx[top1_idx > -1]\n        valid_idx_tmp = paddle.reshape(valid_idx, shape=[len(valid_idx), 1])\n        fraction_expert = paddle.scatter_nd_add(\n            x=paddle.zeros(shape=[self.tot_expert]),\n            index=valid_idx_tmp,\n            updates=paddle.ones_like(\n                valid_idx, dtype=paddle.float32).reshape(\n                    shape=[len(valid_idx)]), ) / valid_idx.numel()\n        prob_expert = score.sum(axis=0) / valid_idx.numel()\n        loss = (fraction_expert * prob_expert).sum() * self.tot_expert\n        self.set_loss(loss)\n\n        return top1_score, top1_idx\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/moe_layer.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# The file has been adapted from the file:\n#     https://github.com/laekov/fastmoe/blob/master/fmoe/layers.py\n#     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4\n# We retain the following license from the original files:\n#     Copyright 2021, Jiaao He. All rights reserved.\n#   Licensed under the Apache License, Version 2.0 (the \"License\").\n\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\n\nfrom .gate import NaiveGate, GShardGate, SwitchGate, BaseGate\nfrom .comm_ops import MoEScatter, MoEGather, AllGather, Slice\nfrom .utils import prepare_forward\nfrom paddle.distributed.fleet.utils import recompute\nfrom paddle.incubate.distributed.fleet import recompute_hybrid\n\n\nclass MoELayer(nn.Layer):\n    \"\"\"MoE Layer\n    Args:\n        d_model: (int) model dimension\n        experts: (list|nn.LayerList) expert networks list\n        gate: (str|BaseGate|None):\n                if gate is a str, it can only be \"naive\", \"gshard\", \"switch\" or None, default is \"naive\"\n                else gate is an instance of BaseGate\n        \n        top_k: (int) default value is 2\n        moe_group: moe group for experts communication\n        mp_group: mp group for mp communication\n        recompute_interval(int, optional): whether to use recompute, default 0, means to disable recompute.\n        
recompute_ctx(dict, optional): the context for recompute, if recompute_interval > 1, recompute_ctx must be given.\n    Examples:\n        .. code-block:: python\n        from paddle.nn import layer, LayerList\n        from paddle.distributed.moe import MoElayer\n        from paddle.distributed.collective import Group\n        from paddle.distributed import fleet\n\n        moe_group = Group(fleet.worker_index(),\n                          0,\n                          list(range(fleet.worker_num())))\n        mp_group = None\n\n        num_experts=8\n        dim_feedforward=512\n        d_model=8\n        top_k=2\n\n        class ExpertLayer(Layer):\n            def __init__(self, d_model, d_hidden, name=None):\n                super(ExpertLayer, self).__init__()\n                self.htoh4 = nn.Linear(d_model, d_hidden)\n                self.h4toh = nn.Linear(d_hidden, d_model)\n\n            def forward(self, x):\n                x = self.htoh4(x)\n                x = self.h4toh(x)\n                return x\n\n        experts_list = LayerList()\n        for expi in range(num_experts):\n            exp_layer = ExpertLayer(d_model, dim_feedforward)\n            experts_list.append(exp_layer)\n\n        moeLayer = MoELayer(d_model = d_model,\n                            experts=experts_list,\n                            gate=\"gshard\",\n                            top_k=2,\n                            moe_group=moe_group,\n                            mp_group=mp_group,\n                            recompute_interval=0)\n\n    \"\"\"\n\n    def __init__(self,\n                 d_model,\n                 experts,\n                 moe_group=None,\n                 mp_group=None,\n                 top_k=2,\n                 gate=None,\n                 recompute_interval=0,\n                 recompute_partition=False,\n                 recompute_offload=False):\n        super(MoELayer, self).__init__()\n\n        self.d_model = d_model\n\n        assert experts is not 
None\n        assert isinstance(experts, (list, nn.LayerList)), \\\n             \"The type of experts must be list or nn.LayerList\"\n\n        for i, exp in enumerate(experts):\n            assert isinstance(\n                exp,\n                nn.Layer), \"The type of experts[{}] must be nn.Layer\".format(i)\n\n        self.experts = nn.LayerList(experts) if isinstance(experts,\n                                                           list) else experts\n        self.num_expert = len(experts)\n\n        gate = \"naive\" if gate is None else gate\n        assert isinstance(gate, (str, BaseGate)), \\\n             \"The type of gate must be str or an instance of BaseGate\"\n        self.top_k = top_k\n\n        # only support mp/dp\n        self.group = moe_group\n        self.mp_group = mp_group\n\n        self.world_size = self.group.nranks \\\n            if self.group is not None else 1\n\n        if isinstance(gate, str):\n            gate_map = {\n                \"naive\": NaiveGate,\n                \"gshard\": GShardGate,\n                \"switch\": SwitchGate,\n            }\n\n            if gate in gate_map.keys():\n                self.gate = gate_map[gate](self.d_model,\n                                           num_expert=self.num_expert,\n                                           topk=self.top_k,\n                                           group=self.group)\n            else:\n                assert False, \"We only support naive gate, \\\n                                gshard gate and switch gate, \\\n                                but you choose {} gate.\".format(gate)\n        elif isinstance(gate, BaseGate):\n            self.gate = gate\n        else:\n            raise TypeError(\"The type of gate must be either str in ('naive', \\\n                'gshard', 'switch') or an instance of moe.BaseGate\")\n\n        self.recompute_interval = recompute_interval\n        self.recompute_ctx = {\n            \"mp_group\": self.mp_group,\n  
          \"offload\": recompute_offload,\n            \"partition\": recompute_partition,\n        }\n\n    def forward(self, inp):\n        origin_shape = inp.shape\n        inp = inp.reshape_([-1, origin_shape[-1]])\n\n        mp_rank = 0\n        mp_size = 1\n        if self.mp_group is not None:\n            mp_rank = self.mp_group.rank\n            mp_size = self.mp_group.nranks\n        if mp_size > 1:\n            inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group)\n        value, gate = self.gate(inp)\n\n        (\n            pos,\n            local_expert_count,\n            global_expert_count,\n            fwd_expert_count,\n            fwd_batch_size, ) = prepare_forward(gate, self.num_expert,\n                                                self.world_size, self.group)\n\n        topk = 1\n        if len(gate.shape) == 2:\n            topk = gate.shape[1]\n\n        if pos.shape != [0]:\n            temp_pos = pos // topk\n        else:\n            temp_pos = pos\n        assert topk == self.top_k\n\n        x = MoEScatter.apply(inp, temp_pos, local_expert_count,\n                             global_expert_count, fwd_batch_size,\n                             self.world_size, self.group)\n\n        d_model = self.d_model\n\n        def experts_fwd(x, fwd_expert_count, experts):\n\n            if x.shape[0] == 0:\n                return x\n            y = []\n            last_index = 0\n            assert isinstance(fwd_expert_count, np.ndarray)\n            assert len(experts) == len(fwd_expert_count)\n            for idx, expert_count in enumerate(fwd_expert_count):\n                if expert_count <= 0:\n                    continue\n                y.append(experts[idx](x[last_index:expert_count + last_index]))\n                last_index = expert_count + last_index\n            return paddle.concat(y, axis=0)\n\n        if self.recompute_interval <= 0 or x.shape[0] == 0:\n            x = experts_fwd(x, fwd_expert_count.numpy(), 
self.experts)\n        elif self.world_size > 1:\n            x = recompute_hybrid(self.recompute_ctx, experts_fwd, x,\n                                 fwd_expert_count.numpy(), self.experts)\n        else:\n            x = recompute(experts_fwd, x,\n                          fwd_expert_count.numpy(), self.experts)\n\n        out_batch_size = inp.shape[0]\n        if len(gate.shape) == 2:\n            out_batch_size *= gate.shape[1]\n\n        x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,\n                            out_batch_size, self.world_size, self.group)\n\n        x = x.reshape([-1, self.top_k, d_model])\n        value = value.reshape([x.shape[0], 1, self.top_k])\n        x = paddle.bmm(value, x).reshape([-1, d_model])\n\n        if mp_size > 1:\n            x = AllGather.apply(x, mp_rank, mp_size, self.mp_group)\n\n        x = paddle.reshape_(x, origin_shape)\n\n        return x\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe/utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# The file has been adapted from the file:\n#     https://github.com/laekov/fastmoe/blob/master/fmoe/functions.py\n#     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4\n# We retain the following license from the original files:\n#     Copyright 2021, Jiaao He. All rights reserved.\n#   Licensed under the Apache License, Version 2.0 (the \"License\").\n\nimport paddle\nfrom paddle.distributed.models.moe.utils import _number_count, _limit_by_capacity, _prune_gate_by_capacity, _assign_pos\n\n\ndef prepare_forward(gate, num_expert, world_size, moe_group):\n    pos, local_expert_count, global_expert_count = count_by_gate(\n        gate, num_expert, world_size, group=moe_group)\n    with paddle.no_grad():\n        fwd_expert_count = global_expert_count.reshape_(\n            [world_size, num_expert]).sum(axis=0)\n        fwd_batch_size = int(fwd_expert_count.sum().item())\n    return (\n        pos,\n        local_expert_count,\n        global_expert_count,\n        fwd_expert_count,\n        fwd_batch_size, )\n\n\ndef _alltoall(in_tensor_list, group=None, use_calc_stream=True):\n    if group is not None and not group.is_member():\n        return\n\n    group = paddle.distributed.collective._get_default_group(\n    ) if group is None else group\n    out = paddle.empty(in_tensor_list.shape, in_tensor_list.dtype)\n    task = 
group.process_group.alltoall(in_tensor_list, out)\n    task.wait()\n    return out\n\n\ndef _local_scatter(inp, pos):\n    if pos.shape != [0]:\n        inp_buf = paddle.index_select(inp, pos, 0)\n    else:\n        inp_buf = paddle.empty([0, inp.shape[1]], dtype=inp.dtype)\n    return inp_buf\n\n\ndef _local_gather(inp, pos, out_batch_size, maybe_overlap=True):\n    if pos.shape != [0]:\n        origin_dtype = inp.dtype\n        inp = paddle.cast(inp, dtype=\"float32\")\n        inp_buf = paddle.scatter(\n            paddle.zeros(\n                shape=[out_batch_size, inp.shape[-1]], dtype=\"float32\"),\n            pos,\n            inp,\n            overwrite=True)\n        inp_buf = paddle.cast(inp_buf, dtype=origin_dtype)\n    else:\n        inp_buf = paddle.zeros(\n            [out_batch_size, inp.shape[-1]], dtype=inp.dtype)\n    return inp_buf\n\n\ndef _all_gather(tensor, group=None, use_calc_stream=True):\n    if group is not None and not group.is_member():\n        return\n\n    group = paddle.distributed.collective._get_default_group(\n    ) if group is None else group\n    tensor_shape = list(tensor.shape)\n    tensor_shape[0] *= group.nranks\n    out = paddle.empty(tensor_shape, tensor.dtype)\n\n    task = group.process_group.all_gather(tensor, out)\n    task.wait()\n    return out\n\n\ndef count_by_gate(gate, num_expert, world_size, require_pos=True, group=None):\n    total_expert_count = num_expert * world_size\n    with paddle.no_grad():\n        local_expert_count = _number_count(gate, total_expert_count)\n\n        if world_size > 1:\n            global_expert_count = _alltoall(local_expert_count, group=group)\n        else:\n            global_expert_count = local_expert_count\n        if not require_pos:\n            pos = None\n        else:\n            lec_cum = paddle.cumsum(local_expert_count, axis=0)\n            pos = _assign_pos(gate, lec_cum)\n    return pos, local_expert_count, global_expert_count\n\n\ndef limit_by_capacity(topk_idx, 
num_expert, world_size, capacity, group=None):\n    with paddle.no_grad():\n        capacity = paddle.ones(\n            shape=[num_expert], dtype=paddle.int64) * capacity\n        pos, lec, gec = count_by_gate(\n            topk_idx, num_expert, world_size, require_pos=False, group=group)\n        new_gec = _limit_by_capacity(gec, capacity, world_size)\n        if world_size > 1:\n            assert group.nranks == world_size\n            new_lec = _alltoall(new_gec, group=group)\n        else:\n            new_lec = new_gec\n\n        topk_idx = _prune_gate_by_capacity(topk_idx, new_lec, num_expert,\n                                           world_size)\n\n    return new_lec, new_gec, topk_idx\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe_exp/__init__.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe_exp/experts.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n\n# The file has been adapted from a deepspeed file:\n# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/experts.py\n# Git commit hash: a091bc223c01e94448f443456a6c15684644b966\n# We retain the following license from the original files:\n#   Copyright (c) The Microsoft DeepSpeed Team. All rights reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport copy\n\n\nclass Experts(nn.Layer):\n    def __init__(self, expert, num_local_experts=1, expert_group_name=None):\n        super(Experts, self).__init__()\n\n        self.fleetx_experts = nn.LayerList(\n            [copy.deepcopy(expert) for i in range(num_local_experts)])\n        self.num_local_experts = num_local_experts\n\n        # TODO: revisit allreduce for moe.gate...\n        for expert in self.fleetx_experts:\n            # TODO: Create param groups to handle expert + data case (e.g. 
param.group = moe_group)\n            for name, param in expert.named_parameters():\n                param.allreduce = False\n                param.group_name = expert_group_name\n\n    def forward(self, inputs):\n        chunks = paddle.chunk(inputs, chunks=self.num_local_experts, axis=1)\n        expert_outputs = []\n        for chunk, expert in zip(chunks, self.fleetx_experts):\n            out = expert(chunk)\n            if type(out) is tuple:\n                out = out[0]  # Ignore the bias term for now\n            expert_outputs += [out]\n\n        expert_output = paddle.concat(expert_outputs, axis=1)\n        return expert_output\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe_exp/layer.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# The file has been adapted from a deepspeed file:\n# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/layer.py\n# Git commit hash: a091bc223c01e94448f443456a6c15684644b966\n# We retain the following license from the original files:\n#   Copyright (c) The Microsoft DeepSpeed Team. All rights reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\nfrom .experts import Experts\nfrom .sharded_moe import TopKGate, MOELayer\n\n\nclass MoE(nn.Layer):\n    def __init__(self,\n                 hidden_size,\n                 expert,\n                 num_experts=1,\n                 ep_size=1,\n                 k=1,\n                 capacity_factor=1.,\n                 eval_capacity_factor=1.,\n                 min_capacity=4,\n                 use_residual=False,\n                 noisy_gate_policy=None,\n                 drop_tokens=True,\n                 use_rts=False,\n                 enable_expert_tensor_parallelism=False):\n        super(MoE, self).__init__()\n\n        self.use_residual = use_residual\n        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism\n        assert num_experts % ep_size == 0, f\"Number of experts ({num_experts}) should be divisible by expert parallel size ({ep_size})\"\n        self.ep_size = ep_size\n        
self.expert_group_name = f\"ep_size_{self.ep_size}\"\n        self.num_experts = num_experts\n        self.num_local_experts = num_experts // self.ep_size\n\n        # log_dist(\n        #     f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {self.ep_size}',\n        #     [0])\n\n        assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \\\n            'Unsupported noisy_gate_policy: ' + noisy_gate_policy\n\n        experts = Experts(expert, self.num_local_experts,\n                          self.expert_group_name)\n        self.gate = TopKGate(hidden_size, num_experts, k, capacity_factor,\n                             eval_capacity_factor, min_capacity,\n                             noisy_gate_policy, drop_tokens, use_rts)\n        self.fleetx_moe = MOELayer(self.gate, experts, self.expert_group_name,\n                                   self.ep_size, self.num_local_experts)\n        if self.use_residual:\n            self.mlp = expert\n            # coefficient is used for weighted sum of the output of expert and mlp\n            self.coefficient = nn.Linear(hidden_size, 2)\n\n    def forward(self, hidden_states, used_token=None):\n        \"\"\" MoE forward\n\n        Arguments:\n            hidden_states (Tensor): input to the layer\n            used_token (Tensor, optional): default: None, mask only used tokens\n\n        Returns:\n            A tuple including output, gate loss, and expert count.\n\n            * output (Tensor): output of the model\n\n            * l_aux (Tensor): gate loss value\n\n            * exp_counts (int): expert count\n        \"\"\"\n        output = self.fleetx_moe(hidden_states, used_token)\n        if self.use_residual:\n            # Residual MoE\n            output_mlp = self.mlp(hidden_states)\n            if type(output_mlp) is tuple:\n                output_mlp = output_mlp[0]  # Ignore the bias term for now\n     
       coef = self.coefficient(hidden_states)\n            coef = F.softmax(coef, dim=-1)\n            output = output * coef[..., 0:1] + output_mlp * coef[..., 1:]\n        return output\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe_exp/mappings.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# The file has been adapted from a deepspeed file:\n# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/mappings.py\n# Git commit hash: a091bc223c01e94448f443456a6c15684644b966\n# We retain the following license from the original files:\n#   Copyright (c) The Microsoft DeepSpeed Team. All rights reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.autograd import PyLayer\n\n\n#TODO: set axis for all_gather\ndef _gather_tokens(input_, group, axis=0):\n    \"\"\"Gather tensors and concatenate them along a axisension\"\"\"\n    # in case model is not deployed in distributed environment\n    group = dist.collective._get_default_group() if group is None else group\n    tensor_list = [paddle.empty_like(input_) for _ in range(group.nranks)]\n    dist.all_gather(tensor_list, input_, group)\n    output_ = paddle.concat(tensor_list, axis=axis)\n    return output_\n\n\ndef _drop_tokens(input_, group, axis=0):\n    \"\"\"Divide a tensor among the tensor parallel ranks\"\"\"\n    # in case model is not deployed in distributed environment\n    group = dist.collective._get_default_group() if group is None else group\n\n    total_chunks = group.nranks\n    this_chunk = group.rank\n    assert input_.shape[\n        axis] % total_chunks == 0, f\"input dimention {axis} ({input_.shape[axis]}) is not divisible 
by tensor parallel world size ({total_chunks})\"\n    chunk_size = input_.shape[axis] // total_chunks\n\n    return paddle.slice(input_, [axis], [this_chunk * chunk_size],\n                        [this_chunk * chunk_size + chunk_size])\n\n\nclass _GatherTokens(PyLayer):\n    \"\"\"All gather tokens among the tensor parallel ranks\"\"\"\n\n    @staticmethod\n    def forward(ctx, input_, group, axis):\n        ctx.group = group\n        ctx.axis = axis\n        return _gather_tokens(input_, group, axis)\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        return _drop_tokens(grad_output, ctx.group, ctx.axis), None\n\n\nclass _DropTokens(PyLayer):\n    \"Divide tokens equally among the tensor parallel ranks\"\n\n    @staticmethod\n    def forward(ctx, input_, group, axis):\n        ctx.group = group\n        ctx.axis = axis\n        return _drop_tokens(input_, axis)\n\n    @staticmethod\n    def backward(ctx, grad_output):\n        return _gather_tokens(grad_output, ctx.group, ctx.axis), None\n\n\ndef gather_tokens(input_, group=None, axis=0):\n    if group is None or group.nranks == 1:\n        # no tensor parallelism for non-experts\n        return input_\n    return _GatherTokens.apply(input_, group, axis)\n\n\ndef drop_tokens(input_, group=None, axis=0):\n    if group is None or group.nranks == 1:\n        # no tensor parallelism for non-experts\n        return input_\n    return _DropTokens.apply(input_, group, axis)\n"
  },
  {
    "path": "ppfleetx/models/language_model/moe_exp/sharded_moe.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# The file has been adapted from a deepspeed file:\n# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/sharded_moe.py\n# Git commit hash: a091bc223c01e94448f443456a6c15684644b966\n# We retain the following license from the original files:\n#   Copyright (c) The Microsoft DeepSpeed Team. All rights reserved.\n# \n\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nfrom typing import Callable, Dict, Tuple, Optional, Any\nfrom paddle.distribution import Uniform, Gumbel\nimport paddle.nn.functional as F\nfrom paddle import Tensor\nimport paddle.nn as nn\nimport paddle.distributed as dist\nfrom paddle.autograd import PyLayer\nimport paddle.distributed.fleet as fleet\n\nfrom .mappings import drop_tokens, gather_tokens\n\nuniform_map: Dict[str, Callable] = {}\ngumbel_map: Dict[str, Callable] = {}\nexp_selection_uniform_map: Dict[str, Callable] = {}\n\n\ndef multiplicative_jitter(x, epsilon=1e-2):\n    if epsilon == 0:\n        return x\n    device = paddle.get_device()\n    uniform = uniform_map.get(device)\n    if uniform is None:\n        uniform = Uniform(\n            low=paddle.to_tensor(1.0 - epsilon),\n            high=paddle.to_tensor(1.0 + epsilon)).rsample  # type: ignore\n        uniform_map[device] = uniform\n    return x * uniform(x.shape)\n\n\ndef gumbel_rsample(shape):\n    device = paddle.get_device()\n    gumbel = gumbel_map.get(device)\n 
   if gumbel is None:\n        one = paddle.to_tensor(1.0)\n        zero = paddle.to_tensor(0.0)\n        gumbel = Gumbel(zero, one).rsample  # type: ignore\n        gumbel_map[device] = gumbel\n    return gumbel(shape)\n\n\n# einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity\n# See https://arxiv.org/pdf/2006.16668.pdf for details.\n\nclass _AllToAll(PyLayer):\n    @staticmethod\n    def forward(ctx: Any, group: dist.collective.Group,\n                input: Tensor) -> Tensor:  # type: ignore\n        ctx.group = group\n        output = paddle.empty_like(input)\n        dist.alltoall_single(input, output, group=group)\n        return output\n\n    @staticmethod\n    def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor]:\n        return (None, _AllToAll.apply(ctx.group, *grad_output))\n\n\n# einsum rewrites are on par or more performant\n# switch can be bubbled up in future\nUSE_EINSUM = True\n\n\n# einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity\n# See https://arxiv.org/pdf/2006.16668.pdf for details.\ndef einsum(rule, a, b):\n    if USE_EINSUM:\n        return paddle.einsum(rule, a, b)\n    elif rule == 's,se->se':\n        return a.reshape((a.shape[0], -1)) * b\n    elif rule == 'se,sc->sec':\n        return a.unsqueeze(2) * b.unsqueeze(1)\n    elif rule == 'se,se->s':\n        return paddle.bmm(paddle.unsqueeze(a, 1),\n                          paddle.unsqueeze(b, 2)).reshape((-1))\n    elif rule == 'sec,sm->ecm':\n        s = a.shape[0]\n        e = a.shape[1]\n        c = a.shape[2]\n        m = b.shape[1]\n        return paddle.matmul(a.reshape((s, -1)).t(), b).reshape((e, c, m))\n    elif rule == 'sec,ecm->sm':\n        return paddle.matmul(\n            a.reshape((a.shape[0], -1)), b.reshape((-1, b.shape[-1])))\n    elif rule == 'ks,ksm->sm':\n        k = b.shape[0]\n        s = b.shape[1]\n        m = b.shape[2]\n        # [k, s] -> [s, k] -> [s, 1, k]\n        a = a.t().unsqueeze(1)\n        # 
[k,s,m] -> [k, sm] -> [sm, k] -> [s, m, k]\n        b = b.reshape((k, -1)).t().reshape((s, m, k))\n        # bmm([s, 1, k], [s, m, k]^t) -> [s, m, 1]\n        return paddle.bmm(a, b.transpose(1, 2)).squeeze(2)\n    else:\n        return paddle.einsum(rule, a, b)\n\ndef _capacity(gates, capacity_factor, min_capacity):\n    # gates has shape of SE\n    num_tokens = gates.shape[0]\n    num_experts = gates.shape[1]\n    capacity = paddle.ceil(\n        (num_tokens / num_experts) * capacity_factor).astype(paddle.int64)\n    if capacity < min_capacity:\n        capacity = min_capacity.astype(paddle.int64)\n    return capacity\n\n\ndef _top_idx(source, k):\n    return paddle.topk(source, k=k, axis=0)[1]\n\n\ndef top1gating(logits,\n               capacity_factor: float,\n               min_capacity: int,\n               used_token: Tensor=None,\n               noisy_gate_policy: Optional[str]=None,\n               drop_tokens: bool=True,\n               use_rts: bool=True) -> Tuple[Tensor, Tensor, Tensor, Tensor]:\n    \"\"\"Implements Top1Gating on logits.\"\"\"\n    if noisy_gate_policy == 'RSample':\n        logits_w_noise = logits + \\\n            gumbel_rsample(logits.shape, device=logits.device)\n    # everything is in fp32 in this function\n    gates = F.softmax(logits, axis=1)\n\n    capacity = _capacity(gates,\n                         paddle.to_tensor(capacity_factor),\n                         paddle.to_tensor(min_capacity))\n\n    # Create a mask for 1st's expert per token\n    # noisy gating\n    indices1_s = paddle.argmax(\n        logits_w_noise if noisy_gate_policy == 'RSample' else gates, axis=1)\n    num_experts = int(gates.shape[1])\n\n    assert(0 <= indices1_s.min() and indices1_s.max() < num_experts)\n    mask1 = F.one_hot(indices1_s, num_classes=num_experts)\n\n    # mask only used tokens\n    if used_token is not None:\n        mask1 = einsum(\"s,se->se\", used_token, mask1)\n\n    # gating decisions\n    exp_counts = paddle.sum(mask1, 
axis=0).detach()\n\n    # if we don't want to drop any tokens\n    if not drop_tokens:\n        new_capacity = paddle.max(exp_counts)\n        # dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX,\n        #                 group=dist.get_world_group())\n        # capacity = new_capacity\n        group = dist.collective._get_default_group()\n        task = group.process_group.all_reduce(new_capacity, dist.ReduceOp.MAX)\n        task.wait()\n\n    # Compute l_aux\n    me = paddle.mean(gates, axis=0)\n    ce = paddle.mean(mask1.astype(\"float32\"), axis=0)\n    l_aux = paddle.sum(me * ce) * num_experts\n\n    # Random Token Selection\n    if use_rts:\n        device = paddle.get_device()\n        uniform = exp_selection_uniform_map.get(device)\n        if uniform is None:\n            uniform = Uniform(\n                low=paddle.to_tensor(0.0), high=paddle.to_tensor(1.0)).rsample\n            exp_selection_uniform_map[device] = uniform\n\n        mask1_rand = mask1 * uniform(mask1.shape)\n    else:\n        mask1_rand = mask1\n\n    assert logits.shape[\n        0] >= min_capacity, \"No. of tokens (batch-size) should be greater than min_capacity. 
Either set min_capacity to 0 or increase your batch size.\"\n\n    top_idx = _top_idx(mask1_rand, capacity)\n    new_mask1 = paddle.zeros_like(mask1).put_along_axis_(\n        indices=top_idx, values=1., axis=0)\n    mask1 *= new_mask1\n\n    # Compute locations in capacity buffer\n\n    with paddle.amp.auto_cast(False, level='O2'):\n        locations1 = paddle.cumsum(mask1.astype(paddle.float32), axis=0) - 1\n        # Store the capacity location for each token\n        locations1_s = paddle.sum(locations1 * mask1.astype(paddle.float32), axis=1)\n\n    # Normalize gate probabilities\n    mask1_float = mask1.astype(\"float32\")\n    gates = gates * mask1_float\n\n    assert(0 <= locations1_s.astype(paddle.int32).min() and locations1_s.astype(paddle.int32).max() < capacity)\n    locations1_sc = F.one_hot(locations1_s.astype(paddle.int32),\n                              capacity).astype(paddle.float32)\n\n    combine_weights = einsum(\"se,sc->sec\", gates, locations1_sc)\n\n    dispatch_mask = combine_weights.astype(\"bool\")\n\n    return l_aux, combine_weights, dispatch_mask, exp_counts\n\n\ndef top2gating(logits: Tensor, capacity_factor: float,\n               min_capacity: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:\n    \"\"\"Implements Top2Gating on logits.\"\"\"\n    # everything is in fp32 in this function\n    gates = F.softmax(logits, axis=1)\n\n    capacity = _capacity(gates,\n                         paddle.to_tensor(capacity_factor * 2),\n                         paddle.to_tensor(min_capacity))\n\n    # Create a mask for 1st's expert per token\n    indices1_s = paddle.argmax(gates, axis=1)\n    num_experts = int(gates.shape[1])\n    mask1 = F.one_hot(indices1_s, num_classes=num_experts)\n\n    # Create a mask for 2nd's expert per token using Gumbel-max trick\n    # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/\n    logits_w_noise = logits + gumbel_rsample(logits.shape)\n    # Replace top-expert with min value\n    # logits_except1 
= logits_w_noise.masked_fill(mask1.astype(\"bool\"), float(\"-inf\"))\n    logits_except1 = paddle.where(\n        mask1.astype(\"bool\"),\n        paddle.ones(logits_w_noise.shape) * float(\"-inf\"), logits_w_noise)\n    indices2_s = paddle.argmax(logits_except1, axis=1)\n    mask2 = F.one_hot(indices2_s, num_classes=num_experts)\n\n    # Compute locations in capacity buffer\n    locations1 = paddle.cumsum(mask1, axis=0) - 1\n    locations2 = paddle.cumsum(mask2, axis=0) - 1\n    # Update 2nd's location by accounting for locations of 1st\n    locations2 += paddle.sum(mask1, axis=0, keepdim=True)\n\n    # gating decisions\n    exp_counts = paddle.sum(mask1, axis=0).detach()\n\n    # Compute l_aux\n    me = paddle.mean(gates, axis=0)\n    ce = paddle.mean(mask1.astype(\"float32\"), axis=0)\n    l_aux = paddle.mean(me * ce) * num_experts * num_experts\n\n    # Remove locations outside capacity from mask\n    mask1 *= paddle.less_than(locations1, capacity)\n    mask2 *= paddle.less_than(locations2, capacity)\n\n    # Store the capacity location for each token\n    locations1_s = paddle.sum(locations1 * mask1, axis=1)\n    locations2_s = paddle.sum(locations2 * mask2, axis=1)\n\n    # Normalize gate probabilities\n    mask1_float = mask1.astype(\"float32\")\n    mask2_float = mask2.astype(\"float32\")\n    gates1_s = einsum(\"se,se->s\", gates, mask1_float)\n    gates2_s = einsum(\"se,se->s\", gates, mask2_float)\n    denom_s = gates1_s + gates2_s\n    # Avoid divide-by-zero\n    # HACK: paddle currently does not support finfo, use constant instead\n    min_constant = 1.1920928955078125e-07\n    denom_s = paddle.clip(denom_s, min=min_constant)\n    gates1_s /= denom_s\n    gates2_s /= denom_s\n\n    # Calculate combine_weights and dispatch_mask\n    gates1 = einsum(\"s,se->se\", gates1_s, mask1_float)\n    gates2 = einsum(\"s,se->se\", gates2_s, mask2_float)\n    locations1_sc = F.one_hot(locations1_s, capacity)\n    locations2_sc = F.one_hot(locations2_s, capacity)\n  
  combine1_sec = einsum(\"se,sc->sec\", gates1, locations1_sc)\n    combine2_sec = einsum(\"se,sc->sec\", gates2, locations2_sc)\n    combine_weights = combine1_sec + combine2_sec\n    dispatch_mask = combine_weights.astype(\"bool\")\n\n    return l_aux, combine_weights, dispatch_mask, exp_counts\n\n\nclass TopKGate(nn.Layer):\n    \"\"\"Gate module which implements Top2Gating as described in Gshard_.\n    ::\n\n        gate = TopKGate(model_dim, num_experts)\n        l_aux, combine_weights, dispatch_mask = gate(input)\n\n    .. Gshard_: https://arxiv.org/pdf/2006.16668.pdf\n\n    Args:\n        model_dim (int):\n            size of model embedding dimension\n        num_experts (ints):\n            number of experts in model\n    \"\"\"\n\n    wg: nn.Linear\n\n    def __init__(self,\n                 model_dim: int,\n                 num_experts: int,\n                 k: int=1,\n                 capacity_factor: float=1.0,\n                 eval_capacity_factor: float=1.0,\n                 min_capacity: int=8,\n                 noisy_gate_policy: Optional[str]=None,\n                 drop_tokens: bool=True,\n                 use_rts: bool=True) -> None:\n        super().__init__()\n\n        # Only top-1 and top-2 are supported at the moment.\n        if k != 1 and k != 2:\n            raise ValueError('Only top-1 and top-2 gatings are supported.')\n        self.wg = nn.Linear(model_dim, num_experts).to(dtype=paddle.float32)\n        self.k = k\n        self.capacity_factor = capacity_factor\n        self.eval_capacity_factor = eval_capacity_factor\n        self.min_capacity = min_capacity\n        self.noisy_gate_policy = noisy_gate_policy\n        # self.timers = SynchronizedWallClockTimer()\n        self.wall_clock_breakdown = False\n        self.gate_time = 0.0\n        self.drop_tokens = drop_tokens\n        self.use_rts = use_rts\n\n    def forward(self, input: paddle.Tensor, used_token: paddle.Tensor=None\n                ) -> Tuple[Tensor, Tensor, 
Tensor]:  # type: ignore\n\n        # if self.wall_clock_breakdown:\n        #     self.timers('TopKGate').start()\n\n        if self.wg.weight.dtype != paddle.float32:\n            self.wg = self.wg.to(dtype=paddle.float32)\n        input_fp32 = input.astype(\"float32\")\n        # input jittering\n        if self.noisy_gate_policy == 'Jitter' and self.training:\n            input_fp32 = multiplicative_jitter(input_fp32)\n        logits = self.wg(input_fp32)\n\n        if self.k == 1:\n            gate_output = top1gating(\n                logits, self.capacity_factor\n                if self.training else self.eval_capacity_factor,\n                self.min_capacity, used_token, self.noisy_gate_policy\n                if self.training else None, self.drop_tokens, self.use_rts)\n\n        else:\n            gate_output = top2gating(logits, self.capacity_factor\n                                     if self.training else\n                                     self.eval_capacity_factor,\n                                     self.min_capacity)\n\n        # if self.wall_clock_breakdown:\n        #     self.timers('TopKGate').stop()\n        #     self.gate_time = self.timers('TopKGate').elapsed(reset=False)\n\n        return gate_output\n\n\nclass MOELayer(nn.Layer):\n\n    def __init__(self,\n                 gate: nn.Layer,\n                 experts: nn.Layer,\n                 ep_group_name,\n                 ep_size,\n                 num_local_experts: int) -> None:\n        super().__init__()\n        self.gate = gate\n        self.experts = experts\n        self.ep_group = None\n        self.ep_size = ep_size\n        self.ep_group_name = ep_group_name\n        self.num_local_experts = num_local_experts\n        self.time_falltoall = 0.0\n        self.time_salltoall = 0.0\n        self.time_moe = 0.0\n        # self.timers = SynchronizedWallClockTimer()\n        self.wall_clock_breakdown = False\n        #HACK need fix\n        # self.hcg = 
fleet.get_hybrid_communicate_group()\n        self.hcg = None\n\n    def _set_ep_group(self, ep_group):\n        self.ep_group = ep_group\n\n    def get_loss(self):\n        return self.l_aux\n\n    def forward(self, *input: Tensor, **kwargs: Any) -> Tensor:\n\n        # if self.wall_clock_breakdown:\n        #     self.timers('moe').start()\n\n        # Implement Algorithm 2 from GShard paper.\n        d_model = input[0].shape[-1]\n\n        # Initial implementation -> Reshape into S tokens by dropping sequence dimension.\n        # Reshape into G groups so that each group can distribute tokens equally\n        # group_size = kwargs['group_size'] if 'group_size' in kwargs.keys() else 1\n        reshaped_input = input[0].reshape((-1, d_model))\n\n        self.l_aux, combine_weights, dispatch_mask, self.exp_counts = self.gate(\n            reshaped_input, input[1])\n        dispatched_input = einsum(\"sec,sm->ecm\",\n                                  dispatch_mask.astype(input[0].dtype),\n                                  reshaped_input)\n\n        # if self.wall_clock_breakdown:\n        #     self.timers('falltoall').start()\n\n        # HACK: _get_expert_model_parallel_world_size is needed here\n        if False and self.hcg.get_model_parallel_group().nranks == 1:\n            # If the non-expert is tensor-parallel, it will create\n            # duplicate tokens on the tensor-parallel ranks.\n            # Since our experts are not tensor-parallel, these duplicates\n            # need to be dropped to ensure correctness.\n            # this also doubles up as a communication optimization as we are\n            # reducing the all-to-all communication volume.\n            dispatched_input = drop_tokens(dispatched_input, axis=1)\n\n        # HACK disable AllToAll\n        # dispatched_input = _AllToAll.apply(self.ep_group, dispatched_input)\n\n        # if self.wall_clock_breakdown:\n        #     self.timers('falltoall').stop()\n        #     self.time_falltoall = 
self.timers('falltoall').elapsed(reset=False)\n\n        # Re-shape after all-to-all: ecm -> gecm\n        dispatched_input = dispatched_input.reshape(\n            (self.ep_size, self.num_local_experts, -1, d_model))\n\n        expert_output = self.experts(dispatched_input)\n\n        # if self.wall_clock_breakdown:\n        #     self.timers('salltoall').start()\n\n        # HACK disable AllToAll\n        # expert_output = _AllToAll.apply(self.ep_group, expert_output)\n\n        # if self.wall_clock_breakdown:\n        #     self.timers('salltoall').stop()\n        #     self.time_salltoall = self.timers('salltoall').elapsed(reset=False)\n\n        # Re-shape back: gecm -> ecm\n        expert_output = expert_output.reshape(\n            (self.ep_size * self.num_local_experts, -1, d_model))\n\n        # HACK: _get_expert_model_parallel_world_size is needed here\n        if False and self.hcg.get_model_parallel_group().nranks == 1:\n            # the dropped duplicate tokens need to be gathered on each\n            # tensor parallel rank again for the tensor-parallel\n            # non-expert of the next layer.\n            expert_output = gather_tokens(expert_output, axis=1)\n\n        combined_output = einsum(\"sec,ecm->sm\",\n                                 combine_weights.astype(input[0].dtype),\n                                 expert_output)\n\n        a = combined_output.reshape((input[0].shape))\n\n        # if self.wall_clock_breakdown:\n        #     self.timers('moe').stop()\n        #     self.time_moe = self.timers('moe').elapsed(reset=False)\n\n        return a\n"
  },
  {
    "path": "ppfleetx/models/language_model/t5/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .modeling import (finfo, ACT2FN, ModelOutput, get_t5_model,\n                       t5_encode_text, get_encoded_dim)\nfrom .utils import normal_, constant_init\n"
  },
  {
    "path": "ppfleetx/models/language_model/t5/modeling.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nimport copy\nimport json\nimport numpy as np\nfrom collections import OrderedDict\nfrom typing import Callable, List, Optional, Set, Tuple, Union, Any\n\nimport paddle\nfrom paddle import nn\n\nfrom ppfleetx.data.tokenizers.t5_tokenizer import (\n    t5_tokenize, get_t5_tokenizer, DEFAULT_T5_NAME)\nfrom ppfleetx.models.multimodal_model.imagen.utils import rearrange, exists, default\n\n\ndef finfo(dtype):\n    if dtype == paddle.float32:\n        return np.finfo(np.float32)\n    if dtype == paddle.float16:\n        return np.finfo(np.float16)\n    if dtype == paddle.float64:\n        return np.finfo(np.float64)\n\n\ndef fields(class_or_instance):\n    \"\"\"Return a tuple describing the fields of this dataclass.\n\n    Accepts a dataclass or an instance of one. Tuple elements are of\n    type Field.\n    \"\"\"\n\n    # Might it be worth caching this, per class?\n    try:\n        fields = getattr(class_or_instance, _FIELDS)\n    except AttributeError:\n        raise TypeError('must be called with a dataclass type or instance')\n\n    # Exclude pseudo-fields.  
Note that fields is sorted by insertion\n    # order, so the order of the tuple is as the fields were defined.\n    return tuple(f for f in fields.values() if f._field_type is _FIELD)\n\n\ndef is_tensor(x):\n    return isinstance(x, paddle.Tensor)\n\n\nclass ModelOutput(OrderedDict):\n    \"\"\"\n    Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a\n    tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular\n    python dictionary.\n\n    <Tip warning={true}>\n\n    You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple\n    before.\n\n    </Tip>\n    \"\"\"\n\n    def __post_init__(self):\n        class_fields = fields(self)\n\n        # Safety and consistency checks\n        if not len(class_fields):\n            raise ValueError(f\"{self.__class__.__name__} has no fields.\")\n        if not all(field.default is None for field in class_fields[1:]):\n            raise ValueError(\n                f\"{self.__class__.__name__} should not have more than one required field.\"\n            )\n\n        first_field = getattr(self, class_fields[0].name)\n        other_fields_are_none = all(\n            getattr(self, field.name) is None for field in class_fields[1:])\n\n        if other_fields_are_none and not is_tensor(first_field):\n            if isinstance(first_field, dict):\n                iterator = first_field.items()\n                first_field_iterator = True\n            else:\n                try:\n                    iterator = iter(first_field)\n                    first_field_iterator = True\n                except TypeError:\n                    first_field_iterator = False\n\n            # if we provided an iterator as first field and the iterator is a (key, value) iterator\n            # set the associated fields\n            if first_field_iterator:\n                for 
element in iterator:\n                    if (not isinstance(element, (list, tuple)) or\n                            not len(element) == 2 or\n                            not isinstance(element[0], str)):\n                        break\n                    setattr(self, element[0], element[1])\n                    if element[1] is not None:\n                        self[element[0]] = element[1]\n            elif first_field is not None:\n                self[class_fields[0].name] = first_field\n        else:\n            for field in class_fields:\n                v = getattr(self, field.name)\n                if v is not None:\n                    self[field.name] = v\n\n    def __delitem__(self, *args, **kwargs):\n        raise Exception(\n            f\"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.\"\n        )\n\n    def setdefault(self, *args, **kwargs):\n        raise Exception(\n            f\"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.\"\n        )\n\n    def pop(self, *args, **kwargs):\n        raise Exception(\n            f\"You cannot use ``pop`` on a {self.__class__.__name__} instance.\")\n\n    def update(self, *args, **kwargs):\n        raise Exception(\n            f\"You cannot use ``update`` on a {self.__class__.__name__} instance.\"\n        )\n\n    def __getitem__(self, k):\n        if isinstance(k, str):\n            inner_dict = {k: v for (k, v) in self.items()}\n            return inner_dict[k]\n        else:\n            return self.to_tuple()[k]\n\n    def __setattr__(self, name, value):\n        if name in self.keys() and value is not None:\n            # Don't call self.__setitem__ to avoid recursion errors\n            super().__setitem__(name, value)\n        super().__setattr__(name, value)\n\n    def __setitem__(self, key, value):\n        # Will raise a KeyException if needed\n        super().__setitem__(key, value)\n        # Don't call self.__setattr__ to avoid recursion 
errors\n        super().__setattr__(key, value)\n\n    def to_tuple(self) -> Tuple[Any]:\n        \"\"\"\n        Convert self to a tuple containing all the attributes/keys that are not `None`.\n        \"\"\"\n        return tuple(self[k] for k in self.keys())\n\n\nclass NewGELUActivation(nn.Layer):\n    \"\"\"\n    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see\n    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415\n    \"\"\"\n\n    def forward(self, input):\n        return 0.5 * input * (1.0 + paddle.tanh(\n            math.sqrt(2.0 / math.pi) *\n            (input + 0.044715 * paddle.pow(input, 3.0))))\n\n\nclass GELUActivation(nn.Layer):\n    \"\"\"\n    Original Implementation of the GELU activation function in Google BERT repo when initially created. For\n    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +\n    paddle.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * paddle.pow(x, 3)))) This is now written in C in nn.functional\n    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415\n    \"\"\"\n\n    def __init__(self, use_gelu_python: bool=False):\n        super().__init__()\n        self.act = nn.functional.gelu\n\n    def _gelu_python(self, input):\n        return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0)))\n\n    def forward(self, input):\n        return self.act(input)\n\n\nclass FastGELUActivation(nn.Layer):\n    \"\"\"\n    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs\n    \"\"\"\n\n    def forward(self, input):\n        return 0.5 * input * (\n            1.0 + paddle.tanh(input * 0.7978845608 *\n                              (1.0 + 0.044715 * input * input)))\n\n\nclass QuickGELUActivation(nn.Layer):\n    \"\"\"\n    Applies GELU approximation that is fast but somewhat inaccurate. 
See: https://github.com/hendrycks/GELUs\n    \"\"\"\n\n    def forward(self, input):\n        return input * paddle.nn.functional.sigmoid(1.702 * input)\n\n\nclass ClippedGELUActivation(nn.Layer):\n    \"\"\"\n    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as\n    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to\n    https://arxiv.org/abs/2004.09602.\n\n    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when\n    initially created.\n\n    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +\n    paddle.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * paddle.pow(x, 3)))). See https://arxiv.org/abs/1606.08415\n    \"\"\"\n\n    def __init__(self, min: float, max: float):\n        if min > max:\n            raise ValueError(\n                f\"min should be < max (got min: {min}, max: {max})\")\n\n        super().__init__()\n        self.min = min\n        self.max = max\n\n    def forward(self, x):\n        return paddle.clip(gelu(x), self.min, self.max)\n\n\nclass SiLUActivation(nn.Layer):\n    \"\"\"\n    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear\n    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function\n    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated\n    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with\n    later.\n    \"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self.act = nn.functional.silu\n\n    def _silu_python(self, input):\n        return input * nn.functional.sigmoid(input)\n\n    def forward(self, input):\n        return 
self.act(input)\n\n\nclass MishActivation(nn.Layer):\n    \"\"\"\n    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also\n    visit the official repository for the paper: https://github.com/digantamisra98/Mish\n    \"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self.act = nn.functional.mish\n\n    def _mish_python(self, input):\n        return input * paddle.tanh(nn.functional.softplus(input))\n\n    def forward(self, input):\n        return self.act(input)\n\n\nclass LinearActivation(nn.Layer):\n    \"\"\"\n    Applies the linear activation function, i.e. forwarding input directly to output.\n    \"\"\"\n\n    def forward(self, input):\n        return input\n\n\nACT2FN = {\n    \"gelu\": GELUActivation(),\n    \"gelu_10\": ClippedGELUActivation(-10, 10),\n    \"gelu_fast\": FastGELUActivation(),\n    \"gelu_new\": NewGELUActivation(),\n    \"gelu_python\": GELUActivation(use_gelu_python=True),\n    \"linear\": LinearActivation(),\n    \"mish\": MishActivation(),\n    \"quick_gelu\": QuickGELUActivation(),\n    \"relu\": nn.ReLU(),\n    \"sigmoid\": nn.Sigmoid(),\n    \"silu\": SiLUActivation(),\n    \"swish\": SiLUActivation(),\n    \"tanh\": nn.Tanh(),\n}\n\n\ndef get_activation(activation_string):\n    if activation_string in ACT2FN:\n        return ACT2FN[activation_string]\n    else:\n        raise KeyError(\n            f\"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}\"\n        )\n\n\n# For backwards compatibility with: from activations import gelu_python\ngelu_python = get_activation(\"gelu_python\")\ngelu_new = get_activation(\"gelu_new\")\ngelu = get_activation(\"gelu\")\ngelu_fast = get_activation(\"gelu_fast\")\nquick_gelu = get_activation(\"quick_gelu\")\nsilu = get_activation(\"silu\")\nmish = get_activation(\"mish\")\nlinear_act = get_activation(\"linear\")\n\n\ndef prune_linear_layer(layer: nn.Linear, index: paddle.int64,\n           
            dim: int=0) -> nn.Linear:\n    \"\"\"\n    Prune a linear layer to keep only entries in index.\n\n    Used to remove heads.\n\n    Args:\n        layer (`paddle.nn.Linear`): The layer to prune.\n        index (`paddle.int64`): The indices to keep in the layer.\n        dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices.\n\n    Returns:\n        `paddle.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`.\n    \"\"\"\n    W = layer.weight.index_select(dim, index).clone().detach()\n    if layer.bias is not None:\n        if dim == 1:\n            b = layer.bias.clone().detach()\n        else:\n            b = layer.bias[index].clone().detach()\n    new_size = list(layer.weight.size())\n    new_size[dim] = len(index)\n    new_layer = nn.Linear(\n        new_size[1], new_size[0], bias_attr=layer.bias is not None)\n    new_layer.weight.requires_grad = False\n    new_layer.weight.copy_(W)\n    new_layer.weight.stop_gradient = False\n    if layer.bias is not None:\n        new_layer.bias.stop_gradient = True\n        new_layer.bias.copy_(b)\n        new_layer.bias.stop_gradient = False\n    return new_layer\n\n\ndef find_pruneable_heads_and_indices(heads,\n                                     n_heads: int,\n                                     head_size: int,\n                                     already_pruned_heads):\n    \"\"\"\n    Finds the heads and their indices taking `already_pruned_heads` into account.\n\n    Args:\n        heads : List of the indices of heads to prune.\n        n_heads : The number of heads in the model.\n        head_size : The size of each head.\n        already_pruned_heads : A set of already pruned heads.\n\n    Returns:\n        A tuple with the remaining heads and their corresponding indices.\n    \"\"\"\n    mask = paddle.ones(n_heads, head_size)\n    heads = set(\n        heads\n    ) - already_pruned_heads  # Convert to set and remove already pruned heads\n    for head in 
heads:\n        # Compute how many pruned heads are before the head and move the index accordingly\n        head = head - sum(1 if h < head else 0 for h in already_pruned_heads)\n        mask[head] = 0\n    mask = mask.reshape(-1).equal(1)\n    index = paddle.arange(len(mask))[mask].cast(paddle.int64)\n    return heads, index\n\n\nclass BaseModelOutputWithPastAndCrossAttentions(ModelOutput):\n    \"\"\"\n    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).\n\n    Args:\n        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):\n            Sequence of hidden-states at the output of the last layer of the model.\n\n            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,\n            hidden_size)` is output.\n        past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):\n            Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape\n            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if\n            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,\n            encoder_sequence_length, embed_size_per_head)`.\n\n            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if\n            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`\n            input) to speed up sequential decoding.\n        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):\n            Tuple of (one for the output of the embeddings, if the model has an embedding layer, +\n            one for the output of each layer) of shape `(batch_size, 
sequence_length, hidden_size)`.\n\n            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n            heads.\n        cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):\n            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n            sequence_length)`.\n\n            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the\n            weighted average in the cross-attention heads.\n    \"\"\"\n\n    last_hidden_state = None\n    past_key_values = None\n    hidden_states = None\n    attentions = None\n    cross_attentions = None\n\n\nclass T5Config(object):\n    def __init__(self, **kwargs):\n\n        # Fine-tuning task arguments\n        self.architectures = kwargs.pop(\"architectures\", None)\n        self.use_return_dict = kwargs.pop(\"return_dict\", True)\n        self.d_ff = kwargs.pop(\"d_ff\", None)\n        self.d_kv = kwargs.pop(\"d_kv\", None)\n        self.d_model = kwargs.pop(\"d_model\", None)\n        self.decoder_start_token_id = kwargs.pop(\"decoder_start_token_id\",\n                                                 None)\n        self.dense_act_fn = kwargs.pop(\"dense_act_fn\", 'gelu_new')\n        self.eos_token_id = kwargs.pop(\"eos_token_id\", None)\n        self.feed_forward_proj = kwargs.pop(\"feed_forward_proj\", None)\n        
self.initializer_factor = kwargs.pop(\"initializer_factor\", None)\n        self.is_decoder = kwargs.pop(\"is_decoder\", False)\n        self.is_encoder_decoder = kwargs.pop(\"is_encoder_decoder\", False)\n        self.is_gated_act = kwargs.pop(\"is_gated_act\", True)\n        self.layer_norm_epsilon = kwargs.pop(\"layer_norm_epsilon\", None)\n        self.model_type = kwargs.pop(\"model_type\", None)\n        self.num_decoder_layers = kwargs.pop(\"num_decoder_layers\", None)\n        self.num_heads = kwargs.pop(\"num_heads\", None)\n        self.num_layers = kwargs.pop(\"num_layers\", None)\n        self.output_past = kwargs.pop(\"output_past\", True)\n        self.pad_token_id = kwargs.pop(\"pad_token_id\", None)\n        self.relative_attention_max_distance = kwargs.pop(\n            \"relative_attention_max_distance\", 128)\n        self.relative_attention_num_buckets = kwargs.pop(\n            \"relative_attention_num_buckets\", None)\n        self.tie_word_embeddings = kwargs.pop(\"tie_word_embeddings\", False)\n        self.transformers_version = kwargs.pop(\"transformers_version\", None)\n        self.use_cache = kwargs.pop(\"use_cache\", False)\n        self.vocab_size = kwargs.pop(\"vocab_size\", None)\n        self.model_type = kwargs.pop(\"model_type\", None)\n        self.dropout_rate = kwargs.pop(\"dropout_rate\", None)\n        self.output_attentions = kwargs.pop(\"output_attentions\", False)\n        self.output_hidden_states = kwargs.pop(\"output_hidden_states\", False)\n\n\nclass T5LayerNorm(nn.Layer):\n    def __init__(self, hidden_size, eps=1e-6):\n        super().__init__()\n        \"\"\"\n        Construct a layernorm module in the T5 style. 
No bias and no subtraction of mean.\n        \"\"\"\n        super().__init__()\n        self.weight = self.create_parameter(\n            [hidden_size],\n            default_initializer=nn.initializer.Constant(value=1.))\n        self.variance_epsilon = eps\n\n    def forward(self, hidden_states):\n\n        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean\n        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated\n        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for\n        # half-precision inputs is done in fp32\n\n        variance = hidden_states.cast(paddle.float32).pow(2).mean(\n            -1, keepdim=True)\n        hidden_states = hidden_states * paddle.rsqrt(variance +\n                                                     self.variance_epsilon)\n\n        # convert into half-precision if necessary\n        if self.weight.dtype in [paddle.float16, paddle.bfloat16]:\n            hidden_states = hidden_states.cast(self.weight.dtype)\n\n        return self.weight * hidden_states\n\n\nclass T5DenseActDense(nn.Layer):\n    def __init__(self, d_model, d_ff, dropout_rate, dense_act_fn):\n        super().__init__()\n        self.wi = nn.Linear(d_model, d_ff, bias_attr=False)\n        self.wo = nn.Linear(d_ff, d_model, bias_attr=False)\n        self.dropout = nn.Dropout(dropout_rate)\n        self.act = ACT2FN[dense_act_fn]\n\n    def forward(self, hidden_states):\n        hidden_states = self.wi(hidden_states)\n        hidden_states = self.act(hidden_states)\n        hidden_states = self.dropout(hidden_states)\n        hidden_states = self.wo(hidden_states)\n        return hidden_states\n\n\nclass T5DenseGatedActDense(nn.Layer):\n    def __init__(self, d_model, d_ff, dropout_rate, dense_act_fn):\n        super().__init__()\n        self.wi_0 = nn.Linear(d_model, d_ff, bias_attr=False)\n        self.wi_1 = nn.Linear(d_model, d_ff, 
bias_attr=False)\n        self.wo = nn.Linear(d_ff, d_model, bias_attr=False)\n        self.dropout = nn.Dropout(dropout_rate)\n        self.act = ACT2FN[dense_act_fn]\n\n    def forward(self, hidden_states):\n        hidden_gelu = self.act(self.wi_0(hidden_states))\n        hidden_linear = self.wi_1(hidden_states)\n        hidden_states = hidden_gelu * hidden_linear\n        hidden_states = self.dropout(hidden_states)\n        hidden_states = self.wo(hidden_states)\n        return hidden_states\n\n\nclass T5LayerFF(nn.Layer):\n    def __init__(self, d_model, d_ff, dropout_rate, layer_norm_epsilon,\n                 feed_forward_proj):\n        super().__init__()\n        if feed_forward_proj == \"gated-gelu\":\n            self.DenseReluDense = T5DenseGatedActDense(\n                d_model, d_ff, dropout_rate, dense_act_fn)\n        elif feed_forward_proj == \"relu\":\n            self.DenseReluDense = T5DenseActDense(d_model, d_ff, dropout_rate,\n                                                  feed_forward_proj)\n\n        self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)\n        self.dropout = nn.Dropout(dropout_rate)\n\n    def forward(self, hidden_states):\n        forwarded_states = self.layer_norm(hidden_states)\n        forwarded_states = self.DenseReluDense(forwarded_states)\n        hidden_states = hidden_states + self.dropout(forwarded_states)\n        return hidden_states\n\n\nclass T5Attention(nn.Layer):\n    def __init__(self,\n                 is_decoder,\n                 relative_attention_num_buckets,\n                 d_model,\n                 d_kv,\n                 num_heads,\n                 dropout_rate,\n                 has_relative_attention_bias=False):\n        super().__init__()\n        self.is_decoder = is_decoder\n        self.has_relative_attention_bias = has_relative_attention_bias\n        self.relative_attention_num_buckets = relative_attention_num_buckets\n        self.d_model = d_model\n        
self.key_value_proj_dim = d_kv\n        self.n_heads = num_heads\n        self.dropout = dropout_rate\n        self.inner_dim = self.n_heads * self.key_value_proj_dim\n\n        # Mesh TensorFlow initialization to avoid scaling before softmax\n        self.q = nn.Linear(self.d_model, self.inner_dim, bias_attr=False)\n        self.k = nn.Linear(self.d_model, self.inner_dim, bias_attr=False)\n        self.v = nn.Linear(self.d_model, self.inner_dim, bias_attr=False)\n        self.o = nn.Linear(self.inner_dim, self.d_model, bias_attr=False)\n\n        if self.has_relative_attention_bias:\n            self.relative_attention_bias = nn.Embedding(\n                self.relative_attention_num_buckets, self.n_heads)\n        self.pruned_heads = set()\n        self.gradient_checkpointing = False\n\n    def prune_heads(self, heads):\n        if len(heads) == 0:\n            return\n        heads, index = find_pruneable_heads_and_indices(\n            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads)\n        # Prune linear layers\n        self.q = prune_linear_layer(self.q, index)\n        self.k = prune_linear_layer(self.k, index)\n        self.v = prune_linear_layer(self.v, index)\n        self.o = prune_linear_layer(self.o, index, dim=1)\n        # Update hyper params\n        self.n_heads = self.n_heads - len(heads)\n        self.inner_dim = self.key_value_proj_dim * self.n_heads\n        self.pruned_heads = self.pruned_heads.union(heads)\n\n    @staticmethod\n    def _relative_position_bucket(relative_position,\n                                  bidirectional=True,\n                                  num_buckets=32,\n                                  max_distance=128):\n        \"\"\"\n        Adapted from Mesh Tensorflow:\n        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593\n\n        Translate relative position to a bucket number for relative attention. 
The relative position is defined as\n        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to\n        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for\n        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative\n        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.\n        This should allow for more graceful generalization to longer sequences than the model has been trained on\n\n        Args:\n            relative_position: an int32 Tensor\n            bidirectional: a boolean - whether the attention is bidirectional\n            num_buckets: an integer\n            max_distance: an integer\n\n        Returns:\n            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)\n        \"\"\"\n        relative_buckets = 0\n        if bidirectional:\n            num_buckets //= 2\n            relative_buckets += (\n                relative_position > 0).cast(paddle.int64) * num_buckets\n            relative_position = paddle.abs(relative_position)\n        else:\n            relative_position = -paddle.min(\n                relative_position, paddle.zeros_like(relative_position))\n        # now relative_position is in the range [0, inf)\n\n        # half of the buckets are for exact increments in positions\n        max_exact = num_buckets // 2\n        is_small = relative_position < max_exact\n\n        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance\n        relative_position_if_large = max_exact + (\n            paddle.log(relative_position.cast('float32') /\n                       max_exact) / math.log(max_distance / max_exact) *\n            (num_buckets - max_exact)).cast(paddle.int64)\n        relative_position_if_large = 
paddle.minimum(\n            relative_position_if_large,\n            paddle.full_like(relative_position_if_large, num_buckets - 1))\n\n        relative_buckets += paddle.where(is_small, relative_position,\n                                         relative_position_if_large)\n        return relative_buckets\n\n    def compute_bias(self, query_length, key_length, device=None):\n        \"\"\"Compute binned relative position bias\"\"\"\n        context_position = paddle.arange(\n            query_length, dtype=paddle.int64)[:, None]\n        memory_position = paddle.arange(\n            key_length, dtype=paddle.int64)[None, :]\n        relative_position = memory_position - context_position  # shape (query_length, key_length)\n        relative_position_bucket = self._relative_position_bucket(\n            relative_position,  # shape (query_length, key_length)\n            bidirectional=(not self.is_decoder),\n            num_buckets=self.relative_attention_num_buckets, )\n        values = self.relative_attention_bias(\n            relative_position_bucket\n        )  # shape (query_length, key_length, num_heads)\n        values = values.transpose([2, 0, 1]).unsqueeze(\n            0)  # shape (1, num_heads, query_length, key_length)\n        return values\n\n    def forward(\n            self,\n            hidden_states,\n            mask=None,\n            key_value_states=None,\n            position_bias=None,\n            past_key_value=None,\n            layer_head_mask=None,\n            query_length=None,\n            use_cache=False,\n            output_attentions=False, ):\n        \"\"\"\n        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).\n        \"\"\"\n        # Input is (batch_size, seq_length, dim)\n        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)\n        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)\n        
batch_size, seq_length = hidden_states.shape[:2]\n\n        real_seq_length = seq_length\n\n        if past_key_value is not None:\n            assert (\n                len(past_key_value) == 2\n            ), f\"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states\"\n            real_seq_length += past_key_value[0].shape[\n                2] if query_length is None else query_length\n\n        key_length = real_seq_length if key_value_states is None else key_value_states.shape[\n            1]\n\n        def shape(states):\n            \"\"\"projection\"\"\"\n            return states.reshape(\n                [0, -1, self.n_heads, self.key_value_proj_dim]).transpose(\n                    [0, 2, 1, 3])\n\n        def unshape(states):\n            \"\"\"reshape\"\"\"\n            return states.transpose([0, 2, 1, 3]).reshape(\n                [batch_size, -1, self.inner_dim])\n\n        def project(hidden_states, proj_layer, key_value_states,\n                    past_key_value):\n            \"\"\"projects hidden states correctly to key/query states\"\"\"\n            if key_value_states is None:\n                # self-attn\n                # (batch_size, n_heads, seq_length, dim_per_head)\n                hidden_states = shape(proj_layer(hidden_states))\n            elif past_key_value is None:\n                # cross-attn\n                # (batch_size, n_heads, seq_length, dim_per_head)\n                hidden_states = shape(proj_layer(key_value_states))\n\n            if past_key_value is not None:\n                if key_value_states is None:\n                    # self-attn\n                    # (batch_size, n_heads, key_length, dim_per_head)\n                    hidden_states = paddle.concat(\n                        [past_key_value, hidden_states], axis=2)\n                else:\n                    # cross-attn\n                    hidden_states = past_key_value\n            return hidden_states\n\n        
# get query states\n        query_states = shape(self.q(\n            hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)\n\n        # get key/value states\n        key_states = project(hidden_states, self.k, key_value_states,\n                             past_key_value[0]\n                             if past_key_value is not None else None)\n        value_states = project(hidden_states, self.v, key_value_states,\n                               past_key_value[1]\n                               if past_key_value is not None else None)\n\n        # compute scores\n        scores = paddle.matmul(\n            query_states, key_states.transpose([0, 1, 3, 2])\n        )  # equivalent of paddle.einsum(\"bnqd,bnkd->bnqk\", query_states, key_states), compatible with onnx op>9\n\n        if position_bias is None:\n            if not self.has_relative_attention_bias:\n                position_bias = paddle.zeros(\n                    (1, self.n_heads, real_seq_length, key_length),\n                    dtype=scores.dtype)\n                if self.gradient_checkpointing and self.training:\n                    position_bias.requires_grad = True\n            else:\n                position_bias = self.compute_bias(real_seq_length, key_length)\n\n            # if key and values are already calculated\n            # we want only the last query position bias\n            if past_key_value is not None:\n                position_bias = position_bias[:, :, -hidden_states.size(1):, :]\n\n            if mask is not None:\n                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)\n\n        scores += position_bias\n        attn_weights = nn.functional.softmax(\n            scores.cast('float32'), axis=-1).astype(\n                scores.dtype)  # (batch_size, n_heads, seq_length, key_length)\n        attn_weights = nn.functional.dropout(\n            attn_weights, p=self.dropout, training=self.\n            training)  # 
(batch_size, n_heads, seq_length, key_length)\n\n        # Mask heads if we want to\n        if layer_head_mask is not None:\n            attn_weights = attn_weights * layer_head_mask\n\n        attn_output = unshape(paddle.matmul(\n            attn_weights, value_states))  # (batch_size, seq_length, dim)\n        attn_output = self.o(attn_output)\n\n        present_key_value_state = (key_states, value_states) if (\n            self.is_decoder and use_cache) else None\n        outputs = (attn_output, ) + (present_key_value_state, ) + (\n            position_bias, )\n\n        if output_attentions:\n            outputs = outputs + (attn_weights, )\n        return outputs\n\n\nclass T5LayerSelfAttention(nn.Layer):\n    def __init__(self,\n                 is_decoder,\n                 relative_attention_num_buckets,\n                 d_model,\n                 d_kv,\n                 num_heads,\n                 dropout_rate,\n                 layer_norm_epsilon,\n                 has_relative_attention_bias=False):\n        super().__init__()\n        self.SelfAttention = T5Attention(\n            is_decoder,\n            relative_attention_num_buckets,\n            d_model,\n            d_kv,\n            num_heads,\n            dropout_rate,\n            has_relative_attention_bias=has_relative_attention_bias)\n        self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)\n        self.dropout = nn.Dropout(dropout_rate)\n\n    def forward(\n            self,\n            hidden_states,\n            attention_mask=None,\n            position_bias=None,\n            layer_head_mask=None,\n            past_key_value=None,\n            use_cache=False,\n            output_attentions=False, ):\n        normed_hidden_states = self.layer_norm(hidden_states)\n        attention_output = self.SelfAttention(\n            normed_hidden_states,\n            mask=attention_mask,\n            position_bias=position_bias,\n            layer_head_mask=layer_head_mask,\n   
         past_key_value=past_key_value,\n            use_cache=use_cache,\n            output_attentions=output_attentions, )\n        hidden_states = hidden_states + self.dropout(attention_output[0])\n        outputs = (hidden_states,\n                   ) + attention_output[1:]  # add attentions if we output them\n        return outputs\n\n\nclass T5LayerCrossAttention(nn.Layer):\n    def __init__(self, is_decoder, relative_attention_num_buckets, d_model,\n                 d_kv, num_heads, dropout_rate, layer_norm_epsilon):\n        super().__init__()\n        self.EncDecAttention = T5Attention(\n            is_decoder,\n            relative_attention_num_buckets,\n            d_model,\n            d_kv,\n            num_heads,\n            has_relative_attention_bias=False)\n        self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)\n        self.dropout = nn.Dropout(dropout_rate)\n\n    def forward(\n            self,\n            hidden_states,\n            key_value_states,\n            attention_mask=None,\n            position_bias=None,\n            layer_head_mask=None,\n            past_key_value=None,\n            use_cache=False,\n            query_length=None,\n            output_attentions=False, ):\n        normed_hidden_states = self.layer_norm(hidden_states)\n        attention_output = self.EncDecAttention(\n            normed_hidden_states,\n            mask=attention_mask,\n            key_value_states=key_value_states,\n            position_bias=position_bias,\n            layer_head_mask=layer_head_mask,\n            past_key_value=past_key_value,\n            use_cache=use_cache,\n            query_length=query_length,\n            output_attentions=output_attentions, )\n        layer_output = hidden_states + self.dropout(attention_output[0])\n        outputs = (layer_output,\n                   ) + attention_output[1:]  # add attentions if we output them\n        return outputs\n\n\nclass T5Block(nn.Layer):\n    def 
__init__(self,\n                 is_decoder,\n                 relative_attention_num_buckets,\n                 feed_forward_proj,\n                 d_model,\n                 d_kv,\n                 num_heads,\n                 dropout_rate,\n                 layer_norm_epsilon,\n                 d_ff,\n                 has_relative_attention_bias=False):\n        super().__init__()\n        self.is_decoder = is_decoder\n        self.layer = nn.LayerList()\n        self.layer.append(\n            T5LayerSelfAttention(\n                is_decoder,\n                relative_attention_num_buckets,\n                d_model,\n                d_kv,\n                num_heads,\n                dropout_rate,\n                layer_norm_epsilon,\n                has_relative_attention_bias=has_relative_attention_bias))\n        if self.is_decoder:\n            self.layer.append(\n                T5LayerCrossAttention(\n                    is_decoder, relative_attention_num_buckets, d_model, d_kv,\n                    num_heads, dropout_rate, layer_norm_epsilon))\n\n        self.layer.append(\n            T5LayerFF(d_model, d_ff, dropout_rate, layer_norm_epsilon,\n                      feed_forward_proj))\n\n    def forward(\n            self,\n            hidden_states,\n            attention_mask=None,\n            position_bias=None,\n            encoder_hidden_states=None,\n            encoder_attention_mask=None,\n            encoder_decoder_position_bias=None,\n            layer_head_mask=None,\n            cross_attn_layer_head_mask=None,\n            past_key_value=None,\n            use_cache=False,\n            output_attentions=False,\n            return_dict=True, ):\n\n        if past_key_value is not None:\n            if not self.is_decoder:\n                logger.warning(\n                    \"`past_key_values` is passed to the encoder. 
Please make sure this is intended.\"\n                )\n            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4\n\n            if len(past_key_value) != expected_num_past_key_values:\n                raise ValueError(\n                    f\"There should be {expected_num_past_key_values} past states. \"\n                    f\"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}\"\n                    f\"Got {len(past_key_value)} past key / value states\")\n\n            self_attn_past_key_value = past_key_value[:2]\n            cross_attn_past_key_value = past_key_value[2:]\n        else:\n            self_attn_past_key_value, cross_attn_past_key_value = None, None\n\n        self_attention_outputs = self.layer[0](\n            hidden_states,\n            attention_mask=attention_mask,\n            position_bias=position_bias,\n            layer_head_mask=layer_head_mask,\n            past_key_value=self_attn_past_key_value,\n            use_cache=use_cache,\n            output_attentions=output_attentions, )\n        hidden_states, present_key_value_state = self_attention_outputs[:2]\n        attention_outputs = self_attention_outputs[\n            2:]  # Keep self-attention outputs and relative position weights\n\n        # clamp inf values to enable fp16 training\n        if hidden_states.dtype == paddle.float16 and paddle.isinf(\n                hidden_states).any():\n            clamp_value = finfo(hidden_states.dtype).max - 1000\n            hidden_states = paddle.clip(\n                hidden_states, min=-clamp_value, max=clamp_value)\n\n        do_cross_attention = self.is_decoder and encoder_hidden_states is not None\n        if do_cross_attention:\n            # the actual query length is unknown for cross attention\n            # if using past key value states. 
Need to inject it here\n            if present_key_value_state is not None:\n                query_length = present_key_value_state[0].shape[2]\n            else:\n                query_length = None\n\n            cross_attention_outputs = self.layer[1](\n                hidden_states,\n                key_value_states=encoder_hidden_states,\n                attention_mask=encoder_attention_mask,\n                position_bias=encoder_decoder_position_bias,\n                layer_head_mask=cross_attn_layer_head_mask,\n                past_key_value=cross_attn_past_key_value,\n                query_length=query_length,\n                use_cache=use_cache,\n                output_attentions=output_attentions, )\n            hidden_states = cross_attention_outputs[0]\n\n            # clamp inf values to enable fp16 training\n            if hidden_states.dtype == paddle.float16 and paddle.isinf(\n                    hidden_states).any():\n                clamp_value = finfo(hidden_states.dtype).max - 1000\n                hidden_states = paddle.clip(\n                    hidden_states, min=-clamp_value, max=clamp_value)\n\n            # Combine self attn and cross attn key value states\n            if present_key_value_state is not None:\n                present_key_value_state = present_key_value_state + cross_attention_outputs[\n                    1]\n\n            # Keep cross-attention outputs and relative position weights\n            attention_outputs = attention_outputs + cross_attention_outputs[2:]\n\n        # Apply Feed Forward layer\n        hidden_states = self.layer[-1](hidden_states)\n\n        # clamp inf values to enable fp16 training\n        if hidden_states.dtype == paddle.float16 and paddle.isinf(\n                hidden_states).any():\n            clamp_value = finfo(hidden_states.dtype).max - 1000\n            hidden_states = paddle.clip(\n                hidden_states, min=-clamp_value, max=clamp_value)\n\n        outputs = (hidden_states, 
)\n\n        if use_cache:\n            outputs = outputs + (present_key_value_state, ) + attention_outputs\n        else:\n            outputs = outputs + attention_outputs\n\n        return outputs  # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)\n\n\nclass T5Stack(nn.Layer):\n    def __init__(self,\n                 d_model,\n                 num_layers,\n                 layer_norm_epsilon,\n                 dropout_rate,\n                 relative_attention_num_buckets,\n                 feed_forward_proj,\n                 d_kv,\n                 num_heads,\n                 d_ff,\n                 embed_tokens=None,\n                 is_decoder=False):\n        super().__init__()\n        self.embed_tokens = embed_tokens\n        self.is_decoder = is_decoder\n        self.num_layers = num_layers\n\n        self.block = nn.LayerList([\n            T5Block(\n                is_decoder,\n                relative_attention_num_buckets,\n                feed_forward_proj,\n                d_model,\n                d_kv,\n                num_heads,\n                dropout_rate,\n                layer_norm_epsilon,\n                d_ff,\n                has_relative_attention_bias=bool(i == 0))\n            for i in range(num_layers)\n        ])\n        self.final_layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)\n        self.dropout = nn.Dropout(dropout_rate)\n\n    def get_input_embeddings(self):\n        return self.embed_tokens\n\n    def set_input_embeddings(self, new_embeddings):\n        self.embed_tokens = new_embeddings\n\n    def get_extended_attention_mask(self, attention_mask, input_shape):\n        \"\"\"\n        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.\n\n        Arguments:\n            attention_mask (`paddle.Tensor`):\n                Mask with ones indicating tokens to 
attend to, zeros for tokens to ignore.\n            input_shape (`Tuple[int]`):\n                The shape of the input to the model.\n\n        Returns:\n            `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.\n        \"\"\"\n        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]\n        # ourselves in which case we just need to make it broadcastable to all heads.\n        if attention_mask.dim() == 3:\n            extended_attention_mask = attention_mask[:, None, :, :]\n        elif attention_mask.dim() == 2:\n            # Provided a padding mask of dimensions [batch_size, seq_length]\n            # - if the model is a decoder, apply a causal mask in addition to the padding mask\n            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]\n            extended_attention_mask = attention_mask[:, None, None, :]\n        else:\n            raise ValueError(\n                f\"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})\"\n            )\n\n        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for\n        # masked positions, this operation will create a tensor which is 0.0 for\n        # positions we want to attend and -10000.0 for masked positions.\n        # Since we are adding it to the raw scores before the softmax, this is\n        # effectively the same as removing these entirely.\n        #extended_attention_mask = extended_attention_mask.cast(dtype='float16')  # fp16 compatibility\n        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0\n        return extended_attention_mask\n\n    def get_head_mask(self,\n                      head_mask,\n                      num_hidden_layers,\n                      is_attention_chunked=False):\n        \"\"\"\n        Prepare the head mask if needed.\n\n       
 Args:\n            head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):\n                The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).\n            num_hidden_layers (`int`):\n                The number of hidden layers in the model.\n            is_attention_chunked: (`bool`, *optional*, defaults to `False`):\n                Whether or not the attentions scores are computed by chunks or not.\n\n        Returns:\n            `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with\n            `[None]` for each layer.\n        \"\"\"\n        if head_mask is not None:\n            head_mask = self._convert_head_mask_to_5d(head_mask,\n                                                      num_hidden_layers)\n            if is_attention_chunked is True:\n                head_mask = head_mask.unsqueeze(-1)\n        else:\n            head_mask = [None] * num_hidden_layers\n\n        return head_mask\n\n    def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers):\n        \"\"\"-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]\"\"\"\n        if head_mask.dim() == 1:\n            head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(\n                -1).unsqueeze(-1)\n            head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1)\n        elif head_mask.dim() == 2:\n            head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(\n                -1)  # We can specify head_mask for each layer\n        assert head_mask.dim(\n        ) == 5, f\"head_mask.dim != 5, instead {head_mask.dim()}\"\n        #head_mask = head_mask.cast(dtype=self.dtype)  # switch to float if need + fp16 compatibility\n        return head_mask\n\n    def forward(\n            self,\n            input_ids=None,\n            attention_mask=None,\n            encoder_hidden_states=None,\n            
encoder_attention_mask=None,\n            inputs_embeds=None,\n            head_mask=None,\n            cross_attn_head_mask=None,\n            past_key_values=None,\n            use_cache=False,\n            output_attentions=False,\n            output_hidden_states=False,\n            return_dict=True, ):\n        if use_cache is True:\n            assert (\n                self.is_decoder\n            ), f\"`use_cache` can only be set to `True` if {self} is used as a decoder\"\n\n        output_hidden_states = (output_hidden_states\n                                if output_hidden_states is not None else False)\n\n        if input_ids is not None and inputs_embeds is not None:\n            err_msg_prefix = \"decoder_\" if self.is_decoder else \"\"\n            raise ValueError(\n                f\"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time\"\n            )\n        elif input_ids is not None:\n            input_shape = input_ids.shape\n            input_ids = input_ids.reshape([-1, input_shape[-1]])\n        elif inputs_embeds is not None:\n            input_shape = inputs_embeds.shape[:-1]\n        else:\n            err_msg_prefix = \"decoder_\" if self.is_decoder else \"\"\n            raise ValueError(\n                f\"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds\"\n            )\n\n        if inputs_embeds is None:\n            assert self.embed_tokens is not None, \"You have to initialize the model with valid token embeddings\"\n            inputs_embeds = self.embed_tokens(input_ids)\n\n        batch_size, seq_length = input_shape\n\n        # required mask seq length can be calculated via length of past\n        mask_seq_length = past_key_values[0][0].shape[\n            2] + seq_length if past_key_values is not None else seq_length\n\n        if use_cache is True:\n            assert self.is_decoder, f\"`use_cache` can only be set to `True` if {self} is 
used as a decoder\"\n\n        if attention_mask is None:\n            attention_mask = paddle.ones(batch_size, mask_seq_length)\n        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:\n            encoder_seq_length = encoder_hidden_states.shape[1]\n            encoder_attention_mask = paddle.ones(\n                batch_size, encoder_seq_length, dtype=paddle.int64)\n\n        # initialize past_key_values with `None` if past does not exist\n        if past_key_values is None:\n            past_key_values = [None] * len(self.block)\n\n        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]\n        # ourselves in which case we just need to make it broadcastable to all heads.\n        extended_attention_mask = self.get_extended_attention_mask(\n            attention_mask, input_shape)\n\n        # If a 2D or 3D attention mask is provided for the cross-attention\n        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]\n        if self.is_decoder and encoder_hidden_states is not None:\n            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape\n            encoder_hidden_shape = (encoder_batch_size,\n                                    encoder_sequence_length)\n            if encoder_attention_mask is None:\n                encoder_attention_mask = paddle.ones(encoder_hidden_shape)\n            encoder_extended_attention_mask = self.invert_attention_mask(\n                encoder_attention_mask)\n        else:\n            encoder_extended_attention_mask = None\n\n        # Prepare head mask if needed\n        head_mask = self.get_head_mask(head_mask, self.num_layers)\n        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask,\n                                                  self.num_layers)\n        present_key_value_states = () if use_cache else None\n        all_hidden_states = () if 
output_hidden_states else None\n        all_attentions = () if output_attentions else None\n        all_cross_attentions = () if (output_attentions and\n                                      self.is_decoder) else None\n        position_bias = None\n        encoder_decoder_position_bias = None\n\n        hidden_states = self.dropout(inputs_embeds)\n\n        for i, (layer_module, past_key_value\n                ) in enumerate(zip(self.block, past_key_values)):\n            layer_head_mask = head_mask[i]\n            cross_attn_layer_head_mask = cross_attn_head_mask[i]\n\n            if output_hidden_states:\n                all_hidden_states = all_hidden_states + (hidden_states, )\n\n            layer_outputs = layer_module(\n                hidden_states,\n                attention_mask=extended_attention_mask,\n                position_bias=position_bias,\n                encoder_hidden_states=encoder_hidden_states,\n                encoder_attention_mask=encoder_extended_attention_mask,\n                encoder_decoder_position_bias=encoder_decoder_position_bias,\n                layer_head_mask=layer_head_mask,\n                cross_attn_layer_head_mask=cross_attn_layer_head_mask,\n                past_key_value=past_key_value,\n                use_cache=use_cache,\n                output_attentions=output_attentions, )\n\n            # layer_outputs is a tuple with:\n            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)\n            if use_cache is False:\n                layer_outputs = layer_outputs[:1] + (None,\n                                                     ) + layer_outputs[1:]\n\n            hidden_states, present_key_value_state = layer_outputs[:2]\n\n            # We share the position biases between the layers - the first layer store them\n            # layer_outputs = hidden-states, key-value-states (self-attention position bias), 
(self-attention weights),\n            # (cross-attention position bias), (cross-attention weights)\n            position_bias = layer_outputs[2]\n            if self.is_decoder and encoder_hidden_states is not None:\n                encoder_decoder_position_bias = layer_outputs[\n                    4 if output_attentions else 3]\n            # append next layer key value states\n            if use_cache:\n                present_key_value_states = present_key_value_states + (\n                    present_key_value_state, )\n\n            if output_attentions:\n                all_attentions = all_attentions + (layer_outputs[3], )\n                if self.is_decoder:\n                    all_cross_attentions = all_cross_attentions + (\n                        layer_outputs[5], )\n\n        hidden_states = self.final_layer_norm(hidden_states)\n        hidden_states = self.dropout(hidden_states)\n\n        # Add last layer\n        if output_hidden_states:\n            all_hidden_states = all_hidden_states + (hidden_states, )\n\n        if not return_dict:\n            return tuple(v\n                         for v in [\n                             hidden_states,\n                             present_key_value_states,\n                             all_hidden_states,\n                             all_attentions,\n                             all_cross_attentions,\n                         ] if v is not None)\n        return BaseModelOutputWithPastAndCrossAttentions(\n            last_hidden_state=hidden_states,\n            past_key_values=present_key_value_states,\n            hidden_states=all_hidden_states,\n            attentions=all_attentions,\n            cross_attentions=all_cross_attentions, )\n\n\nclass T5EncoderModel(nn.Layer):\n    authorized_missing_keys = [r\"encoder.embed_tokens.weight\", ]\n\n    def __init__(self,\n                 vocab_size=32128,\n                 d_model=768,\n                 d_kv=64,\n                 d_ff=3072,\n              
   num_layers=12,\n                 num_decoder_layers=12,\n                 num_heads=12,\n                 relative_attention_num_buckets=32,\n                 dropout_rate=0.1,\n                 layer_norm_epsilon=1e-06,\n                 feed_forward_proj=\"relu\"):\n        super().__init__()\n        self.shared = nn.Embedding(vocab_size, d_model)\n        # self.extra_parameters = list(self.shared.parameters())\n\n        use_cache = False\n        is_encoder_decoder = False\n        self.encoder = T5Stack(\n            d_model,\n            num_layers,\n            layer_norm_epsilon,\n            dropout_rate,\n            relative_attention_num_buckets,\n            feed_forward_proj,\n            d_kv,\n            num_heads,\n            d_ff,\n            embed_tokens=self.shared,\n            is_decoder=False)\n\n    def get_input_embeddings(self):\n        return self.shared\n\n    def set_input_embeddings(self, new_embeddings):\n        self.shared = new_embeddings\n        self.encoder.set_input_embeddings(new_embeddings)\n\n    def get_encoder(self):\n        return self.encoder\n\n    def _prune_heads(self, heads_to_prune):\n        \"\"\"\n        Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base\n        class PreTrainedModel\n        \"\"\"\n        for layer, heads in heads_to_prune.items():\n            self.encoder.layer[layer].attention.prune_heads(heads)\n\n    def forward(\n            self,\n            input_ids=None,\n            attention_mask=None,\n            head_mask=None,\n            inputs_embeds=None,\n            output_attentions=None,\n            output_hidden_states=None,\n            return_dict=None, ):\n        r\"\"\"\n        Returns:\n\n        Example:\n\n        ```python\n        >>> from transformers import T5Tokenizer, T5EncoderModel\n\n        >>> tokenizer = T5Tokenizer.from_pretrained(\"t5-small\")\n        >>> model = T5EncoderModel.from_pretrained(\"t5-small\")\n        >>> input_ids = tokenizer(\n        ...     \"Studies have been shown that owning a dog is good for you\", return_tensors=\"pt\"\n        ... ).input_ids  # Batch size 1\n        >>> outputs = model(input_ids=input_ids)\n        >>> last_hidden_states = outputs.last_hidden_state\n        ```\"\"\"\n        return_dict = return_dict if return_dict is not None else True\n        #import numpy as np\n        #attention_mask = paddle.to_tensor(np.load('attn_mask.npy'))\n        #input_ids = paddle.to_tensor(np.load('input_ids.npy'))\n        encoder_outputs = self.encoder(\n            input_ids=input_ids,\n            attention_mask=attention_mask,\n            inputs_embeds=inputs_embeds,\n            head_mask=head_mask,\n            output_attentions=output_attentions,\n            output_hidden_states=output_hidden_states,\n            return_dict=return_dict, )\n\n        return encoder_outputs\n\n\ndef T5Model(config):\n    config = T5Config(**config)\n    model = T5EncoderModel(config)\n    return model\n\n\ndef get_t5_model(name, pretrained=True):\n    #t5_config = dict_from_json_file(name)\n    #model = T5Model(t5_config)\n    model = T5EncoderModel(\n        
vocab_size=32128,\n        d_model=1024,\n        d_kv=128,\n        d_ff=65536,\n        num_layers=2,\n        num_decoder_layers=None,\n        num_heads=128,\n        relative_attention_num_buckets=32,\n        dropout_rate=0.,\n        layer_norm_epsilon=1e-06,\n        feed_forward_proj=\"relu\")\n    if pretrained:\n        checkpoint = paddle.load(name + '/t5.pd', return_numpy=True)\n        model.set_state_dict(checkpoint['model'])\n    model.eval()\n    for p in model.parameters():\n        p.stop_gradient = True\n\n    return model\n\n\ndef t5_11b():\n    return T5EncoderModel(\n        vocab_size=32128,\n        d_model=1024,\n        d_kv=128,\n        d_ff=65536,\n        num_layers=24,\n        num_decoder_layers=None,\n        num_heads=128,\n        relative_attention_num_buckets=32,\n        dropout_rate=0.,\n        layer_norm_epsilon=1e-06,\n        feed_forward_proj=\"relu\")\n\n\ndef dict_from_json_file(name):\n    with open(name + '/config.json', \"r\", encoding=\"utf-8\") as reader:\n        text = reader.read()\n        config_dict = json.loads(text)\n        return config_dict\n\n\ndef t5_encode_text(t5, texts, tokenizer, return_attn_mask=False):\n    token_ids, attn_mask = t5_tokenize(texts, tokenizer)\n    t5.eval()\n    with paddle.no_grad():\n        encoded_text = t5(input_ids=token_ids, attention_mask=attn_mask)\n        text_features = encoded_text.last_hidden_state.detach()\n\n    if return_attn_mask:\n        #attn_mask = attn_mask.cast('bool')\n        return text_features, attn_mask\n\n    return text_features\n\n\ndef get_encoded_dim(name):\n    return dict_from_json_file(name)['d_model']\n"
  },
  {
    "path": "ppfleetx/models/language_model/t5/utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nfrom paddle.nn.initializer import TruncatedNormal, Constant, Normal\n\ntrunc_normal_ = TruncatedNormal(std=0.02)\nzeros_ = Constant(value=0.0)\nones_ = Constant(value=1.0)\n\n\n@paddle.no_grad()\ndef constant_(x, value):\n    temp_value = paddle.full(x.shape, value, x.dtype)\n    x.set_value(temp_value)\n    return x\n\n\n@paddle.no_grad()\ndef normal_(x, mean=0., std=1.):\n    temp_value = paddle.normal(mean, std, shape=x.shape)\n    x.set_value(temp_value)\n    return\n\n\ndef normal_init(layer, mean=0, std=1, bias=0):\n    if hasattr(layer, 'weight') and layer.weight is not None:\n        normal_(layer.weight, mean, std)\n    else:\n        normal_(layer, mean, std)\n    if hasattr(layer, 'bias') and layer.bias is not None:\n        constant_(layer.bias, bias)\n\n\ndef constant_init(layer, val, bias=0):\n    if hasattr(layer, 'weight') and layer.weight is not None:\n        constant_(layer.weight, val)\n    if hasattr(layer, 'bias') and layer.bias is not None:\n        constant_(layer.bias, bias)\n"
  },
  {
    "path": "ppfleetx/models/language_model/utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport logging\nimport os\nimport sys\nimport copy\n\nimport yaml\nimport numpy as np\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.fluid import core\nimport argparse\nfrom functools import reduce\n\nfrom ppfleetx.distributed.apis import env\nfrom ppfleetx.utils.log import logger\n\n\ndef is_fused_matmul_bias_supported():\n    if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():\n        return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue')\n    else:\n        return False\n\n\ndef process_inference_configs(config):\n    \"\"\"\n    process inference configs for hybrid parallel\n    \"\"\"\n    if 'Inference' not in config.keys():\n        return\n\n    configs = config['Inference']\n\n    if configs['model_dir'] is None:\n        configs['model_dir'] = config['Engine']['save_load']['output_dir']\n\n    if configs['mp_degree'] is None:\n        configs['mp_degree'] = config['Distributed']['mp_degree']\n\n\ndef process_model_configs(config):\n    \"\"\"\n    process model configs for hybrid parallel\n    \"\"\"\n    configs = config['Model']\n    if configs['ffn_hidden_size'] is None:\n        configs['ffn_hidden_size'] = 4 * configs['hidden_size']\n\n    if configs['use_recompute']:\n        if not configs['recompute_granularity']:\n            configs['recompute_granularity'] = 
'full'\n        if not configs['no_recompute_layers']:\n            configs['no_recompute_layers'] = []\n        else:\n            assert isinstance(configs['no_recompute_layers'],\n                              list), \"no_recompute_layers should be a list\"\n            for i in configs['no_recompute_layers']:\n                assert isinstance(\n                    i, int\n                ), \"all values in no_recompute_layers should be an integer\"\n            assert min(configs['no_recompute_layers']) >= 0, \\\n                \"the min value in no_recompute_layers should >= 0\"\n            assert max(configs['no_recompute_layers']) < configs['num_layers'], \\\n                \"the max value in no_recompute_layers should < num_layers\"\n            configs['no_recompute_layers'] = sorted(\n                list(set(configs['no_recompute_layers'])))\n\n    if configs['fused_linear'] and not is_fused_matmul_bias_supported():\n        configs['fused_linear'] = False\n        logging.warning(\n            \"The flag fused_linear only valid for cuda version higher than 11.6, \"\n            \"but the paddle is compiled with cuda \" + paddle.version.cuda())\n\n    pp_degree = config.Distributed.pp_degree\n\n    if pp_degree > 1:\n        configs['virtual_pp_degree'] = 1 \\\n            if configs.get('virtual_pp_degree', None) is None \\\n            else configs['virtual_pp_degree']\n        virtual_pp_degree = configs['virtual_pp_degree']\n        num_layers = configs.num_layers\n\n        if not (num_layers % (virtual_pp_degree * pp_degree)) == 0:\n            assert virtual_pp_degree == 1, \"virtual pp doesn't support uneven layer split.\"\n            logger.warning(\n                \"The num_layers of the model is not divisible by pp_degree.\" \\\n                \"Receive num_layers: {}, pp_degree: {}.\".format(num_layers, pp_degree))\n        else:\n            assert (num_layers %\n                (virtual_pp_degree * pp_degree)) == 0, \\\n              
  \"The num_layers of the model should be divisible of pp_degree * virtual_pp_degree.\" \\\n                \"Receive num_layers: {}, pp_degree: {}, virtual_pp_degree: {}.\".format(\n                num_layers, pp_degree, virtual_pp_degree)\n\n        if virtual_pp_degree > 1:\n            local_batch_size = config.Global.local_batch_size\n            micro_batch_size = config.Global.micro_batch_size\n            acc_steps = local_batch_size // micro_batch_size\n            assert acc_steps % pp_degree == 0, \"num of microbatches {} should be divisible of pp_degree {} when \" \\\n                                               \"using interleave pipeline\".format(acc_steps, pp_degree)\n\n        if virtual_pp_degree > 2:\n            logger.warning(\n                \"Setting virtual_pp_degree > 2 may harm the throughput of the pipeline parallel.\"\n            )\n    else:\n        if configs.get('virtual_pp_degree', None):\n            logger.warning(\"virtual_pp_degree is unuseful.\")\n\n\ndef process_optim_configs(config):\n    \"\"\"\n    process optim configs for hybrid parallel\n    \"\"\"\n    config['Optimizer']['multi_precision'] = config['Engine']['mix_precision'][\n        'enable']\n\n    nranks = dist.get_world_size()\n    dp_degree = config['Distributed']['dp_degree']\n    sharding_degree = config['Distributed']['sharding']['sharding_degree']\n    if config['Optimizer']['tensor_fusion']:\n        assert nranks == dp_degree * sharding_degree, \\\n            \"tensor_fusion only support single card train or data/sharding parallel train\"\n\n    if config['Optimizer']['lr']['decay_steps'] is None:\n        config['Optimizer']['lr']['decay_steps'] = config['Engine'][\n            'max_steps']\n    config['Optimizer']['lr']['decay_steps'] *= config['Global'][\n        'global_batch_size']\n\n\ndef process_data_configs(config):\n    \"\"\"\n    process data configs for hybrid parallel\n    \"\"\"\n    cfg_global = config['Global']\n    cfg_data = 
config['Data']\n\n    mode_to_num_samples = {\n        \"Train\":\n        cfg_global['global_batch_size'] * config['Engine']['max_steps'],\n        \"Eval\": cfg_global['global_batch_size'] *\n        (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) *\n        config['Engine']['eval_iters'],\n        \"Test\":\n        cfg_global['global_batch_size'] * config['Engine']['test_iters'],\n    }\n\n    for mode in (\"Train\", \"Eval\", \"Test\"):\n        if mode in cfg_data.keys():\n            cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[\n                mode]\n            cfg_data[mode]['dataset']['mode'] = mode\n            cfg_data[mode]['dataset']['seed'] = cfg_global['seed']\n            cfg_data[mode]['dataset']['model_type'] = config['Model']['name']\n            cfg_data[mode]['sampler']['batch_size'] = cfg_global[\n                'local_batch_size']\n\n\ndef process_configs(config):\n    process_data_configs(config)\n    process_model_configs(config)\n    process_optim_configs(config)\n    process_inference_configs(config)\n\n    return config\n"
  },
  {
    "path": "ppfleetx/models/multimodal_model/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/multimodal_model/clip/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/multimodal_model/imagen/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .modeling import (ImagenModel, imagen_397M_text2im_64, imagen_text2im_64,\n                       imagen_text2im_64_debertav2, imagen_SR256,\n                       imagen_SR1024, ImagenCriterion)\n"
  },
  {
    "path": "ppfleetx/models/multimodal_model/imagen/modeling.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom tqdm import tqdm\nfrom functools import partial\nfrom contextlib import contextmanager, nullcontext\n\nimport paddle\nimport paddle.nn.functional as F\nfrom paddle import nn\nimport paddle.vision.transforms as T\n\nfrom .unet import Unet\nfrom ppfleetx.models.language_model.debertav2 import *\nfrom ppfleetx.models.language_model.t5 import *\nfrom ppfleetx.data.tokenizers import get_t5_tokenizer, get_debertav2_tokenizer\nfrom .utils import (\n    GaussianDiffusionContinuousTimes, default, exists, cast_tuple, first,\n    maybe, eval_decorator, identity, pad_tuple_to_length, right_pad_dims_to,\n    resize_image_to, normalize_neg_one_to_one, rearrange, repeat, reduce,\n    unnormalize_zero_to_one, cast_uint8_images_to_float, is_float_dtype)\n\n\n# predefined unets, with configs lining up with hyperparameters in appendix of paper\nclass Unet64_397M(Unet):\n    def __init__(self, *args, **kwargs):\n        default_kwargs = dict(\n            dim=256,\n            dim_mults=(1, 2, 3, 4),\n            num_resnet_blocks=3,\n            layer_attns=(False, True, True, True),\n            layer_cross_attns=(False, True, True, True),\n            attn_heads=8,\n            ff_mult=2.,\n            memory_efficient=False)\n        super().__init__(*args, **{ ** default_kwargs, ** kwargs})\n\n\nclass BaseUnet64(Unet):\n    def 
__init__(self, *args, **kwargs):\n        default_kwargs = dict(\n            dim=512,\n            cond_dim=512,\n            dim_mults=(1, 2, 3, 4),\n            num_resnet_blocks=3,\n            layer_attns=(False, True, True, True),\n            layer_cross_attns=(False, True, True, True),\n            attn_heads=8,\n            ff_mult=2.,\n            memory_efficient=False)\n        super().__init__(*args, **{ ** default_kwargs, ** kwargs})\n\n\nclass SRUnet256(Unet):\n    def __init__(self, *args, **kwargs):\n        default_kwargs = dict(\n            dim=128,\n            dim_mults=(1, 2, 4, 8),\n            num_resnet_blocks=(2, 4, 8, 8),\n            layer_attns=(False, False, False, True),\n            layer_cross_attns=(False, False, False, True),\n            attn_heads=8,\n            ff_mult=2.,\n            memory_efficient=True)\n        super().__init__(*args, **{ ** default_kwargs, ** kwargs})\n\n\nclass SRUnet1024(Unet):\n    def __init__(self, *args, **kwargs):\n        default_kwargs = dict(\n            dim=128,\n            dim_mults=(1, 2, 4, 8),\n            num_resnet_blocks=(2, 4, 8, 8),\n            layer_attns=False,\n            layer_cross_attns=(False, False, False, True),\n            attn_heads=8,\n            ff_mult=2.,\n            memory_efficient=True)\n        super().__init__(*args, **{ ** default_kwargs, ** kwargs})\n\n\n# main imagen ddpm class, which is a cascading DDPM from Ho et al.\nclass ImagenCriterion(nn.Layer):\n    \"\"\"\n    Criterion for Imagen. 
It calculates the final loss.\n    \"\"\"\n\n    def __init__(self, name='mse_loss', p2_loss_weight_k=1):\n        super(ImagenCriterion, self).__init__()\n        self.p2_loss_weight_k = p2_loss_weight_k\n\n        if name == 'l1_loss':\n            self.loss_func = F.l1_loss\n        elif name == 'mse_loss':\n            self.loss_func = F.mse_loss\n        elif name == 'smooth_l1_loss':\n            self.loss_func = F.smooth_l1_loss\n        else:\n            raise NotImplementedError()\n\n    def forward(self, pred, target, log_snr, p2_loss_weight_gamma):\n        \"\"\"\n        Args:\n            pred(Tensor):\n                The logits of prediction. Its data type should be float32 and\n                its shape is [batch_size, sequence_length, vocab_size].\n            target(Tensor):\n                The labels of the prediction, default is noise.\n\n        Returns:\n            Tensor: The pretraining loss. Its data type should be float32 and its shape is [1].\n\n        \"\"\"\n        losses = self.loss_func(pred, target, reduction=\"none\")\n        losses = reduce(losses, 'b ... 
-> b', 'mean')\n\n        # p2 loss reweighting\n\n        if p2_loss_weight_gamma > 0:\n            loss_weight = (\n                self.p2_loss_weight_k + log_snr.exp())**-p2_loss_weight_gamma\n            losses = losses * loss_weight\n\n        return losses.mean()\n\n\nclass ImagenModel(nn.Layer):\n    def __init__(\n            self,\n            unets,\n            image_sizes,\n            text_encoder_name=None,\n            text_embed_dim=1024,\n            channels=3,\n            timesteps=1000,\n            cond_drop_prob=0.1,\n            noise_schedules='cosine',\n            pred_objectives='noise',\n            random_crop_sizes=None,\n            lowres_noise_schedule='linear',\n            lowres_sample_noise_level=0.2,\n            per_sample_random_aug_noise_level=False,\n            condition_on_text=True,\n            auto_normalize_img=True,\n            p2_loss_weight_gamma=0.5,\n            dynamic_thresholding=True,\n            dynamic_thresholding_percentile=0.95,\n            only_train_unet_number=None,\n            is_sr=False,\n            is_video=False,\n            fused_linear=False, ):\n        super().__init__()\n\n        # conditioning hparams\n\n        self.condition_on_text = condition_on_text\n        self.unconditional = not condition_on_text\n        self.is_sr = is_sr\n        self.is_video = is_video\n\n        # channels\n\n        self.channels = channels\n\n        # automatically take care of ensuring that first unet is unconditional\n        # while the rest of the unets are conditioned on the low resolution image produced by previous unet\n\n        unets = cast_tuple(unets)\n        num_unets = len(unets)\n\n        # determine noise schedules per unet\n\n        timesteps = cast_tuple(timesteps, num_unets)\n\n        # make sure noise schedule defaults to 'cosine', 'cosine', and then 'linear' for rest of super-resoluting unets\n\n        noise_schedules = cast_tuple(noise_schedules)\n        noise_schedules 
= pad_tuple_to_length(noise_schedules, 2, 'cosine')\n        noise_schedules = pad_tuple_to_length(noise_schedules, num_unets,\n                                              'linear')\n\n        # construct noise schedulers\n\n        noise_scheduler_klass = GaussianDiffusionContinuousTimes\n        self.noise_schedulers = nn.LayerList([])\n\n        for timestep, noise_schedule in zip(timesteps, noise_schedules):\n            noise_scheduler = noise_scheduler_klass(\n                noise_schedule=noise_schedule, timesteps=timestep)\n            self.noise_schedulers.append(noise_scheduler)\n\n        # randomly cropping for upsampler training\n\n        self.random_crop_sizes = cast_tuple(random_crop_sizes, num_unets)\n        assert not exists(\n            first(self.random_crop_sizes)\n        ), 'you should not need to randomly crop image during training for base unet, only for upsamplers - so pass in `random_crop_sizes = (None, 128, 256)` as example'\n\n        # lowres augmentation noise schedule\n\n        self.lowres_noise_schedule = GaussianDiffusionContinuousTimes(\n            noise_schedule=lowres_noise_schedule)\n\n        # ddpm objectives - predicting noise by default\n\n        self.pred_objectives = cast_tuple(pred_objectives, num_unets)\n\n        # get text encoder\n\n        self.text_encoder_name = text_encoder_name\n\n        if text_encoder_name is None:\n            pass\n        elif 't5' in text_encoder_name:\n            self.text_embed_dim = default(\n                text_embed_dim, lambda: get_encoded_dim(text_encoder_name))\n            self.t5_encoder = get_t5_model(\n                name=text_encoder_name, pretrained=True)\n            self.tokenizer = get_t5_tokenizer(name=text_encoder_name)\n            self.t5_encode_text = t5_encode_text\n        elif 'deberta' in text_encoder_name:\n            self.text_embed_dim = default(\n                text_embed_dim,\n                lambda: 
get_debertav2_encoded_dim(text_encoder_name))\n            self.debertav2_encoder = get_debertav2_model(\n                name=text_encoder_name, pretrained=True)\n            self.tokenizer = get_debertav2_tokenizer(name=text_encoder_name)\n            self.debertav2_encode_text = debertav2_encode_text\n        else:\n            raise NotImplementedError(\"Please implement the text encoder.\")\n\n        # construct unets\n\n        self.unets = nn.LayerList([])\n\n        self.unet_being_trained_index = -1  # keeps track of which unet is being trained at the moment\n        self.only_train_unet_number = only_train_unet_number\n\n        for ind, one_unet in enumerate(unets):\n            assert isinstance(one_unet, Unet)\n            is_first = ind == 0\n\n            one_unet = one_unet.cast_model_parameters(\n                cond_on_text=self.condition_on_text,\n                text_embed_dim=self.text_embed_dim\n                if self.condition_on_text else None,\n                channels=self.channels,\n                channels_out=self.channels)\n\n            self.unets.append(one_unet)\n\n        # unet image sizes\n\n        image_sizes = cast_tuple(image_sizes)\n        self.image_sizes = image_sizes\n\n        self.sample_channels = cast_tuple(self.channels, num_unets)\n\n        self.right_pad_dims_to_datatype = partial(\n            rearrange, pattern=('b -> b 1 1 1'))\n\n        # cascading ddpm related stuff\n\n        self.lowres_sample_noise_level = lowres_sample_noise_level\n        self.per_sample_random_aug_noise_level = per_sample_random_aug_noise_level\n\n        # classifier free guidance\n\n        self.cond_drop_prob = cond_drop_prob\n        self.can_classifier_guidance = cond_drop_prob > 0.\n\n        # normalize and unnormalize image functions\n\n        self.normalize_img = normalize_neg_one_to_one if auto_normalize_img else identity\n        self.unnormalize_img = unnormalize_zero_to_one if auto_normalize_img else identity\n        
self.input_image_range = (0. if auto_normalize_img else -1., 1.)\n\n        # dynamic thresholding\n\n        self.dynamic_thresholding = cast_tuple(dynamic_thresholding, num_unets)\n        self.dynamic_thresholding_percentile = dynamic_thresholding_percentile\n\n        # p2 loss weight\n\n        self.p2_loss_weight_gamma = cast_tuple(p2_loss_weight_gamma, num_unets)\n\n        assert all([\n            (gamma_value <= 2) for gamma_value in self.p2_loss_weight_gamma\n        ]), 'in paper, they noticed any gamma greater than 2 is harmful'\n\n        # one temp parameter for keeping track of device\n\n    def get_unet(self, unet_number):\n        assert 0 < unet_number <= len(self.unets)\n        index = unet_number - 1\n\n        if isinstance(self.unets, nn.LayerList):\n            unets_list = [unet for unet in self.unets]\n            delattr(self, 'unets')\n            self.unets = unets_list\n        self.unet_being_trained_index = index\n        return self.unets[index]\n\n    def reset_unets(self, ):\n        self.unets = nn.LayerList([*self.unets])\n        self.unet_being_trained_index = -1\n\n    @contextmanager\n    def one_unet_in_gpu(self, unet_number=None, unet=None):\n        assert exists(unet_number) ^ exists(unet)\n\n        if exists(unet_number):\n            unet = self.unets[unet_number - 1]\n\n        yield\n\n    def reset_unets_all(self, ):\n        self.unets = nn.LayerList([*self.unets])\n        self.unet_being_trained_index = -1\n\n    # overriding state dict functions\n\n    def state_dict(self, *args, **kwargs):\n        self.reset_unets()\n        return super().state_dict(*args, **kwargs)\n\n    def load_state_dict(self, *args, **kwargs):\n        self.reset_unets_all()\n        return self.unets[self.unet_being_trained_index].set_state_dict(\n            *args, **kwargs)\n\n    # gaussian diffusion methods\n\n    def p_mean_variance(self,\n                        unet,\n                        x,\n                        t,\n    
                    *,\n                        noise_scheduler,\n                        text_embeds=None,\n                        text_mask=None,\n                        cond_images=None,\n                        lowres_cond_img=None,\n                        self_cond=None,\n                        lowres_noise_times=None,\n                        cond_scale=1.,\n                        model_output=None,\n                        t_next=None,\n                        pred_objective='noise',\n                        dynamic_threshold=True):\n        assert not (\n            cond_scale != 1. and not self.can_classifier_guidance\n        ), 'imagen was not trained with conditional dropout, and thus one cannot use classifier free guidance (cond_scale anything other than 1)'\n        time_var = noise_scheduler.get_condition(t)\n        pred = default(model_output, lambda: unet.forward_with_cond_scale(x, time_var, text_embeds = text_embeds, text_mask = text_mask, cond_images = cond_images, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img, lowres_noise_times = self.lowres_noise_schedule.get_condition(lowres_noise_times)))\n\n        if pred_objective == 'noise':\n            x_start = noise_scheduler.predict_start_from_noise(\n                x, t=t, noise=pred)\n        elif pred_objective == 'x_start':\n            x_start = pred\n        elif pred_objective == 'v':\n            x_start = noise_scheduler.predict_start_from_v(x, t=t, v=pred)\n        else:\n            raise ValueError(f'unknown objective {pred_objective}')\n\n        if dynamic_threshold:\n            # following pseudocode in appendix\n            # s is the dynamic threshold, determined by percentile of absolute values of reconstructed sample per batch element\n            s = paddle.quantile(\n                rearrange(x_start, 'b ... 
-> b (...)').abs(),\n                self.dynamic_thresholding_percentile,\n                axis=-1)\n\n            s.clip_(min=1.)\n            s = right_pad_dims_to(x_start, s)\n            x_start = x_start.clip(-s, s) / s\n        else:\n            x_start.clip_(-1., 1.)\n\n        mean_and_variance = noise_scheduler.q_posterior(\n            x_start=x_start, x_t=x, t=t, t_next=t_next)\n        return mean_and_variance, x_start\n\n    @paddle.no_grad()\n    def p_sample(self,\n                 unet,\n                 x,\n                 t,\n                 *,\n                 noise_scheduler,\n                 t_next=None,\n                 text_embeds=None,\n                 text_mask=None,\n                 cond_images=None,\n                 cond_scale=1.,\n                 self_cond=None,\n                 lowres_cond_img=None,\n                 lowres_noise_times=None,\n                 pred_objective='noise',\n                 dynamic_threshold=True):\n        b = x.shape[0]\n        (model_mean, _, model_log_variance), x_start = self.p_mean_variance(\n            unet,\n            x=x,\n            t=t,\n            t_next=t_next,\n            noise_scheduler=noise_scheduler,\n            text_embeds=text_embeds,\n            text_mask=text_mask,\n            cond_images=cond_images,\n            cond_scale=cond_scale,\n            lowres_cond_img=lowres_cond_img,\n            self_cond=self_cond,\n            lowres_noise_times=lowres_noise_times,\n            pred_objective=pred_objective,\n            dynamic_threshold=dynamic_threshold)\n        noise = paddle.randn(shape=x.shape, dtype=x.dtype)\n        # no noise when t == 0\n        is_last_sampling_timestep = (t_next == 0) if isinstance(\n            noise_scheduler, GaussianDiffusionContinuousTimes) else (t == 0)\n        nonzero_mask = (1 - is_last_sampling_timestep.cast('float32')).reshape(\n            [b, *((1, ) * (len(x.shape) - 1))])\n        pred = model_mean + nonzero_mask * (0.5 * 
model_log_variance\n                                            ).exp() * noise\n        return pred, x_start\n\n    @paddle.no_grad()\n    def p_sample_loop(self,\n                      unet,\n                      shape,\n                      *,\n                      noise_scheduler,\n                      lowres_cond_img=None,\n                      lowres_noise_times=None,\n                      text_embeds=None,\n                      text_mask=None,\n                      cond_images=None,\n                      inpaint_images=None,\n                      inpaint_masks=None,\n                      inpaint_resample_times=5,\n                      init_images=None,\n                      skip_steps=None,\n                      cond_scale=1,\n                      pred_objective='noise',\n                      dynamic_threshold=True):\n\n        batch = shape[0]\n        img = paddle.randn(shape)\n\n        # for initialization with an image or video\n\n        if exists(init_images):\n            img += init_images\n\n        # keep track of x0, for self conditioning\n\n        x_start = None\n\n        # prepare inpainting\n\n        has_inpainting = exists(inpaint_images) and exists(inpaint_masks)\n        resample_times = inpaint_resample_times if has_inpainting else 1\n\n        if has_inpainting:\n            inpaint_images = self.normalize_img(inpaint_images)\n            inpaint_images = resize_image_to(inpaint_images, shape[-1])\n            inpaint_masks = resize_image_to(\n                rearrange(inpaint_masks, 'b ... 
-> b 1 ...').cast('float32'),\n                shape[-1]).cast('bool')\n\n        # time\n\n        timesteps = noise_scheduler.get_sampling_timesteps(batch)\n\n        # whether to skip any steps\n\n        skip_steps = default(skip_steps, 0)\n        timesteps = timesteps[skip_steps:]\n\n        for times, times_next in tqdm(\n                timesteps, desc='sampling loop time step',\n                total=len(timesteps)):\n            is_last_timestep = times_next == 0\n\n            for r in reversed(range(resample_times)):\n                is_last_resample_step = r == 0\n\n                if has_inpainting:\n                    noised_inpaint_images, *_ = noise_scheduler.q_sample(\n                        inpaint_images, t=times)\n                    img = img * ~inpaint_masks + noised_inpaint_images * inpaint_masks\n\n                self_cond = x_start if unet.self_cond else None\n\n                img, x_start = self.p_sample(\n                    unet,\n                    img,\n                    times,\n                    t_next=times_next,\n                    text_embeds=text_embeds,\n                    text_mask=text_mask,\n                    cond_images=cond_images,\n                    cond_scale=cond_scale,\n                    self_cond=self_cond,\n                    lowres_cond_img=lowres_cond_img,\n                    lowres_noise_times=lowres_noise_times,\n                    noise_scheduler=noise_scheduler,\n                    pred_objective=pred_objective,\n                    dynamic_threshold=dynamic_threshold)\n\n                if has_inpainting and not (is_last_resample_step or\n                                           paddle.all(is_last_timestep)):\n                    renoised_img = noise_scheduler.q_sample_from_to(\n                        img, times_next, times)\n\n                    img = paddle.where(\n                        self.right_pad_dims_to_datatype(is_last_timestep), img,\n                        
renoised_img)\n\n        img.clip_(-1., 1.)\n\n        # final inpainting\n\n        if has_inpainting:\n            img = img * ~inpaint_masks + inpaint_images * inpaint_masks\n\n        unnormalize_img = self.unnormalize_img(img)\n        return unnormalize_img\n\n    @paddle.no_grad()\n    @eval_decorator\n    def sample(\n            self,\n            texts=None,\n            text_masks=None,\n            text_embeds=None,\n            cond_images=None,\n            inpaint_images=None,\n            inpaint_masks=None,\n            inpaint_resample_times=5,\n            init_images=None,\n            skip_steps=None,\n            batch_size=1,\n            cond_scale=1.,\n            lowres_sample_noise_level=None,\n            start_at_unet_number=1,\n            start_image_or_video=None,\n            stop_at_unet_number=None,\n            return_all_unet_outputs=True,\n            return_pil_images=False, ):\n        self.reset_unets()\n\n        cond_images = maybe(cast_uint8_images_to_float)(cond_images)\n\n        if exists(texts) and not exists(\n                text_embeds) and not self.unconditional:\n            with paddle.amp.auto_cast(enable=False):\n                if 't5' in self.text_encoder_name:\n                    text_embeds, text_masks = self.t5_encode_text(\n                        t5=self.t5_encoder, texts=texts, return_attn_mask=True)\n                elif 'debert' in self.text_encoder_name:\n                    text_embeds, text_masks = self.debertav2_encode_text(\n                        debertav2=self.debertav2_encoder,\n                        texts=texts,\n                        return_attn_mask=True)\n\n        if not self.unconditional:\n            text_masks = default(\n                text_masks, lambda: paddle.any(text_embeds != 0., axis=-1))\n            batch_size = text_embeds.shape[0]\n\n        if exists(inpaint_images):\n            if self.unconditional:\n                if batch_size == 1:  # assume researcher wants 
to broadcast along inpainted images\n                    batch_size = inpaint_images.shape[0]\n\n            assert inpaint_images.shape[\n                0] == batch_size, 'number of inpainting images must be equal to the specified batch size on sample `sample(batch_size=<int>)``'\n            assert not (\n                self.condition_on_text and\n                inpaint_images.shape[0] != text_embeds.shape[0]\n            ), 'number of inpainting images must be equal to the number of text to be conditioned on'\n\n        assert not (\n            self.condition_on_text and not exists(text_embeds)\n        ), 'text or text encodings must be passed into imagen if specified'\n        assert not (\n            not self.condition_on_text and exists(text_embeds)\n        ), 'imagen specified not to be conditioned on text, yet it is presented'\n        assert not (\n            exists(text_embeds) and\n            text_embeds.shape[-1] != self.text_embed_dim\n        ), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'\n\n        assert not (\n            exists(inpaint_images) ^ exists(inpaint_masks)\n        ), 'inpaint images and masks must be both passed in to do inpainting'\n\n        outputs = []\n\n        lowres_sample_noise_level = default(lowres_sample_noise_level,\n                                            self.lowres_sample_noise_level)\n\n        num_unets = len(self.unets)\n\n        # condition scaling\n\n        cond_scale = cast_tuple(cond_scale, num_unets)\n\n        # for initial image and skipping steps\n\n        init_images = cast_tuple(init_images, num_unets)\n        init_images = [\n            maybe(self.normalize_img)(init_image) for init_image in init_images\n        ]\n\n        skip_steps = cast_tuple(skip_steps, num_unets)\n\n        # handle starting at a unet greater than 1, for training only-upscaler training\n\n        if start_at_unet_number > 1:\n\n            assert not 
exists(stop_at_unet_number\n                              ) or start_at_unet_number <= stop_at_unet_number\n            assert exists(\n                start_image_or_video\n            ), 'starting image or video must be supplied if only doing upscaling'\n\n            prev_image_size = self.image_sizes[start_at_unet_number - 1]\n            img = resize_image_to(start_image_or_video, prev_image_size)\n\n        # go through each unet in cascade\n\n        for unet_number, unet, channel, image_size, noise_scheduler, pred_objective, dynamic_threshold, unet_cond_scale, unet_init_images, unet_skip_steps in tqdm(\n                zip(\n                    range(1, num_unets + 1), self.unets, self.sample_channels,\n                    self.image_sizes, self.noise_schedulers,\n                    self.pred_objectives, self.dynamic_thresholding,\n                    cond_scale, init_images, skip_steps)):\n\n            lowres_cond_img = lowres_noise_times = None\n            shape = (batch_size, channel, image_size, image_size)\n\n            if unet.lowres_cond:\n                lowres_noise_times = self.lowres_noise_schedule.get_times(\n                    batch_size, lowres_sample_noise_level)\n\n                lowres_cond_img = resize_image_to(img, image_size)\n                lowres_cond_img = self.normalize_img(lowres_cond_img)\n                lowres_cond_img, *_ = self.lowres_noise_schedule.q_sample(\n                    x_start=lowres_cond_img,\n                    t=lowres_noise_times,\n                    noise=paddle.randn(\n                        shape=lowres_cond_img.shape,\n                        dtype=lowres_cond_img.dtype))\n\n            if exists(unet_init_images):\n                unet_init_images = resize_image_to(unet_init_images,\n                                                   image_size)\n\n            shape = (batch_size, self.channels, image_size, image_size)\n\n            img = self.p_sample_loop(\n                unet,\n                
shape,\n                text_embeds=text_embeds,\n                text_mask=text_masks,\n                cond_images=cond_images,\n                inpaint_images=inpaint_images,\n                inpaint_masks=inpaint_masks,\n                inpaint_resample_times=inpaint_resample_times,\n                init_images=unet_init_images,\n                skip_steps=unet_skip_steps,\n                cond_scale=unet_cond_scale,\n                lowres_cond_img=lowres_cond_img,\n                lowres_noise_times=lowres_noise_times,\n                noise_scheduler=noise_scheduler,\n                pred_objective=pred_objective,\n                dynamic_threshold=dynamic_threshold)\n\n            outputs.append(img)\n\n            if exists(stop_at_unet_number\n                      ) and stop_at_unet_number == unet_number:\n                break\n\n        output_index = -1 if not return_all_unet_outputs else slice(\n            None)  # either return last unet output or all unet outputs\n\n        if not return_pil_images:\n            return outputs[output_index]\n\n        if not return_all_unet_outputs:\n            outputs = outputs[-1:]\n\n        pil_images = list(\n            map(lambda img: list(map(T.ToPILImage(), img.unbind(dim=0))),\n                outputs))\n\n        return pil_images[\n            output_index]  # now you have a bunch of pillow images you can just .save(/where/ever/you/want.png)\n\n    def p_losses(self,\n                 unet,\n                 x_start,\n                 times,\n                 *,\n                 noise_scheduler,\n                 lowres_cond_img=None,\n                 lowres_aug_times=None,\n                 text_embeds=None,\n                 text_mask=None,\n                 cond_images=None,\n                 noise=None,\n                 times_next=None,\n                 pred_objective='noise',\n                 p2_loss_weight_gamma=0.,\n                 random_crop_size=None):\n        is_video = x_start.ndim 
== 5\n\n        noise = default(noise, lambda: paddle.randn(shape=x_start.shape, dtype=x_start.dtype))\n\n        # normalize to [-1, 1]\n\n        x_start = self.normalize_img(x_start)\n        lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img)\n\n        # random cropping during training\n        # for upsamplers\n\n        if exists(random_crop_size):\n            if is_video:\n                frames = x_start.shape[2]\n                x_start, lowres_cond_img, noise = rearrange_many(\n                    (x_start, lowres_cond_img,\n                     noise), 'b c f h w -> (b f) c h w')\n\n            aug = K.RandomCrop((random_crop_size, random_crop_size), p=1.)\n\n            # make sure low res conditioner and image both get augmented the same way\n            # detailed https://kornia.readthedocs.io/en/latest/augmentation.module.html?highlight=randomcrop#kornia.augmentation.RandomCrop\n            x_start = aug(x_start)\n            lowres_cond_img = aug(lowres_cond_img, params=aug._params)\n            noise = aug(noise, params=aug._params)\n\n            if is_video:\n                x_start, lowres_cond_img, noise = rearrange_many(\n                    (x_start, lowres_cond_img, noise),\n                    '(b f) c h w -> b c f h w',\n                    f=frames)\n\n        # get x_t\n\n        x_noisy, log_snr, alpha, sigma = noise_scheduler.q_sample(\n            x_start=x_start, t=times, noise=noise)\n\n        # also noise the lowres conditioning image\n        # at sample time, they then fix the noise level of 0.1 - 0.3\n\n        lowres_cond_img_noisy = None\n        if exists(lowres_cond_img):\n            lowres_aug_times = default(lowres_aug_times, times)\n            lowres_cond_img_noisy, *_ = self.lowres_noise_schedule.q_sample(\n                x_start=lowres_cond_img,\n                t=lowres_aug_times,\n                noise=paddle.randn(\n                    shape=lowres_cond_img.shape, dtype=lowres_cond_img.dtype))\n\n      
  # time condition\n\n        noise_cond = noise_scheduler.get_condition(times)\n\n        # unet kwargs\n\n        unet_kwargs = dict(\n            text_embeds=text_embeds,\n            text_mask=text_mask,\n            cond_images=cond_images,\n            lowres_noise_times=self.lowres_noise_schedule.get_condition(\n                lowres_aug_times),\n            lowres_cond_img=lowres_cond_img_noisy,\n            cond_drop_prob=self.cond_drop_prob, )\n\n        # self condition if needed\n\n        # Because 'unet' can be an instance of DistributedDataParallel coming from the\n        # ImagenTrainer.unet_being_trained when invoking ImagenTrainer.forward(), we need to\n        # access the member 'module' of the wrapped unet instance.\n        self_cond = unet._layers.self_cond if isinstance(\n            unet, paddle.DataParallel) else unet.self_cond\n\n        if self_cond and random() < 0.5:\n            with paddle.no_grad():\n                pred = unet.forward(x_noisy, noise_cond,\n                                    **unet_kwargs).detach()\n\n                x_start = noise_scheduler.predict_start_from_noise(\n                    x_noisy, t=times,\n                    noise=pred) if pred_objective == 'noise' else pred\n\n                unet_kwargs = { ** unet_kwargs, 'self_cond': x_start}\n\n        # get prediction\n\n        pred = unet.forward(x_noisy, noise_cond, **unet_kwargs)\n\n        # prediction objective\n\n        if pred_objective == 'noise':\n            target = noise\n        elif pred_objective == 'x_start':\n            target = x_start\n        elif pred_objective == 'v':\n            # derivation detailed in Appendix D of Progressive Distillation paper\n            # https://arxiv.org/abs/2202.00512\n            # this makes distillation viable as well as solve an issue with color shifting in upresoluting unets, noted in imagen-video\n            target = alpha * noise - sigma * x_start\n        else:\n            raise 
ValueError(f'unknown objective {pred_objective}')\n\n        return pred, target, log_snr, p2_loss_weight_gamma\n\n    def forward(self,\n                images,\n                unet=None,\n                texts=None,\n                text_embeds=None,\n                text_masks=None,\n                unet_number=None,\n                cond_images=None):\n        if self.is_video and images.ndim == 4:\n            images = rearrange(images, 'b c h w -> b c 1 h w')\n\n        assert images.shape[-1] == images.shape[\n            -2], f'the images you pass in must be a square, but received dimensions of {images.shape[2]}, {images.shape[-1]}'\n        assert not (\n            len(self.unets) > 1 and not exists(unet_number)\n        ), f'you must specify which unet you want trained, from a range of 1 to {len(self.unets)}, if you are training cascading DDPM (multiple unets)'\n        unet_number = default(unet_number, 1)\n        assert not exists(\n            self.only_train_unet_number\n        ) or self.only_train_unet_number == unet_number, 'you can only train on unet #{self.only_train_unet_number}'\n\n        images = cast_uint8_images_to_float(images)\n        cond_images = maybe(cast_uint8_images_to_float)(cond_images)\n\n        assert is_float_dtype(\n            images.dtype\n        ), f'images tensor needs to be floats but {images.dtype} dtype found instead'\n\n        unet_index = unet_number - 1\n\n        unet = default(unet, lambda: self.get_unet(unet_number))\n\n        noise_scheduler = self.noise_schedulers[unet_index]\n        p2_loss_weight_gamma = self.p2_loss_weight_gamma[unet_index]\n        pred_objective = self.pred_objectives[unet_index]\n        target_image_size = self.image_sizes[unet_index]\n        random_crop_size = self.random_crop_sizes[unet_index]\n        if self.is_sr:\n            prev_image_size = self.image_sizes[unet_index - 1]\n        else:\n            prev_image_size = None\n        b, c, h, w = images.shape\n\n        
assert images.shape[1] == self.channels\n        assert h >= target_image_size and w >= target_image_size\n\n        times = noise_scheduler.sample_random_times(b)\n\n        if exists(texts) and not exists(\n                text_embeds) and not self.unconditional:\n            assert len(texts) == len(\n                images\n            ), 'number of text captions does not match up with the number of images given'\n            with paddle.amp.auto_cast(enable=False):\n                if 't5' in self.text_encoder_name:\n                    text_embeds, text_masks = self.t5_encode_text(\n                        t5=self.t5_encoder,\n                        texts=texts,\n                        tokenizer=self.tokenizer,\n                        return_attn_mask=True)\n                elif 'deberta' in self.text_encoder_name:\n                    text_embeds, text_masks = self.debertav2_encode_text(\n                        debertav2=self.debertav2_encoder,\n                        texts=texts,\n                        tokenizer=self.tokenizer,\n                        return_attn_mask=True)\n                else:\n                    raise NotImplementedError(\n                        \"Please implement the text encoder.\")\n\n        if not self.unconditional:\n            text_masks = default(\n                text_masks, lambda: paddle.any(text_embeds != 0., axis=-1))\n\n        assert not (\n            self.condition_on_text and not exists(text_embeds)\n        ), 'text or text encodings must be passed into decoder if specified'\n        assert not (\n            not self.condition_on_text and exists(text_embeds)\n        ), 'decoder specified not to be conditioned on text, yet it is presented'\n\n        assert not (\n            exists(text_embeds) and\n            text_embeds.shape[-1] != self.text_embed_dim\n        ), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'\n\n        lowres_cond_img = lowres_aug_times = None\n 
       if exists(prev_image_size):\n            lowres_cond_img = resize_image_to(images, prev_image_size)\n            lowres_cond_img = resize_image_to(lowres_cond_img,\n                                              target_image_size)\n\n            if self.per_sample_random_aug_noise_level:\n                lowres_aug_times = self.lowres_noise_schedule.sample_random_times(\n                    b)\n            else:\n                lowres_aug_time = self.lowres_noise_schedule.sample_random_times(\n                    1)\n                lowres_aug_times = repeat(lowres_aug_time, '1 -> b', b=b)\n\n        images = resize_image_to(images, target_image_size)\n\n        return self.p_losses(\n            unet,\n            images,\n            times,\n            text_embeds=text_embeds,\n            text_mask=text_masks,\n            cond_images=cond_images,\n            noise_scheduler=noise_scheduler,\n            lowres_cond_img=lowres_cond_img,\n            lowres_aug_times=lowres_aug_times,\n            pred_objective=pred_objective,\n            p2_loss_weight_gamma=p2_loss_weight_gamma,\n            random_crop_size=random_crop_size)\n\n\ndef imagen_397M_text2im_64(**kwargs):\n    use_recompute = kwargs.pop('use_recompute')\n    recompute_granularity = kwargs.pop('recompute_granularity')\n    model = ImagenModel(\n        unets=Unet64_397M(use_recompute=use_recompute),\n        image_sizes=(64, ),\n        **kwargs)\n    return model\n\n\ndef imagen_text2im_64(**kwargs):\n    use_recompute = kwargs.pop('use_recompute')\n    recompute_granularity = kwargs.pop('recompute_granularity')\n    if 'lowres_cond' in kwargs:\n        lowres_cond = kwargs.pop('lowres_cond')\n    else:\n        lowres_cond = False\n    model = ImagenModel(\n        unets=BaseUnet64(\n            lowres_cond=lowres_cond, use_recompute=use_recompute),\n        image_sizes=(64, ),\n        **kwargs)\n    return model\n\n\ndef imagen_text2im_64_debertav2(**kwargs):\n    use_recompute = 
kwargs.pop('use_recompute')\n    recompute_granularity = kwargs.pop('recompute_granularity')\n    model = ImagenModel(\n        unets=BaseUnet64(\n            dim=360, use_recompute=use_recompute),\n        image_sizes=(64, ),\n        **kwargs)\n    return model\n\n\ndef imagen_text2im_64_SR256(**kwargs):\n    use_recompute = kwargs.pop('use_recompute')\n    recompute_granularity = kwargs.pop('recompute_granularity')\n    model = ImagenModel(\n        unets=(BaseUnet64(use_recompute=use_recompute),\n               SRUnet256(use_recompute=use_recompute)),\n        image_sizes=(64, 256),\n        **kwargs)\n    return model\n\n\ndef imagen_SR256(**kwargs):\n    use_recompute = kwargs.pop('use_recompute')\n    recompute_granularity = kwargs.pop('recompute_granularity')\n    if 'lowres_cond' in kwargs:\n        lowres_cond = kwargs.pop('lowres_cond')\n    else:\n        lowres_cond = False\n    model = ImagenModel(\n        unets=SRUnet256(\n            lowres_cond=lowres_cond, use_recompute=use_recompute),\n        image_sizes=(256, 64),\n        **kwargs)\n    return model\n\n\ndef imagen_SR1024(**kwargs):\n    use_recompute = kwargs.pop('use_recompute')\n    recompute_granularity = kwargs.pop('recompute_granularity')\n    if 'lowres_cond' in kwargs:\n        lowres_cond = kwargs.pop('lowres_cond')\n    else:\n        lowres_cond = False\n    model = ImagenModel(\n        unets=SRUnet1024(\n            dim=128, lowres_cond=lowres_cond, use_recompute=use_recompute),\n        image_sizes=(1024, 256),\n        **kwargs)\n    return model\n"
  },
  {
    "path": "ppfleetx/models/multimodal_model/imagen/unet.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nfrom pathlib import Path\nfrom functools import partial\n\nimport paddle\nfrom paddle import nn\nfrom paddle import nn, einsum\nimport paddle.nn.functional as F\nfrom paddle.distributed.fleet.utils import recompute\n\nfrom .utils import (zeros_, zero_init_, default, exists, cast_tuple, l2norm,\n                    resize_image_to, prob_mask_like, masked_mean, Identity,\n                    repeat, repeat_many, Rearrange, rearrange, rearrange_many,\n                    EinopsToAndFrom, Parallel, Always, print_once)\n\nfrom ppfleetx.models.language_model.t5.modeling import finfo\n\n\nclass LayerNorm(nn.Layer):\n    def __init__(self, feats, stable=False, dim=-1):\n        super().__init__()\n        self.stable = stable\n        self.dim = dim\n\n        self.g = self.create_parameter(\n            [feats, *((1, ) * (-dim - 1))],\n            default_initializer=nn.initializer.Constant(value=1.))\n\n    def forward(self, x):\n        dtype, dim = x.dtype, self.dim\n\n        if self.stable:\n            x = x / x.amax(axis=dim, keepdim=True).detach()\n\n        eps = 1e-5 if x.dtype == paddle.float32 else 1e-3\n        var = paddle.var(x, axis=dim, unbiased=False, keepdim=True)\n        mean = paddle.mean(x, axis=dim, keepdim=True)\n\n        return (x - mean) * (\n            var + eps).rsqrt().cast(dtype) * 
self.g.cast(dtype)\n\n\nChanLayerNorm = partial(LayerNorm, dim=-3)\n\n\nclass Residual(nn.Layer):\n    def __init__(self, fn):\n        super().__init__()\n        self.fn = fn\n\n    def forward(self, x, **kwargs):\n        return self.fn(x, **kwargs) + x\n\n\n# attention pooling\n\n\nclass PerceiverAttention(nn.Layer):\n    def __init__(self, *, dim, dim_head=64, heads=8, cosine_sim_attn=False):\n        super().__init__()\n        self.scale = dim_head**-0.5 if not cosine_sim_attn else 1\n        self.cosine_sim_attn = cosine_sim_attn\n        self.cosine_sim_scale = 16 if cosine_sim_attn else 1\n\n        self.heads = heads\n        inner_dim = dim_head * heads\n\n        self.norm = nn.LayerNorm(dim)\n        self.norm_latents = nn.LayerNorm(dim)\n\n        self.to_q = nn.Linear(dim, inner_dim, bias_attr=False)\n        self.to_kv = nn.Linear(dim, inner_dim * 2, bias_attr=False)\n\n        self.to_out = nn.Sequential(\n            nn.Linear(\n                inner_dim, dim, bias_attr=False), nn.LayerNorm(dim))\n\n    def forward(self, x, latents, mask=None):\n        x = self.norm(x)\n        latents = self.norm_latents(latents)\n\n        b, h = x.shape[0], self.heads\n\n        q = self.to_q(latents)\n\n        # the paper differs from Perceiver in which they also concat the key / values derived from the latents to be attended to\n        kv_input = paddle.concat((x, latents), axis=-2)\n        k, v = self.to_kv(kv_input).chunk(2, axis=-1)\n\n        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h=h)\n\n        q = q * self.scale\n\n        # cosine sim attention\n\n        if self.cosine_sim_attn:\n            q, k = map(l2norm, (q, k))\n\n        # similarities and masking\n\n        sim = einsum('... i d, ... j d  -> ... 
i j', q,\n                     k) * self.cosine_sim_scale\n\n        if exists(mask):\n            max_neg_value = -finfo(sim.dtype).max\n            mask = F.pad(mask, (0, latents.shape[-2]), value=True)\n            mask = rearrange(mask, 'b j -> b 1 1 j')\n            sim = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), sim)\n\n        # attention\n\n        attn = F.softmax(sim, axis=-1, dtype=paddle.float32)\n        attn = attn.cast(sim.dtype)\n\n        out = einsum('... i j, ... j d -> ... i d', attn, v)\n        B, H, N, D = out.shape\n        out = out.transpose([0, 2, 1, 3]).reshape([B, N, -1])\n        return self.to_out(out)\n\n\nclass PerceiverResampler(nn.Layer):\n    def __init__(\n            self,\n            *,\n            dim,\n            depth,\n            dim_head=64,\n            heads=8,\n            num_latents=64,\n            num_latents_mean_pooled=4,  # number of latents derived from mean pooled representation of the sequence\n            max_seq_len=512,\n            ff_mult=4,\n            cosine_sim_attn=False):\n        super().__init__()\n        self.pos_emb = nn.Embedding(max_seq_len, dim)\n\n        self.latents = self.create_parameter(\n            [num_latents, dim], default_initializer=nn.initializer.Normal())\n\n        self.to_latents_from_mean_pooled_seq = None\n\n        if num_latents_mean_pooled > 0:\n            self.to_latents_from_mean_pooled_seq = nn.Sequential(\n                LayerNorm(dim),\n                nn.Linear(dim, dim * num_latents_mean_pooled),\n                Rearrange(\n                    'b (n d) -> b n d', n=num_latents_mean_pooled))\n\n        self.layers = nn.LayerList([])\n        for _ in range(depth):\n            self.layers.append(\n                nn.LayerList([\n                    PerceiverAttention(\n                        dim=dim,\n                        dim_head=dim_head,\n                        heads=heads,\n                        cosine_sim_attn=cosine_sim_attn), 
FeedForward(\n                            dim=dim, mult=ff_mult)\n                ]))\n\n    def forward(self, x, mask=None):\n        n = x.shape[1]\n        pos_emb = self.pos_emb(paddle.arange(n))\n\n        x_with_pos = x + pos_emb\n\n        latents = repeat(self.latents, 'n d -> b n d', b=x.shape[0])\n\n        if exists(self.to_latents_from_mean_pooled_seq):\n            meanpooled_seq = masked_mean(\n                x, axis=1, mask=paddle.ones(\n                    x.shape[:2], dtype=paddle.bool))\n            meanpooled_latents = self.to_latents_from_mean_pooled_seq(\n                meanpooled_seq)\n            latents = paddle.concat((meanpooled_latents, latents), axis=-2)\n\n        for attn, ff in self.layers:\n            latents = attn(x_with_pos, latents, mask=mask) + latents\n            latents = ff(latents) + latents\n\n        return latents\n\n\n# attention\n\n\nclass Attention(nn.Layer):\n    def __init__(\n            self,\n            dim,\n            *,\n            dim_head=64,\n            heads=8,\n            context_dim=None,\n            cosine_sim_attn=False,\n            use_recompute=False, ):\n        super().__init__()\n        self.use_recompute = use_recompute\n        self.scale = dim_head**-0.5 if not cosine_sim_attn else 1.\n        self.cosine_sim_attn = cosine_sim_attn\n        self.cosine_sim_scale = 16 if cosine_sim_attn else 1\n\n        self.heads = heads\n        inner_dim = dim_head * heads\n\n        self.norm = LayerNorm(dim)\n\n        self.null_kv = self.create_parameter(\n            [2, dim_head], default_initializer=nn.initializer.Normal())\n        self.to_q = nn.Linear(dim, inner_dim, bias_attr=False)\n        self.to_kv = nn.Linear(dim, dim_head * 2, bias_attr=False)\n\n        self.to_context = nn.Sequential(\n            nn.LayerNorm(context_dim), nn.Linear(\n                context_dim, dim_head * 2)) if exists(context_dim) else None\n\n        self.to_out = nn.Sequential(\n            nn.Linear(\n     
           inner_dim, dim, bias_attr=False), LayerNorm(dim))\n\n    def forward(self, x, context=None, mask=None, attn_bias=None):\n        if self.use_recompute:\n            return recompute(self._forward, x, context, mask, attn_bias)\n        else:\n            return self._forward(x, context, mask, attn_bias)\n\n    def _forward(self, x, context=None, mask=None, attn_bias=None):\n        b, n = x.shape[:2]\n\n        x = self.norm(x)\n\n        q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, axis=-1))\n\n        q = rearrange(q, 'b n (h d) -> b h n d', h=self.heads)\n\n        q = q * self.scale\n\n        # add null key / value for classifier free guidance in prior net\n\n        nk, nv = repeat_many(self.null_kv.unbind(axis=-2), 'd -> b 1 d', b=b)\n        k = paddle.concat((nk, k), axis=-2)\n        v = paddle.concat((nv, v), axis=-2)\n\n        # add text conditioning, if present\n\n        if exists(context):\n            assert exists(self.to_context)\n            ck, cv = self.to_context(context).chunk(2, axis=-1)\n            k = paddle.concat((ck, k), axis=-2)\n            v = paddle.concat((cv, v), axis=-2)\n\n        # cosine sim attention\n\n        if self.cosine_sim_attn:\n            q, k = map(l2norm, (q, k))\n\n        # calculate query / key similarities\n\n        sim = einsum('b h i d, b j d -> b h i j', q, k) * self.cosine_sim_scale\n\n        # relative positional encoding (T5 style)\n\n        if exists(attn_bias):\n            sim = sim + attn_bias\n\n        # masking\n\n        max_neg_value = -finfo(sim.dtype).max\n\n        if exists(mask):\n            mask = F.pad(mask, (1, 0), value=True)\n            mask = rearrange(mask, 'b j -> b 1 1 j')\n            sim = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), sim)\n\n        # attention\n\n        attn = F.softmax(sim, axis=-1, dtype=paddle.float32)\n\n        # aggregate values\n\n        out = einsum('b h i j, b j d -> b h i d', attn, v)\n\n        out = rearrange(out, 
'b h n d -> b n (h d)')\n        return self.to_out(out)\n\n\n# decoder\n\n\ndef Upsample(dim, dim_out=None):\n    dim_out = default(dim_out, dim)\n\n    return nn.Sequential(\n        nn.Upsample(\n            scale_factor=2, mode='nearest'),\n        nn.Conv2D(\n            dim, dim_out, 3, padding=1))\n\n\nclass PixelShuffleUpsample(nn.Layer):\n    \"\"\"\n    code shared by @MalumaDev at DALLE2 for addressing checkboard artifacts\n    https://arxiv.org/ftp/arxiv/papers/1707/1707.02937.pdf\n    \"\"\"\n\n    def __init__(self, dim, dim_out=None):\n        super().__init__()\n        dim_out = default(dim_out, dim)\n        conv = nn.Conv2D(dim, dim_out * 4, 1)\n\n        self.net = nn.Sequential(conv, nn.Silu(), nn.PixelShuffle(2))\n\n        self.init_conv_(conv)\n\n    def init_conv_(self, conv):\n        o, i, h, w = conv.weight.shape\n        conv_weight = paddle.empty([o // 4, i, h, w])\n        nn.initializer.KaimingUniform(conv_weight)\n        conv_weight = repeat(conv_weight, 'o ... 
-> (o 4) ...')\n\n        conv.weight.set_value(conv_weight)\n        zeros_(conv.bias)\n\n    def forward(self, x):\n        return self.net(x)\n\n\ndef Downsample(dim, dim_out=None):\n    dim_out = default(dim_out, dim)\n    return nn.Sequential(\n        Rearrange(\n            'b c (h s1) (w s2) -> b (c s1 s2) h w', s1=2, s2=2),\n        nn.Conv2D(dim * 4, dim_out, 1))\n\n\nclass SinusoidalPosEmb(nn.Layer):\n    def __init__(self, dim):\n        super().__init__()\n        self.dim = dim\n\n    def forward(self, x):\n        half_dim = self.dim // 2\n        emb = math.log(10000) / (half_dim - 1)\n        emb = paddle.exp(paddle.arange(half_dim) * -emb)\n        emb = x[:, None] * emb[None, :]\n        return paddle.concat((emb.sin(), emb.cos()), axis=-1)\n\n\nclass LearnedSinusoidalPosEmb(nn.Layer):\n    \"\"\" following @crowsonkb 's lead with learned sinusoidal pos emb \"\"\"\n    \"\"\" https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/models/danbooru_128.py#L8 \"\"\"\n\n    def __init__(self, dim):\n        super().__init__()\n        assert (dim % 2) == 0\n        half_dim = dim // 2\n        self.weights = self.create_parameter(\n            [half_dim], default_initializer=nn.initializer.Normal())\n\n    def forward(self, x):\n        x = x[:, None]\n        freqs = x * self.weights[None, :] * 2 * math.pi\n        fouriered = paddle.concat((freqs.sin(), freqs.cos()), axis=-1)\n        fouriered = paddle.concat((x, fouriered), axis=-1)\n        return fouriered\n\n\nclass Block(nn.Layer):\n    def __init__(self, dim, dim_out, groups=8, norm=True):\n        super().__init__()\n        self.groupnorm = nn.GroupNorm(groups, dim) if norm else Identity()\n        self.activation = nn.Silu()\n        self.project = nn.Conv2D(dim, dim_out, 3, padding=1)\n\n    def forward(self, x, scale_shift=None):\n        x = self.groupnorm(x)\n\n        if exists(scale_shift):\n            scale, shift = scale_shift\n            x = x * (scale + 1) + 
shift\n\n        x = self.activation(x)\n        return self.project(x)\n\n\nclass ResnetBlock(nn.Layer):\n    def __init__(self,\n                 dim,\n                 dim_out,\n                 *,\n                 cond_dim=None,\n                 time_cond_dim=None,\n                 groups=8,\n                 linear_attn=False,\n                 use_gca=False,\n                 squeeze_excite=False,\n                 use_recompute=False,\n                 **attn_kwargs):\n        super().__init__()\n\n        self.time_mlp = None\n        self.use_recompute = use_recompute\n\n        if exists(time_cond_dim):\n            self.time_mlp = nn.Sequential(\n                nn.Silu(), nn.Linear(time_cond_dim, dim_out * 2))\n\n        self.cross_attn = None\n\n        if exists(cond_dim):\n            attn_klass = CrossAttention if not linear_attn else LinearCrossAttention\n\n            self.cross_attn = attn_klass(\n                dim=dim_out, context_dim=cond_dim, **attn_kwargs)\n\n        self.block1 = Block(dim, dim_out, groups=groups)\n        self.block2 = Block(dim_out, dim_out, groups=groups)\n\n        self.gca = GlobalContext(\n            dim_in=dim_out, dim_out=dim_out) if use_gca else Always(1)\n\n        self.res_conv = nn.Conv2D(dim, dim_out,\n                                  1) if dim != dim_out else Identity()\n\n    def forward(self, x, time_emb=None, cond=None):\n        scale_shift = None\n        if exists(self.time_mlp) and exists(time_emb):\n            time_emb = self.time_mlp(time_emb)\n            time_emb = time_emb[:, :, None, None]\n            scale_shift = time_emb.chunk(2, axis=1)\n\n        h = self.block1(x)\n\n        if exists(self.cross_attn):\n            assert exists(cond)\n            h = h.transpose([0, 2, 3, 1])\n            n, b, c, *_ = h.shape\n            h = h.reshape([n, b * c, -1])\n            h = self.cross_attn(h, context=cond) + h\n            h = h.reshape([n, b, c, -1])\n            h = h.transpose([0, 3, 
1, 2])\n\n        h = self.block2(h, scale_shift=scale_shift)\n\n        h = h * self.gca(h)\n\n        return h + self.res_conv(x)\n\n\nclass CrossAttention(nn.Layer):\n    def __init__(self,\n                 dim,\n                 *,\n                 context_dim=None,\n                 dim_head=64,\n                 heads=8,\n                 norm_context=False,\n                 cosine_sim_attn=False):\n        super().__init__()\n        self.scale = dim_head**-0.5 if not cosine_sim_attn else 1.\n        self.cosine_sim_attn = cosine_sim_attn\n        self.cosine_sim_scale = 16 if cosine_sim_attn else 1\n\n        self.heads = heads\n        inner_dim = dim_head * heads\n\n        context_dim = default(context_dim, dim)\n\n        self.norm = LayerNorm(dim)\n        self.norm_context = LayerNorm(\n            context_dim) if norm_context else Identity()\n\n        self.null_kv = self.create_parameter(\n            [2, dim_head], default_initializer=nn.initializer.Normal())\n        self.to_q = nn.Linear(dim, inner_dim, bias_attr=False)\n        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias_attr=False)\n\n        self.to_out = nn.Sequential(\n            nn.Linear(\n                inner_dim, dim, bias_attr=False), LayerNorm(dim))\n\n    def forward(self, x, context, mask=None):\n        b, n = x.shape[:2]\n\n        x = self.norm(x)\n        context = self.norm_context(context)\n\n        q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, axis=-1))\n\n        q, k, v = rearrange_many(\n            (q, k, v), 'b n (h d) -> b h n d', h=self.heads)\n\n        # add null key / value for classifier free guidance in prior net\n\n        nk, nv = repeat_many(\n            self.null_kv.unbind(axis=-2), 'd -> b h 1 d', h=self.heads, b=b)\n\n        k = paddle.concat((nk, k), axis=-2)\n        v = paddle.concat((nv, v), axis=-2)\n\n        q = q * self.scale\n\n        # cosine sim attention\n\n        if self.cosine_sim_attn:\n            q, k = 
map(l2norm, (q, k))\n\n        # similarities\n\n        sim = einsum('b h i d, b h j d -> b h i j', q,\n                     k) * self.cosine_sim_scale\n\n        # masking\n\n        max_neg_value = -finfo(sim.dtype).max\n\n        if exists(mask):\n            mask = F.pad(mask, (1, 0), value=True)\n            mask = rearrange(mask, 'b j -> b 1 1 j')\n            sim = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), sim)\n\n        attn = F.softmax(sim, axis=-1, dtype=paddle.float32)\n        attn = attn.cast(sim.dtype)\n\n        out = einsum('b h i j, b h j d -> b h i d', attn, v)\n        out = rearrange(out, 'b h n d -> b n (h d)')\n        return self.to_out(out)\n\n\nclass LinearCrossAttention(CrossAttention):\n    def forward(self, x, context, mask=None):\n        b, n = x.shape[:2]\n\n        x = self.norm(x)\n        context = self.norm_context(context)\n\n        q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, axis=-1))\n\n        q, k, v = rearrange_many(\n            (q, k, v), 'b n (h d) -> (b h) n d', h=self.heads)\n\n        # add null key / value for classifier free guidance in prior net\n\n        nk, nv = repeat_many(\n            self.null_kv.unbind(axis=-2), 'd -> (b h) 1 d', h=self.heads, b=b)\n\n        k = paddle.concat((nk, k), axis=-2)\n        v = paddle.concat((nv, v), axis=-2)\n\n        # masking\n\n        max_neg_value = -finfo(x.dtype).max\n\n        if exists(mask):\n            mask = F.pad(mask, (1, 0), value=True)\n            mask = rearrange(mask, 'b n -> b n 1')\n            k = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), k)\n            v = paddle.where(mask == 0, paddle.to_tensor(0.), v)\n\n        # linear attention\n\n        q = F.softmax(q, axis=-1)\n        k = F.softmax(k, axis=-2)\n\n        q = q * self.scale\n\n        context = einsum('b n d, b n e -> b d e', k, v)\n        out = einsum('b n d, b d e -> b n e', q, context)\n        out = rearrange(out, '(b h) n d -> b n (h d)', 
h=self.heads)\n        return self.to_out(out)\n\n\nclass LinearAttention(nn.Layer):\n    def __init__(self,\n                 dim,\n                 dim_head=32,\n                 heads=8,\n                 dropout=0.05,\n                 context_dim=None,\n                 **kwargs):\n        super().__init__()\n        self.scale = dim_head**-0.5\n        self.heads = heads\n        inner_dim = dim_head * heads\n        self.norm = ChanLayerNorm(dim)\n\n        self.nonlin = nn.Silu()\n\n        self.to_q = nn.Sequential(\n            nn.Dropout(dropout),\n            nn.Conv2D(\n                dim, inner_dim, 1, bias_attr=False),\n            nn.Conv2D(\n                inner_dim,\n                inner_dim,\n                3,\n                bias_attr=False,\n                padding=1,\n                groups=inner_dim))\n\n        self.to_k = nn.Sequential(\n            nn.Dropout(dropout),\n            nn.Conv2D(\n                dim, inner_dim, 1, bias_attr=False),\n            nn.Conv2D(\n                inner_dim,\n                inner_dim,\n                3,\n                bias_attr=False,\n                padding=1,\n                groups=inner_dim))\n\n        self.to_v = nn.Sequential(\n            nn.Dropout(dropout),\n            nn.Conv2D(\n                dim, inner_dim, 1, bias_attr=False),\n            nn.Conv2D(\n                inner_dim,\n                inner_dim,\n                3,\n                bias_attr=False,\n                padding=1,\n                groups=inner_dim))\n\n        self.to_context = nn.Sequential(\n            nn.LayerNorm(context_dim),\n            nn.Linear(\n                context_dim, inner_dim * 2,\n                bias_attr=False)) if exists(context_dim) else None\n\n        self.to_out = nn.Sequential(\n            nn.Conv2D(\n                inner_dim, dim, 1, bias_attr=False), ChanLayerNorm(dim))\n\n    def forward(self, fmap, context=None):\n        h, x, y = self.heads, *fmap.shape[-2:]\n\n       
 fmap = self.norm(fmap)\n        q, k, v = map(lambda fn: fn(fmap), (self.to_q, self.to_k, self.to_v))\n        q, k, v = rearrange_many(\n            (q, k, v), 'b (h c) x y -> (b h) (x y) c', h=h)\n\n        if exists(context):\n            assert exists(self.to_context)\n            ck, cv = self.to_context(context).chunk(2, axis=-1)\n            ck, cv = rearrange_many((ck, cv), 'b n (h d) -> (b h) n d', h=h)\n            k = paddle.concat((k, ck), axis=-2)\n            v = paddle.concat((v, cv), axis=-2)\n\n        q = F.softmax(q, axis=-1)\n        k = F.softmax(k, axis=-2)\n\n        q = q * self.scale\n\n        context = einsum('b n d, b n e -> b d e', k, v)\n        out = einsum('b n d, b d e -> b n e', q, context)\n        out = rearrange(out, '(b h) (x y) d -> b (h d) x y', h=h, x=x, y=y)\n\n        out = self.nonlin(out)\n        return self.to_out(out)\n\n\nclass GlobalContext(nn.Layer):\n    \"\"\" basically a superior form of squeeze-excitation that is attention-esque \"\"\"\n\n    def __init__(self, *, dim_in, dim_out):\n        super().__init__()\n        self.to_k = nn.Conv2D(dim_in, 1, 1)\n        hidden_dim = max(3, dim_out // 2)\n\n        self.net = nn.Sequential(\n            nn.Conv2D(dim_in, hidden_dim, 1),\n            nn.Silu(), nn.Conv2D(hidden_dim, dim_out, 1), nn.Sigmoid())\n\n    def forward(self, x):\n        context = self.to_k(x)\n        x, context = rearrange_many((x, context), 'b n ... 
-> b n (...)')\n        out = einsum('b i n, b c n -> b c i', F.softmax(context, axis=-1), x)\n        out = out[:, :, :, None]\n        return self.net(out)\n\n\ndef FeedForward(dim, mult=2):\n    hidden_dim = int(dim * mult)\n    return nn.Sequential(\n        LayerNorm(dim),\n        nn.Linear(\n            dim, hidden_dim, bias_attr=False),\n        nn.GELU(),\n        LayerNorm(hidden_dim),\n        nn.Linear(\n            hidden_dim, dim, bias_attr=False))\n\n\ndef ChanFeedForward(\n        dim, mult=2\n):  # in paper, it seems for self attention layers they did feedforwards with twice channel width\n    hidden_dim = int(dim * mult)\n    return nn.Sequential(\n        ChanLayerNorm(dim),\n        nn.Conv2D(\n            dim, hidden_dim, 1, bias_attr=False),\n        nn.GELU(),\n        ChanLayerNorm(hidden_dim),\n        nn.Conv2D(\n            hidden_dim, dim, 1, bias_attr=False))\n\n\nclass TransformerBlock(nn.Layer):\n    def __init__(\n            self,\n            dim,\n            *,\n            depth=1,\n            heads=8,\n            dim_head=32,\n            ff_mult=2,\n            context_dim=None,\n            cosine_sim_attn=False,\n            use_recompute=False, ):\n        super().__init__()\n        self.layers = nn.LayerList([])\n\n        for _ in range(depth):\n            self.layers.append(\n                nn.LayerList([\n                    Attention(\n                        dim=dim,\n                        heads=heads,\n                        dim_head=dim_head,\n                        context_dim=context_dim,\n                        cosine_sim_attn=cosine_sim_attn,\n                        use_recompute=use_recompute), FeedForward(\n                            dim=dim, mult=ff_mult)\n                ]))\n\n    def forward(self, x, context=None):\n        x = x.transpose([0, 2, 3, 1])\n        n, b, c, *_ = x.shape\n        x = x.reshape([n, b * c, -1])\n        for attn, ff in self.layers:\n            x = attn(x, 
context=context) + x\n            x = ff(x) + x\n        x = x.reshape([n, b, c, -1])\n        x = x.transpose([0, 3, 1, 2])\n        return x\n\n\nclass LinearAttentionTransformerBlock(nn.Layer):\n    def __init__(self,\n                 dim,\n                 *,\n                 depth=1,\n                 heads=8,\n                 dim_head=32,\n                 ff_mult=2,\n                 context_dim=None,\n                 **kwargs):\n        super().__init__()\n        self.layers = nn.LayerList([])\n\n        for _ in range(depth):\n            self.layers.append(\n                nn.LayerList([\n                    LinearAttention(\n                        dim=dim,\n                        heads=heads,\n                        dim_head=dim_head,\n                        context_dim=context_dim), ChanFeedForward(\n                            dim=dim, mult=ff_mult)\n                ]))\n\n    def forward(self, x, context=None):\n        for attn, ff in self.layers:\n            x = attn(x, context=context) + x\n            x = ff(x) + x\n        return x\n\n\nclass CrossEmbedLayer(nn.Layer):\n    def __init__(self, dim_in, kernel_sizes, dim_out=None, stride=2):\n        super().__init__()\n        assert all([*map(lambda t: (t % 2) == (stride % 2), kernel_sizes)])\n        dim_out = default(dim_out, dim_in)\n\n        kernel_sizes = sorted(kernel_sizes)\n        num_scales = len(kernel_sizes)\n\n        # calculate the dimension at each scale\n        dim_scales = [int(dim_out / (2**i)) for i in range(1, num_scales)]\n        dim_scales = [*dim_scales, dim_out - sum(dim_scales)]\n\n        self.convs = nn.LayerList([])\n        for kernel, dim_scale in zip(kernel_sizes, dim_scales):\n            self.convs.append(\n                nn.Conv2D(\n                    dim_in,\n                    dim_scale,\n                    kernel,\n                    stride=stride,\n                    padding=(kernel - stride) // 2))\n\n    def forward(self, x):\n        
fmaps = tuple(map(lambda conv: conv(x), self.convs))\n        return paddle.concat(fmaps, axis=1)\n\n\nclass UpsampleCombiner(nn.Layer):\n    def __init__(self,\n                 dim,\n                 *,\n                 enabled=False,\n                 dim_ins=tuple(),\n                 dim_outs=tuple()):\n        super().__init__()\n        dim_outs = cast_tuple(dim_outs, len(dim_ins))\n        assert len(dim_ins) == len(dim_outs)\n\n        self.enabled = enabled\n\n        if not self.enabled:\n            self.dim_out = dim\n            return\n\n        self.fmap_convs = nn.LayerList([\n            Block(dim_in, dim_out)\n            for dim_in, dim_out in zip(dim_ins, dim_outs)\n        ])\n        self.dim_out = dim + (sum(dim_outs) if len(dim_outs) > 0 else 0)\n\n    def forward(self, x, fmaps=None):\n        target_size = x.shape[-1]\n\n        fmaps = default(fmaps, tuple())\n\n        if not self.enabled or len(fmaps) == 0 or len(self.fmap_convs) == 0:\n            return x\n\n        fmaps = [resize_image_to(fmap, target_size) for fmap in fmaps]\n        outs = [conv(fmap) for fmap, conv in zip(fmaps, self.fmap_convs)]\n        return paddle.concat((x, *outs), axis=1)\n\n\nclass Unet(nn.Layer):\n    def __init__(self,\n                 *,\n                 dim,\n                 image_embed_dim=1024,\n                 text_embed_dim=1024,\n                 num_resnet_blocks=1,\n                 cond_dim=None,\n                 num_image_tokens=4,\n                 num_time_tokens=2,\n                 learned_sinu_pos_emb_dim=16,\n                 out_dim=None,\n                 dim_mults=(1, 2, 4, 8),\n                 cond_images_channels=0,\n                 channels=3,\n                 channels_out=None,\n                 attn_dim_head=64,\n                 attn_heads=8,\n                 ff_mult=2.,\n                 lowres_cond=False,\n                 layer_attns=True,\n                 layer_attns_depth=1,\n                 
layer_mid_attns_depth=1,\n                 layer_attns_add_text_cond=True,\n                 attend_at_middle=True,\n                 layer_cross_attns=True,\n                 use_linear_attn=False,\n                 use_linear_cross_attn=False,\n                 cond_on_text=True,\n                 max_text_len=256,\n                 init_dim=None,\n                 resnet_groups=8,\n                 init_conv_kernel_size=7,\n                 init_cross_embed=True,\n                 init_cross_embed_kernel_sizes=(3, 7, 15),\n                 cross_embed_downsample=False,\n                 cross_embed_downsample_kernel_sizes=(2, 4),\n                 attn_pool_text=True,\n                 attn_pool_num_latents=32,\n                 dropout=0.,\n                 memory_efficient=False,\n                 init_conv_to_final_conv_residual=False,\n                 use_global_context_attn=True,\n                 scale_skip_connection=True,\n                 final_resnet_block=True,\n                 final_conv_kernel_size=3,\n                 cosine_sim_attn=False,\n                 self_cond=False,\n                 combine_upsample_fmaps=False,\n                 pixel_shuffle_upsample=True,\n                 use_recompute=False):\n        super().__init__()\n\n        self.use_recompute = use_recompute\n        # guide researchers\n\n        assert attn_heads > 1, 'you need to have more than 1 attention head, ideally at least 4 or 8'\n\n        if dim < 128:\n            print_once(\n                'The base dimension of your u-net should ideally be no smaller than 128, as recommended by a professional DDPM trainer https://nonint.com/2022/05/04/friends-dont-let-friends-train-small-diffusion-models/'\n            )\n\n        # save locals to take care of some hyperparameters for cascading DDPM\n\n        self._locals = locals()\n        self._locals.pop('self', None)\n        self._locals.pop('__class__', None)\n\n        # determine dimensions\n\n        
self.channels = channels\n        self.channels_out = default(channels_out, channels)\n\n        init_channels = channels * (1 + int(lowres_cond) + int(self_cond))\n        init_dim = default(init_dim, dim)\n\n        self.self_cond = self_cond\n\n        # optional image conditioning\n\n        self.has_cond_image = cond_images_channels > 0\n        self.cond_images_channels = cond_images_channels\n\n        init_channels += cond_images_channels\n\n        # initial convolution\n\n        self.init_conv = CrossEmbedLayer(\n            init_channels,\n            dim_out=init_dim,\n            kernel_sizes=init_cross_embed_kernel_sizes,\n            stride=1) if init_cross_embed else nn.Conv2D(\n                init_channels,\n                init_dim,\n                init_conv_kernel_size,\n                padding=init_conv_kernel_size // 2)\n\n        dims = [init_dim, *map(lambda m: dim * m, dim_mults)]\n        in_out = list(zip(dims[:-1], dims[1:]))\n\n        # time conditioning\n\n        cond_dim = default(cond_dim, dim)\n        time_cond_dim = dim * 4 * (2 if lowres_cond else 1)\n\n        # embedding time for log(snr) noise from continuous version\n\n        sinu_pos_emb = LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim)\n        sinu_pos_emb_input_dim = learned_sinu_pos_emb_dim + 1\n\n        self.to_time_hiddens = nn.Sequential(\n            sinu_pos_emb,\n            nn.Linear(sinu_pos_emb_input_dim, time_cond_dim), nn.Silu())\n\n        self.to_time_cond = nn.Sequential(\n            nn.Linear(time_cond_dim, time_cond_dim))\n\n        # project to time tokens as well as time hiddens\n\n        self.to_time_tokens = nn.Sequential(\n            nn.Linear(time_cond_dim, cond_dim * num_time_tokens),\n            Rearrange(\n                'b (n d) -> b n d', n=num_time_tokens))\n\n        # low res aug noise conditioning\n\n        self.lowres_cond = lowres_cond\n\n        if lowres_cond:\n            self.to_lowres_time_hiddens = nn.Sequential(\n     
           LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim),\n                nn.Linear(learned_sinu_pos_emb_dim + 1, time_cond_dim),\n                nn.Silu())\n\n            self.to_lowres_time_cond = nn.Sequential(\n                nn.Linear(time_cond_dim, time_cond_dim))\n\n            self.to_lowres_time_tokens = nn.Sequential(\n                nn.Linear(time_cond_dim, cond_dim * num_time_tokens),\n                Rearrange(\n                    'b (n d) -> b n d', n=num_time_tokens))\n\n        # normalizations\n\n        self.norm_cond = nn.LayerNorm(cond_dim)\n\n        # text encoding conditioning (optional)\n\n        self.text_to_cond = None\n\n        if cond_on_text:\n            assert exists(\n                text_embed_dim\n            ), 'text_embed_dim must be given to the unet if cond_on_text is True'\n            self.text_to_cond = nn.Linear(text_embed_dim, cond_dim)\n\n        # finer control over whether to condition on text encodings\n\n        self.cond_on_text = cond_on_text\n\n        # attention pooling\n\n        self.attn_pool = PerceiverResampler(\n            dim=cond_dim,\n            depth=2,\n            dim_head=attn_dim_head,\n            heads=attn_heads,\n            num_latents=attn_pool_num_latents,\n            cosine_sim_attn=cosine_sim_attn) if attn_pool_text else None\n\n        # for classifier free guidance\n\n        self.max_text_len = max_text_len\n\n        self.null_text_embed = self.create_parameter(\n            [1, max_text_len, cond_dim],\n            default_initializer=nn.initializer.Normal())\n        self.null_text_hidden = self.create_parameter(\n            [1, time_cond_dim], default_initializer=nn.initializer.Normal())\n\n        # for non-attention based text conditioning at all points in the network where time is also conditioned\n\n        self.to_text_non_attn_cond = None\n\n        if cond_on_text:\n            self.to_text_non_attn_cond = nn.Sequential(\n                
nn.LayerNorm(cond_dim),\n                nn.Linear(cond_dim, time_cond_dim),\n                nn.Silu(), nn.Linear(time_cond_dim, time_cond_dim))\n\n        # attention related params\n\n        attn_kwargs = dict(\n            heads=attn_heads,\n            dim_head=attn_dim_head,\n            cosine_sim_attn=cosine_sim_attn,\n            use_recompute=use_recompute)\n\n        num_layers = len(in_out)\n\n        # resnet block klass\n\n        num_resnet_blocks = cast_tuple(num_resnet_blocks, num_layers)\n        resnet_groups = cast_tuple(resnet_groups, num_layers)\n\n        resnet_klass = partial(ResnetBlock, **attn_kwargs)\n\n        layer_attns = cast_tuple(layer_attns, num_layers)\n        layer_attns_depth = cast_tuple(layer_attns_depth, num_layers)\n        layer_cross_attns = cast_tuple(layer_cross_attns, num_layers)\n\n        use_linear_attn = cast_tuple(use_linear_attn, num_layers)\n        use_linear_cross_attn = cast_tuple(use_linear_cross_attn, num_layers)\n\n        assert all([\n            layers == num_layers\n            for layers in list(\n                map(len, (resnet_groups, layer_attns, layer_cross_attns)))\n        ])\n\n        # downsample klass\n\n        downsample_klass = Downsample\n\n        if cross_embed_downsample:\n            downsample_klass = partial(\n                CrossEmbedLayer,\n                kernel_sizes=cross_embed_downsample_kernel_sizes)\n\n        # initial resnet block (for memory efficient unet)\n\n        self.init_resnet_block = resnet_klass(\n            init_dim,\n            init_dim,\n            time_cond_dim=time_cond_dim,\n            groups=resnet_groups[0],\n            use_gca=use_global_context_attn) if memory_efficient else None\n\n        # scale for resnet skip connections\n\n        self.skip_connect_scale = 1. 
if not scale_skip_connection else (2\n                                                                        **-0.5)\n\n        # layers\n\n        self.downs = nn.LayerList([])\n        self.ups = nn.LayerList([])\n        num_resolutions = len(in_out)\n\n        layer_params = [\n            num_resnet_blocks, resnet_groups, layer_attns, layer_attns_depth,\n            layer_cross_attns, use_linear_attn, use_linear_cross_attn\n        ]\n        reversed_layer_params = list(map(reversed, layer_params))\n\n        # downsampling layers\n\n        skip_connect_dims = []  # keep track of skip connection dimensions\n\n        for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups,\n                  layer_attn, layer_attn_depth, layer_cross_attn,\n                  layer_use_linear_attn, layer_use_linear_cross_attn\n                  ) in enumerate(zip(in_out, *layer_params)):\n            is_last = ind >= (num_resolutions - 1)\n\n            layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None\n\n            if layer_attn:\n                transformer_block_klass = TransformerBlock\n            elif layer_use_linear_attn:\n                transformer_block_klass = LinearAttentionTransformerBlock\n            else:\n                transformer_block_klass = Identity\n\n            current_dim = dim_in\n\n            # whether to pre-downsample, from memory efficient unet\n\n            pre_downsample = None\n\n            if memory_efficient:\n                pre_downsample = downsample_klass(dim_in, dim_out)\n                current_dim = dim_out\n\n            skip_connect_dims.append(current_dim)\n\n            # whether to do post-downsample, for non-memory efficient unet\n\n            post_downsample = None\n            if not memory_efficient:\n                post_downsample = downsample_klass(\n                    current_dim, dim_out) if not is_last else Parallel(\n                        nn.Conv2D(\n                 
           dim_in, dim_out, 3, padding=1),\n                        nn.Conv2D(dim_in, dim_out, 1))\n\n            self.downs.append(\n                nn.LayerList([\n                    pre_downsample, resnet_klass(\n                        current_dim,\n                        current_dim,\n                        cond_dim=layer_cond_dim,\n                        linear_attn=layer_use_linear_cross_attn,\n                        time_cond_dim=time_cond_dim,\n                        groups=groups,\n                        use_recompute=use_recompute), nn.LayerList([\n                            ResnetBlock(\n                                current_dim,\n                                current_dim,\n                                time_cond_dim=time_cond_dim,\n                                groups=groups,\n                                use_gca=use_global_context_attn,\n                                use_recompute=use_recompute)\n                            for _ in range(layer_num_resnet_blocks)\n                        ]), transformer_block_klass(\n                            dim=current_dim,\n                            depth=layer_attn_depth,\n                            ff_mult=ff_mult,\n                            context_dim=cond_dim,\n                            **attn_kwargs), post_downsample\n                ]))\n\n        # middle layers\n\n        mid_dim = dims[-1]\n\n        self.mid_block1 = ResnetBlock(\n            mid_dim,\n            mid_dim,\n            cond_dim=cond_dim,\n            time_cond_dim=time_cond_dim,\n            groups=resnet_groups[-1],\n            use_recompute=use_recompute)\n        self.mid_attn = TransformerBlock(\n            mid_dim, depth=layer_mid_attns_depth,\n            **attn_kwargs) if attend_at_middle else None\n        self.mid_block2 = ResnetBlock(\n            mid_dim,\n            mid_dim,\n            cond_dim=cond_dim,\n            time_cond_dim=time_cond_dim,\n            groups=resnet_groups[-1],\n       
     use_recompute=use_recompute)\n\n        # upsample klass\n\n        upsample_klass = Upsample if not pixel_shuffle_upsample else PixelShuffleUpsample\n\n        # upsampling layers\n\n        upsample_fmap_dims = []\n\n        for ind, (\n            (dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn,\n                layer_attn_depth, layer_cross_attn, layer_use_linear_attn,\n                layer_use_linear_cross_attn\n        ) in enumerate(zip(reversed(in_out), *reversed_layer_params)):\n            is_last = ind == (len(in_out) - 1)\n\n            layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None\n\n            if layer_attn:\n                transformer_block_klass = TransformerBlock\n            elif layer_use_linear_attn:\n                transformer_block_klass = LinearAttentionTransformerBlock\n            else:\n                transformer_block_klass = Identity\n\n            skip_connect_dim = skip_connect_dims.pop()\n\n            upsample_fmap_dims.append(dim_out)\n\n            self.ups.append(\n                nn.LayerList([\n                    resnet_klass(\n                        dim_out + skip_connect_dim,\n                        dim_out,\n                        cond_dim=layer_cond_dim,\n                        linear_attn=layer_use_linear_cross_attn,\n                        time_cond_dim=time_cond_dim,\n                        groups=groups,\n                        use_recompute=use_recompute), nn.LayerList([\n                            ResnetBlock(\n                                dim_out + skip_connect_dim,\n                                dim_out,\n                                time_cond_dim=time_cond_dim,\n                                groups=groups,\n                                use_gca=use_global_context_attn,\n                                use_recompute=use_recompute)\n                            for _ in range(layer_num_resnet_blocks)\n                        ]), 
transformer_block_klass(\n                            dim=dim_out,\n                            depth=layer_attn_depth,\n                            ff_mult=ff_mult,\n                            context_dim=cond_dim,\n                            **attn_kwargs), upsample_klass(dim_out, dim_in)\n                    if not is_last or memory_efficient else Identity()\n                ]))\n\n        # whether to combine feature maps from all upsample blocks before final resnet block out\n\n        self.upsample_combiner = UpsampleCombiner(\n            dim=dim,\n            enabled=combine_upsample_fmaps,\n            dim_ins=upsample_fmap_dims,\n            dim_outs=dim)\n\n        # whether to do a final residual from initial conv to the final resnet block out\n\n        self.init_conv_to_final_conv_residual = init_conv_to_final_conv_residual\n        final_conv_dim = self.upsample_combiner.dim_out + (\n            dim if init_conv_to_final_conv_residual else 0)\n\n        # final optional resnet block and convolution out\n\n        self.final_res_block = ResnetBlock(\n            final_conv_dim,\n            dim,\n            time_cond_dim=time_cond_dim,\n            groups=resnet_groups[0],\n            use_gca=True,\n            use_recompute=use_recompute) if final_resnet_block else None\n\n        final_conv_dim_in = dim if final_resnet_block else final_conv_dim\n        final_conv_dim_in += (channels if lowres_cond else 0)\n\n        self.final_conv = nn.Conv2D(\n            final_conv_dim_in,\n            self.channels_out,\n            final_conv_kernel_size,\n            padding=final_conv_kernel_size // 2)\n\n        zero_init_(self.final_conv)\n\n    # if the current settings for the unet are not correct\n    # for cascading DDPM, then reinit the unet with the right settings\n    def cast_model_parameters(self, *, text_embed_dim, channels, channels_out,\n                              cond_on_text):\n        if channels == self.channels and \\\n            
cond_on_text == self.cond_on_text and \\\n            text_embed_dim == self._locals['text_embed_dim'] and \\\n            channels_out == self.channels_out:\n            return self\n\n        updated_kwargs = dict(\n            text_embed_dim=text_embed_dim,\n            channels=channels,\n            channels_out=channels_out,\n            cond_on_text=cond_on_text)\n\n        return self.__class__(**{ ** self._locals, ** updated_kwargs})\n\n    # methods for returning the full unet config as well as its parameter state\n\n    def to_config_and_state_dict(self):\n        return self._locals, self.state_dict()\n\n    # class method for rehydrating the unet from its config and state dict\n\n    @classmethod\n    def from_config_and_state_dict(klass, config, state_dict):\n        unet = klass(**config)\n        unet.load_state_dict(state_dict)\n        return unet\n\n    # methods for persisting unet to disk\n\n    def persist_to_file(self, path):\n        path = Path(path)\n        path.parents[0].mkdir(exist_ok=True, parents=True)\n\n        config, state_dict = self.to_config_and_state_dict()\n        pkg = dict(config=config, state_dict=state_dict)\n        paddle.save(pkg, str(path))\n\n    # class method for rehydrating the unet from file saved with `persist_to_file`\n\n    @classmethod\n    def hydrate_from_file(klass, path):\n        path = Path(path)\n        assert path.exists()\n        pkg = paddle.load(str(path))\n\n        assert 'config' in pkg and 'state_dict' in pkg\n        config, state_dict = pkg['config'], pkg['state_dict']\n\n        return Unet.from_config_and_state_dict(config, state_dict)\n\n    # forward with classifier free guidance\n\n    def forward_with_cond_scale(self, *args, cond_scale=1., **kwargs):\n        #print(\"forward_with_cond_scale.args[1]: \", args[1])\n        logits = self.forward(*args, **kwargs)\n\n        if cond_scale == 1:\n            return logits\n\n        null_logits = self.forward(*args, cond_drop_prob=1., 
**kwargs)\n        return null_logits + (logits - null_logits) * cond_scale\n\n    def forward(self,\n                x,\n                time,\n                *,\n                lowres_cond_img=None,\n                lowres_noise_times=None,\n                text_embeds=None,\n                text_mask=None,\n                self_cond=None,\n                cond_images=None,\n                cond_drop_prob=0.,\n                use_recompute=False):\n        batch_size = x.shape[0]\n\n        # condition on self\n\n        if self.self_cond:\n            self_cond = default(self_cond, lambda: paddle.zeros_like(x))\n            x = paddle.concat((x, self_cond), axis=1)\n\n        # add low resolution conditioning, if present\n\n        assert not (self.lowres_cond and not exists(lowres_cond_img)\n                    ), 'low resolution conditioning image must be present'\n        assert not (self.lowres_cond and not exists(lowres_noise_times)\n                    ), 'low resolution conditioning noise time must be present'\n\n        if exists(lowres_cond_img):\n            x = paddle.concat((x, lowres_cond_img), axis=1)\n\n        # condition on input image\n\n        assert not (\n            self.has_cond_image ^ exists(cond_images)\n        ), 'you either requested to condition on an image on the unet, but the conditioning image is not supplied, or vice versa'\n\n        if exists(cond_images):\n            assert cond_images.shape[\n                1] == self.cond_images_channels, 'the number of channels on the conditioning image you are passing in does not match what you specified on initialiation of the unet'\n            cond_images = resize_image_to(cond_images, x.shape[-1])\n            x = paddle.concat((cond_images, x), axis=1)\n\n        # initial convolution\n\n        x = self.init_conv(x)\n\n        # init conv residual\n\n        if self.init_conv_to_final_conv_residual:\n            init_conv_residual = x.clone()\n\n        # time conditioning\n\n  
      time_hiddens = self.to_time_hiddens(time)\n\n        # derive time tokens\n\n        time_tokens = self.to_time_tokens(time_hiddens)\n        t = self.to_time_cond(time_hiddens)\n        if use_recompute:\n            t.stop_gradient = True\n        # add lowres time conditioning to time hiddens\n        # and add lowres time tokens along sequence dimension for attention\n\n        if self.lowres_cond:\n            lowres_time_hiddens = self.to_lowres_time_hiddens(\n                lowres_noise_times)\n            lowres_time_tokens = self.to_lowres_time_tokens(\n                lowres_time_hiddens)\n            lowres_t = self.to_lowres_time_cond(lowres_time_hiddens)\n\n            t = t + lowres_t\n            time_tokens = paddle.concat(\n                (time_tokens, lowres_time_tokens), axis=-2)\n\n        # text conditioning\n\n        text_tokens = None\n\n        if exists(text_embeds) and self.cond_on_text:\n\n            # conditional dropout\n\n            text_keep_mask = prob_mask_like((batch_size, ), 1 - cond_drop_prob)\n\n            text_keep_mask_embed = text_keep_mask[:, None, None]\n            text_keep_mask_hidden = text_keep_mask[:, None]\n\n            # calculate text embeds\n\n            text_tokens = self.text_to_cond(text_embeds)\n\n            text_tokens = text_tokens[:, :self.max_text_len]\n\n            if exists(text_mask):\n                text_mask = text_mask[:, :self.max_text_len]\n\n            text_tokens_len = text_tokens.shape[1]\n            remainder = self.max_text_len - text_tokens_len\n\n            if remainder > 0:\n                text_tokens = F.pad(text_tokens, (0, remainder),\n                                    data_format='NLC')\n\n            if exists(text_mask):\n                text_mask = text_mask[:, :, None].cast('float32')\n                if remainder > 0:\n                    text_mask = F.pad(text_mask, (0, remainder),\n                                      data_format='NLC')\n\n                
text_keep_mask_embed = text_mask.cast(\n                    bool) & text_keep_mask_embed\n\n            null_text_embed = self.null_text_embed.cast(text_tokens.dtype)\n\n            text_tokens = paddle.where(text_keep_mask_embed, text_tokens,\n                                       null_text_embed)\n\n            if exists(self.attn_pool):\n                text_tokens = self.attn_pool(text_tokens)\n\n            # extra non-attention conditioning by projecting and then summing text embeddings to time\n            # termed as text hiddens\n\n            mean_pooled_text_tokens = text_tokens.mean(axis=-2)\n\n            text_hiddens = self.to_text_non_attn_cond(mean_pooled_text_tokens)\n\n            null_text_hidden = self.null_text_hidden.cast(t.dtype)\n\n            text_hiddens = paddle.where(text_keep_mask_hidden, text_hiddens,\n                                        null_text_hidden)\n\n            t = t + text_hiddens\n\n        # main conditioning tokens (c)\n\n        c = time_tokens if not exists(text_tokens) else paddle.concat(\n            (time_tokens, text_tokens), axis=-2)\n\n        # normalize conditioning tokens\n\n        c = self.norm_cond(c)\n        if use_recompute:\n            c.stop_gradient = True\n\n        # initial resnet block (for memory efficient unet)\n\n        if exists(self.init_resnet_block):\n            x = self.init_resnet_block(x, t)\n\n        hiddens = []\n\n        for pre_downsample, init_block, resnet_blocks, attn_block, post_downsample in self.downs:\n            if exists(pre_downsample):\n                x = pre_downsample(x)\n\n            x = init_block(x, t, c)\n\n            for resnet_block in resnet_blocks:\n                x = resnet_block(x, t)\n                hiddens.append(x)\n\n            x = attn_block(x, c)\n            hiddens.append(x)\n\n            if exists(post_downsample):\n                x = post_downsample(x)\n\n        x = self.mid_block1(x, t, c)\n\n        if exists(self.mid_attn):\n      
      x = self.mid_attn(x)\n\n        x = self.mid_block2(x, t, c)\n\n        add_skip_connection = lambda x: paddle.concat((x, hiddens.pop() * self.skip_connect_scale), axis=1)\n\n        up_hiddens = []\n\n        for init_block, resnet_blocks, attn_block, upsample in self.ups:\n            x = add_skip_connection(x)\n            x = init_block(x, t, c)\n\n            for resnet_block in resnet_blocks:\n                x = add_skip_connection(x)\n                x = resnet_block(x, t)\n\n            x = attn_block(x, c)\n            up_hiddens.append(x)\n            x = upsample(x)\n\n        x = self.upsample_combiner(x, up_hiddens)\n\n        if self.init_conv_to_final_conv_residual:\n            x = paddle.concat((x, init_conv_residual), axis=1)\n\n        if exists(self.final_res_block):\n            x = self.final_res_block(x, t)\n\n        if exists(lowres_cond_img):\n            x = paddle.concat((x, lowres_cond_img), axis=1)\n\n        return self.final_conv(x)\n"
  },
  {
    "path": "ppfleetx/models/multimodal_model/imagen/utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nfrom functools import partial, wraps\n\nimport paddle\nfrom paddle import nn\nimport paddle.nn.functional as F\nfrom paddle import expm1\n\n# helper functions\n\n\ndef exists(val):\n    return val is not None\n\n\ndef identity(t, *args, **kwargs):\n    return t\n\n\ndef first(arr, d=None):\n    if len(arr) == 0:\n        return d\n    return arr[0]\n\n\ndef maybe(fn):\n    @wraps(fn)\n    def inner(x):\n        if not exists(x):\n            return x\n        return fn(x)\n\n    return inner\n\n\ndef once(fn):\n    called = False\n\n    @wraps(fn)\n    def inner(x):\n        nonlocal called\n        if called:\n            return\n        called = True\n        return fn(x)\n\n    return inner\n\n\nprint_once = once(print)\n\n\ndef default(val, d):\n    if exists(val):\n        return val\n    return d() if callable(d) else d\n\n\ndef cast_tuple(val, length=None):\n    if isinstance(val, list):\n        val = tuple(val)\n\n    output = val if isinstance(val, tuple) else ((val, ) * default(length, 1))\n\n    if exists(length):\n        assert len(output) == length\n\n    return output\n\n\ndef is_float_dtype(dtype):\n    return any([\n        dtype == float_dtype\n        for float_dtype in (paddle.float64, paddle.float32, paddle.float16,\n                            paddle.bfloat16)\n    ])\n\n\ndef 
cast_uint8_images_to_float(images):\n    if not images.dtype == paddle.uint8:\n        return images\n    return images / 255\n\n\nzeros_ = nn.initializer.Constant(value=0.)\n\n\ndef zero_init_(m):\n    zeros_(m.weight)\n    if exists(m.bias):\n        zeros_(m.bias)\n\n\ndef eval_decorator(fn):\n    def inner(model, *args, **kwargs):\n        was_training = model.training\n        model.eval()\n        out = fn(model, *args, **kwargs)\n        if was_training:\n            model.train(was_training)\n        return out\n\n    return inner\n\n\ndef pad_tuple_to_length(t, length, fillvalue=None):\n    remain_length = length - len(t)\n    if remain_length <= 0:\n        return t\n    return (*t, *((fillvalue, ) * remain_length))\n\n\n# helper classes\n\n\nclass Identity(nn.Layer):\n    def __init__(self, *args, **kwargs):\n        super().__init__()\n\n    def forward(self, x, *args, **kwargs):\n        return x\n\n\n# tensor helpers\n\n\ndef log(t, eps: float=1e-12):\n    return paddle.log(t.clip(min=eps))\n\n\nclass Parallel(nn.Layer):\n    def __init__(self, *fns):\n        super().__init__()\n        self.fns = nn.LayerList(fns)\n\n    def forward(self, x):\n        outputs = [fn(x) for fn in self.fns]\n        return sum(outputs)\n\n\ndef l2norm(t):\n    return F.normalize(t, axis=-1)\n\n\ndef right_pad_dims_to(x, t):\n    padding_dims = x.ndim - t.ndim\n    if padding_dims <= 0:\n        return t\n    return t.reshape([*t.shape, *((1, ) * padding_dims)])\n\n\ndef masked_mean(t, *, axis, mask=None):\n    if not exists(mask):\n        return t.mean(axis=axis)\n\n    denom = mask.sum(axis=axis, keepdim=True)\n    mask = mask[:, :, None]\n    masked_t = paddle.where(mask == 0, paddle.to_tensor(0.), t)\n\n    return masked_t.sum(axis=axis) / denom.clip(min=1e-5)\n\n\ndef resize_image_to(image, target_image_size, clamp_range=None):\n    orig_image_size = image.shape[-1]\n\n    if orig_image_size == target_image_size:\n        return image\n\n    out = F.interpolate(\n 
       image, (target_image_size, target_image_size), mode='nearest')\n\n    if exists(clamp_range):\n        out = out.clip(*clamp_range)\n\n    return out\n\n\n# image normalization functions\n# ddpms expect images to be in the range of -1 to 1\n\n\ndef normalize_neg_one_to_one(img):\n    return img * 2 - 1\n\n\ndef unnormalize_zero_to_one(normed_img):\n    return (normed_img + 1) * 0.5\n\n\n# classifier free guidance functions\n\n\ndef prob_mask_like(shape, prob):\n    if prob == 1:\n        return paddle.ones(shape, dtype=paddle.bool)\n    elif prob == 0:\n        return paddle.zeros(shape, dtype=paddle.bool)\n    else:\n        return paddle.zeros(shape).cast('float32').uniform_(0, 1) < prob\n\n\ndef rearrange(tensor,\n              pattern: str,\n              b: int=-1,\n              h: int=-1,\n              w: int=-1,\n              c: int=-1,\n              x: int=-1,\n              y: int=-1,\n              n: int=-1,\n              s1: int=-1,\n              s2: int=-1):\n    if pattern == 'b n (h d) -> b h n d':\n        B, N, _ = tensor.shape\n        return tensor.reshape([B, N, h, -1]).transpose([0, 2, 1, 3])\n    elif pattern == 'b n (h d) -> (b h) n d':\n        B, N, _ = tensor.shape\n        return tensor.reshape([B, N, h, -1]).transpose([0, 2, 1, 3]).reshape(\n            [B * h, N, -1])\n    elif pattern == 'b (h c) x y -> (b h) (x y) c':\n        B, _, _, _ = tensor.shape\n        return tensor.reshape([B, h, -1, x, y]).transpose(\n            [0, 1, 3, 4, 2]).reshape([B * h, x * y, -1])\n    elif pattern == 'b n ... -> b n (...)':\n        B, N = tensor.shape[:2]\n        return tensor.reshape([B, N, -1])\n    elif pattern == 'b ... 
-> b (...)':\n        B = tensor.shape[0]\n        return tensor.reshape([B, -1])\n    elif pattern == 'b j -> b 1 1 j':\n        return tensor[:, None, None, :]\n    elif pattern == 'b h n d -> b n (h d)':\n        B, H, N, D = tensor.shape\n        return tensor.transpose([0, 2, 1, 3]).reshape([B, N, -1])\n    elif pattern == '(b h) (x y) d -> b (h d) x y':\n        _, _, D = tensor.shape\n        return tensor.reshape([-1, h, x, y, D]).transpose(\n            [0, 1, 4, 2, 3]).reshape([-1, h * D, x, y])\n    elif pattern == '(b h) n d -> b n (h d)':\n        _, N, D = tensor.shape\n        return tensor.reshape([-1, h, N, D]).transpose([0, 2, 1, 3]).reshape(\n            [-1, N, h * D])\n    elif pattern == 'b n -> b n 1':\n        return tensor[:, :, None]\n    elif pattern == 'b c h w -> b (h w) c':\n        B, C, H, W = tensor.shape\n        return tensor.transpose([0, 2, 3, 1]).reshape([B, -1, C])\n    elif pattern == 'b (h w) c -> b c h w':\n        B, _, C = tensor.shape\n        return tensor.reshape([B, h, w, C]).transpose([0, 3, 1, 2])\n    elif pattern == 'b (n d) -> b n d':\n        B, _ = tensor.shape\n        return tensor.reshape([B, n, -1])\n    elif pattern == 'b ... 
-> b 1 ...':\n        return tensor[:, None]\n    elif pattern == 'b -> b 1 1 1':\n        return tensor[:, None, None, None]\n    elif pattern == 'b c (h s1) (w s2) -> b (c s1 s2) h w':\n        assert s1 is not None\n        assert s2 is not None\n        B, C, H, W = tensor.shape\n        tensor = tensor.reshape([B, C, H // s1, s1, W // s2, s2])\n        tensor = tensor.transpose([0, 1, 3, 5, 2, 4])\n        return tensor.reshape([B, C * s1 * s2, H // s1, W // s2])\n\n\ndef rearrange_many(tensors, pattern: str, h: int=-1, x: int=-1, y: int=-1):\n    assert isinstance(tensors, (\n        list, tuple)), \"rearrange_many type must be list or tuple\"\n    if isinstance(tensors, tuple):\n        tensors = list(tensors)\n    if len(tensors) == 0:\n        raise TypeError(\"Rearrange can't be applied to an empty list\")\n    for i, tensor in enumerate(tensors):\n        tensors[i] = rearrange(tensor, pattern, h=h, x=x, y=y)\n    return tensors\n\n\ndef repeat(tensor, pattern: str, h: int=-1, b: int=-1):\n    if pattern == '1 -> b':\n        if b > 1:\n            b = paddle.to_tensor([b])\n            return paddle.tile(tensor, repeat_times=b)\n        else:\n            return tensor\n    elif pattern == 't -> b t':\n        tensor = tensor[None, :]\n        return paddle.tile(tensor, repeat_times=(b, 1))\n    elif pattern == 'n d -> b n d':\n        tensor = tensor[None, :]\n        return paddle.tile(tensor, repeat_times=(b, 1, 1))\n    elif pattern == 'o ... 
-> (o 4) ...':\n        return paddle.tile(tensor, repeat_times=(4, 1, 1, 1))\n    elif pattern == 'd -> b h 1 d':\n        tensor = tensor[None, None, None, :]\n        return paddle.tile(tensor, repeat_times=(b, h, 1, 1))\n    elif pattern == 'd -> b 1 d':\n        tensor = tensor[None, None, :]\n        return paddle.tile(tensor, repeat_times=(b, 1, 1))\n\n\ndef repeat_many(tensors, pattern: str, h: int=-1, b: int=-1):\n    assert isinstance(tensors, (list, tuple))\n    if isinstance(tensors, tuple):\n        tensors = list(tensors)\n    if len(tensors) == 0:\n        raise TypeError(\"Rearrange can't be applied to an empty list\")\n    for i, tensor in enumerate(tensors):\n        tensors[i] = repeat(tensor, pattern, h=h, b=b)\n    return tensors\n\n\ndef reduce(losses, pattern: str, reduction: str='mean'):\n    if pattern == 'b ... -> b':\n        axes = list(range(1, len(losses.shape)))\n        return losses.mean(axes)\n\n\nclass EinopsToAndFrom(nn.Layer):\n    def __init__(self, from_einops, to_einops, fn):\n        super().__init__()\n        self.from_einops = from_einops\n        self.to_einops = to_einops\n        self.fn = fn\n\n    def forward(self, x, **kwargs):\n        shape = x.shape\n        reconstitute_kwargs = dict(\n            tuple(zip(self.from_einops.split(' '), shape)))\n        x = rearrange(x, f'{self.from_einops} -> {self.to_einops}')\n        x = self.fn(x, **kwargs)\n        x = rearrange(x, f'{self.to_einops} -> {self.from_einops}',\n                      **reconstitute_kwargs)\n        return x\n\n\nclass Rearrange(nn.Layer):\n    def __init__(self, pattern, n=None, s1=None, s2=None):\n        super().__init__()\n        self.pattern = pattern\n        self.n = n\n        self.s1 = s1\n        self.s2 = s2\n\n    def forward(self, x, **kwargs):\n        x = rearrange(x, f'{self.pattern}', n=self.n, s1=self.s1, s2=self.s2)\n        return x\n\n\n# classifier free guidance functions\n\n# gaussian diffusion with continuous time 
helper functions and classes\n# large part of this was thanks to @crowsonkb at https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/utils.py\n\n\ndef beta_linear_log_snr(t):\n    return -paddle.log(expm1(1e-4 + 10 * (t**2)))\n\n\ndef alpha_cosine_log_snr(t, s: float=0.008):\n    return -log(\n        (paddle.cos((t + s) / (1 + s) * math.pi * 0.5)**-2) - 1, eps=1e-5\n    )  # not sure if this accounts for beta being clipped to 0.999 in discrete version\n\n\ndef log_snr_to_alpha_sigma(log_snr):\n    return paddle.sqrt(F.sigmoid(log_snr)), paddle.sqrt(F.sigmoid(-log_snr))\n\n\nclass GaussianDiffusionContinuousTimes(nn.Layer):\n    def __init__(self, *, noise_schedule, timesteps=1000):\n        super().__init__()\n\n        if noise_schedule == 'linear':\n            self.log_snr = beta_linear_log_snr\n        elif noise_schedule == \"cosine\":\n            self.log_snr = alpha_cosine_log_snr\n        else:\n            raise ValueError(f'invalid noise schedule {noise_schedule}')\n\n        self.num_timesteps = timesteps\n\n    def get_times(self, batch_size, noise_level):\n        return paddle.full((batch_size, ), noise_level, dtype=paddle.float32)\n\n    def sample_random_times(self, batch_size):\n        return paddle.zeros((batch_size, )).cast('float32').uniform_(0, 1)\n\n    def get_condition(self, times):\n        return maybe(self.log_snr)(times)\n\n    def get_sampling_timesteps(self, batch):\n        times = paddle.linspace(1., 0., self.num_timesteps + 1)\n        times = repeat(times, 't -> b t', b=batch)\n        times = paddle.stack((times[:, :-1], times[:, 1:]), axis=0)\n        times = times.unbind(axis=-1)\n        return times\n\n    def q_posterior(self, x_start, x_t, t, *, t_next=None):\n        t_next = default(\n            t_next, lambda: (t - 1. 
/ self.num_timesteps).clip(min=0.))\n        \"\"\" https://openreview.net/attachment?id=2LdBqxc1Yv&name=supplementary_material \"\"\"\n        log_snr = self.log_snr(t)\n        log_snr_next = self.log_snr(t_next)\n        log_snr, log_snr_next = map(\n            partial(right_pad_dims_to, x_t), (log_snr, log_snr_next))\n\n        alpha, sigma = log_snr_to_alpha_sigma(log_snr)\n        alpha_next, sigma_next = log_snr_to_alpha_sigma(log_snr_next)\n\n        # c - as defined near eq 33\n        c = -expm1(log_snr - log_snr_next)\n        posterior_mean = alpha_next * (x_t * (1 - c) / alpha + c * x_start)\n\n        # following (eq. 33)\n        posterior_variance = (sigma_next**2) * c\n        posterior_log_variance_clipped = log(posterior_variance, eps=1e-20)\n        return posterior_mean, posterior_variance, posterior_log_variance_clipped\n\n    def q_sample(self, x_start, t, noise=None):\n        dtype = x_start.dtype\n\n        if isinstance(t, float):\n            batch = x_start.shape[0]\n            t = paddle.full((batch, ), t, dtype=dtype)\n\n        noise = default(noise, lambda: paddle.randn(shape=x_start.shape, dtype=dtype))\n        log_snr = self.log_snr(t).cast(dtype)\n        log_snr_padded_dim = right_pad_dims_to(x_start, log_snr)\n        alpha, sigma = log_snr_to_alpha_sigma(log_snr_padded_dim)\n\n        return alpha * x_start + sigma * noise, log_snr, alpha, sigma\n\n    def q_sample_from_to(self, x_from, from_t, to_t, noise=None):\n        shape, dtype = x_from.shape, x_from.dtype\n        batch = shape[0]\n\n        if isinstance(from_t, float):\n            from_t = paddle.full((batch, ), from_t, dtype=dtype)\n\n        if isinstance(to_t, float):\n            to_t = paddle.full((batch, ), to_t, dtype=dtype)\n\n        noise = default(noise, lambda: paddle.randn(shape=x_from.shape, dtype=x_from.dtype))\n\n        log_snr = self.log_snr(from_t)\n        log_snr_padded_dim = right_pad_dims_to(x_from, log_snr)\n        alpha, sigma = 
log_snr_to_alpha_sigma(log_snr_padded_dim)\n\n        log_snr_to = self.log_snr(to_t)\n        log_snr_padded_dim_to = right_pad_dims_to(x_from, log_snr_to)\n        alpha_to, sigma_to = log_snr_to_alpha_sigma(log_snr_padded_dim_to)\n\n        return x_from * (alpha_to / alpha) + noise * (sigma_to * alpha - sigma\n                                                      * alpha_to) / alpha\n\n    def predict_start_from_v(self, x_t, t, v):\n        log_snr = self.log_snr(t)\n        log_snr = right_pad_dims_to(x_t, log_snr)\n        alpha, sigma = log_snr_to_alpha_sigma(log_snr)\n        return alpha * x_t - sigma * v\n\n    def predict_start_from_noise(self, x_t, t, noise):\n        log_snr = self.log_snr(t)\n        log_snr = right_pad_dims_to(x_t, log_snr)\n        alpha, sigma = log_snr_to_alpha_sigma(log_snr)\n        return (x_t - sigma * noise) / alpha.clip(min=1e-8)\n\n\nclass Always():\n    def __init__(self, val):\n        self.val = val\n\n    def __call__(self, *args, **kwargs):\n        return self.val\n"
  },
  {
    "path": "ppfleetx/models/multimodal_model/multimodal_module.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport copy\n\nimport paddle\n\nfrom ppfleetx.core.module.basic_module import BasicModule\nimport ppfleetx.models.multimodal_model.imagen as imagen\nfrom ppfleetx.utils.log import logger\n\nfrom .utils import process_configs\n\n\nclass MultiModalModule(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        super(MultiModalModule, self).__init__(configs)\n\n        self.loss_fn = self.get_loss_fn()\n\n    def process_configs(self, configs):\n        configs = process_configs(configs)\n        return configs\n\n    def forward(self, batch):\n        return self.model(**batch)\n\n    def training_step(self, batch):\n        preds, targets, log_snr, p2_loss_weight_gamma = self(batch)\n        loss = self.loss_fn(preds, targets, log_snr, p2_loss_weight_gamma)\n        return loss\n\n    def training_step_end(self, log_dict):\n        speed = self.configs.Engine.logging_freq / log_dict['train_cost']\n\n        logger.info(\n            \"[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, learning rate: %.5e\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'],\n               1. 
/ speed, speed, log_dict['lr']))\n\n    def validation_step(self, batch):\n        tokens, position_ids, labels, loss_mask = batch\n        preds = self(tokens, position_ids)\n        preds = paddle.cast(preds, dtype=\"float32\")\n        loss = self.loss_fn(preds, labels, loss_mask)\n        return loss\n\n    def validation_step_end(self, log_dict):\n        speed = self.configs.Engine.logging_freq / log_dict['eval_cost']\n        logger.info(\n            \"[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'],\n               1. / speed, speed))\n\n    def test_step(self, batch):\n        tokens, position_ids, labels, loss_mask = batch\n        preds = self(tokens, position_ids)\n        preds = paddle.cast(preds, dtype=\"float32\")\n        loss = self.loss_fn(preds, labels, loss_mask)\n        return loss\n\n    def test_step_end(self, log_dict):\n        speed = self.configs.Engine.logging_freq / log_dict['test_cost']\n        logger.info(\n            \"[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['loss'],\n               1. 
/ speed, speed))\n\n    def input_spec(self):\n        return [\n            InputSpec(\n                shape=[None, None], name=\"tokens\", dtype='int64'), InputSpec(\n                    shape=[None, None], name=\"ids\", dtype='int64')\n        ]\n\n    def training_epoch_end(self, log_dict):\n        logger.info(\"[Training] epoch: %d, total time: %.5f sec\" %\n                    (log_dict['epoch'], log_dict['train_cost']))\n\n\nclass ImagenModule(MultiModalModule):\n    def __init__(self, configs):\n        super(ImagenModule, self).__init__(configs)\n\n    def get_model(self):\n        model_setting = copy.deepcopy(self.configs.Model)\n        model_setting.pop(\"module\")\n        imagen_model = model_setting.pop(\"name\")\n        model = getattr(imagen, imagen_model)(**model_setting)\n        return model\n\n    def get_loss_fn(self):\n        model_setting = copy.deepcopy(self.configs.Loss)\n        loss_fn = imagen.ImagenCriterion(**model_setting)\n        return loss_fn\n\n    def pretreating_batch(self, batch):\n        return batch\n"
  },
  {
    "path": "ppfleetx/models/multimodal_model/utils.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport logging\nimport os\nimport sys\nimport copy\n\nimport yaml\nimport numpy as np\nimport paddle\nimport paddle.distributed as dist\nfrom paddle.fluid import core\nimport argparse\nfrom functools import reduce\n\nfrom ppfleetx.distributed.apis import env\n\n\ndef process_global_configs(config):\n    \"\"\"\n    process global configs for hybrid parallel\n    \"\"\"\n    dp_degree = config['Distributed']['dp_degree']\n    sharding_degree = config['Distributed']['sharding']['sharding_degree']\n\n    configs = config['Global']\n    if configs['global_batch_size'] is None and configs[\n            'local_batch_size'] is None:\n        raise ValueError(\n            \"global_batch_size or local_batch_size should be set.\")\n    elif configs['global_batch_size'] is not None and configs[\n            'local_batch_size'] is not None:\n        assert configs['global_batch_size'] // configs['local_batch_size'] == (dp_degree * sharding_degree), \"global_batch_size[{}] should be divided by local_batch_size[{}] \"\\\n            \"when dp_degree is [{}] and sharding_degree is [{}]\".format(configs['global_batch_size'],\n            configs['local_batch_size'], dp_degree, sharding_degree)\n    elif configs['global_batch_size'] is not None and configs[\n            'local_batch_size'] is None:\n        assert configs['global_batch_size'] % 
(dp_degree * sharding_degree) == 0, \\\n            \"global_batch_size[{}] should be divided by dp_degree[{}] times sharding_degree[{}]\"\\\n            .format(configs['global_batch_size'], dp_degree, sharding_degree)\n        configs['local_batch_size'] = configs['global_batch_size'] // (\n            dp_degree * sharding_degree)\n    else:\n        configs['global_batch_size'] = configs[\n            'local_batch_size'] * dp_degree * sharding_degree\n    assert configs['local_batch_size'] % configs['micro_batch_size'] == 0\n\n\ndef is_fused_matmul_bias_supported():\n    if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():\n        return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue')\n    else:\n        return False\n\n\ndef process_fused_configs(config):\n    \"\"\"\n    process fused configs for hybrid parallel\n    \"\"\"\n\n    nranks = dist.get_world_size()\n    dp_degree = config['Distributed']['dp_degree']\n    configs = config['Fused']\n    if configs['tensor_fusion']:\n        assert nranks == dp_degree, \"tensor_fusion only support single card train or data parallel train\"\n\n\ndef process_inference_configs(config):\n    \"\"\"\n    process fused configs for hybrid parallel\n    \"\"\"\n    configs = config['Inference']\n\n    if configs['model_dir'] is None:\n        configs['model_dir'] = config['Engine']['save_load']['output_dir']\n\n    if configs['mp_degree'] is None:\n        configs['mp_degree'] = config['Distributed']['mp_degree']\n\n\ndef process_model_configs(config):\n    \"\"\"\n    process model configs for hybrid parallel\n    \"\"\"\n    configs = config['Model']\n\n    if configs['use_recompute']:\n        if not configs['recompute_granularity']:\n            configs['recompute_granularity'] = 'full'\n\n    if configs['fused_linear'] and not is_fused_matmul_bias_supported():\n        configs['fused_linear'] = False\n        logging.warning(\n            \"The flag fused_linear only valid for cuda version 
higher than 11.6, \"\n            \"but the paddle is compiled with cuda \" + paddle.version.cuda())\n\n\ndef process_optim_configs(config):\n    \"\"\"\n    process optim configs for hybrid parallel\n    \"\"\"\n    config['Optimizer']['multi_precision'] = config['Engine']['mix_precision'][\n        'enable']\n\n\ndef process_engine_configs(config):\n    \"\"\"\n    process engine configs for hybrid parallel\n    \"\"\"\n    configs = config['Engine']\n    configs['test_iters'] = configs['eval_iters'] * 10 \\\n        if configs.get('test_iters', None) is None \\\n        else configs['test_iters']\n    configs['accumulate_steps'] = config['Global']['local_batch_size'] \\\n        // config['Global']['micro_batch_size']\n\n\ndef process_configs(config):\n\n    process_fused_configs(config)\n    process_model_configs(config)\n    process_optim_configs(config)\n    process_inference_configs(config)\n\n    return config\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/all_atom.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Dict, Optional\nimport paddle\n\nfrom .common import (batched_gather, )\n\nfrom . import (\n    residue_constants,\n    r3, )\n\n\ndef get_chi_atom_indices():\n    \"\"\"Returns atom indices needed to compute chi angles for all residue types.\n\n    Returns:\n        A tensor of shape [residue_types=21, chis=4, atoms=4]. The residue types are\n        in the order specified in residue_constants.restypes + unknown residue type\n        at the end. 
For chi angles which are not defined on the residue, the\n        positions indices are by default set to 0.\n    \"\"\"\n    chi_atom_indices = []\n    for residue_name in residue_constants.restypes:\n        residue_name = residue_constants.restype_1to3[residue_name]\n        residue_chi_angles = residue_constants.chi_angles_atoms[residue_name]\n        atom_indices = []\n        for chi_angle in residue_chi_angles:\n            atom_indices.append(\n                [residue_constants.atom_order[atom] for atom in chi_angle])\n        for _ in range(4 - len(atom_indices)):\n            atom_indices.append(\n                [0, 0, 0, 0])  # For chi angles not defined on the AA.\n        chi_atom_indices.append(atom_indices)\n\n    chi_atom_indices.append([[0, 0, 0, 0]] * 4)  # For UNKNOWN residue.\n\n    return paddle.to_tensor(chi_atom_indices)\n\n\ndef atom37_to_torsion_angles(\n        aatype: paddle.Tensor,  # (B, T, N)\n        all_atom_pos: paddle.Tensor,  # (B, T, N, 37, 3)\n        all_atom_mask: paddle.Tensor,  # (B, T, N, 37)\n        placeholder_for_undefined=False, ) -> Dict[str, paddle.Tensor]:\n    \"\"\"Computes the 7 torsion angles (in sin, cos encoding) for each residue.\n\n    The 7 torsion angles are in the order\n    '[pre_omega, phi, psi, chi_1, chi_2, chi_3, chi_4]',\n    here pre_omega denotes the omega torsion angle between the given amino acid\n    and the previous amino acid.\n\n    Args:\n        aatype: Amino acid type, given as array with integers.\n        all_atom_pos: atom37 representation of all atom coordinates.\n        all_atom_mask: atom37 representation of mask on all atom coordinates.\n        placeholder_for_undefined: flag denoting whether to set masked torsion\n        angles to zero.\n    Returns:\n        Dict containing:\n        * 'torsion_angles_sin_cos': Array with shape (B, N, 7, 2) where the final\n            2 dimensions denote sin and cos respectively\n        * 'alt_torsion_angles_sin_cos': same as 
'torsion_angles_sin_cos', but\n            with the angle shifted by pi for all chi angles affected by the naming\n            ambiguities.\n        * 'torsion_angles_mask': Mask for which chi angles are present.\n    \"\"\"\n\n    # Map aatype > 20 to 'Unknown' (20).\n    aatype = paddle.minimum(\n        aatype.astype('int'),\n        paddle.full(\n            shape=[1], fill_value=20, dtype='int'))\n\n    num_batch, num_temp, num_res = aatype.shape\n\n    # Compute the backbone angles.\n    pad = paddle.zeros([num_batch, num_temp, 1, 37, 3])\n    prev_all_atom_pos = paddle.concat(\n        [pad, all_atom_pos[..., :-1, :, :]], axis=-3)\n\n    pad = paddle.zeros([num_batch, num_temp, 1, 37])\n    prev_all_atom_mask = paddle.concat(\n        [pad, all_atom_mask[..., :-1, :]], axis=-2)\n\n    # For each torsion angle collect the 4 atom positions that define this angle.\n    # shape (B, T, N, atoms=4, xyz=3)\n    pre_omega_atom_pos = paddle.concat(\n        [\n            prev_all_atom_pos[..., 1:3, :],  # prev CA, C\n            all_atom_pos[..., 0:2, :]  # this N, CA\n        ],\n        axis=-2)\n\n    phi_atom_pos = paddle.concat(\n        [\n            prev_all_atom_pos[..., 2:3, :],  # prev C\n            all_atom_pos[..., 0:3, :]  # this N, CA, C\n        ],\n        axis=-2)\n\n    psi_atom_pos = paddle.concat(\n        [\n            all_atom_pos[..., 0:3, :],  # this N, CA, C\n            all_atom_pos[..., 4:5, :]  # this O\n        ],\n        axis=-2)\n\n    # Collect the masks from these atoms.\n    # Shape [batch, n_temp, num_res]\n    pre_omega_mask = (\n        paddle.prod(\n            prev_all_atom_mask[..., 1:3], axis=-1)  # prev CA, C\n        * paddle.prod(\n            all_atom_mask[..., 0:2], axis=-1))  # this N, CA\n    phi_mask = (\n        prev_all_atom_mask[..., 2]  # prev C\n        * paddle.prod(\n            all_atom_mask[..., 0:3], axis=-1))  # this N, CA, C\n    psi_mask = (\n        paddle.prod(\n            all_atom_mask[..., 0:3], 
axis=-1) *  # this N, CA, C\n        all_atom_mask[..., 4])  # this O\n\n    # Collect the atoms for the chi-angles.\n    # Compute the table of chi angle indices. Shape: [restypes, chis=4, atoms=4].\n    chi_atom_indices = get_chi_atom_indices()\n\n    # Select atoms to compute chis. Shape: [batch, num_temp, num_res, chis=4, atoms=4].\n    atom_indices = batched_gather(\n        params=chi_atom_indices, indices=aatype, axis=0, batch_dims=0)\n\n    # Gather atom positions. Shape: [batch, num_temp, num_res, chis=4, atoms=4, xyz=3].\n    chis_atom_pos = batched_gather(\n        params=all_atom_pos, indices=atom_indices, axis=0, batch_dims=3)\n\n    # Copy the chi angle mask, add the UNKNOWN residue. Shape: [restypes, 4].\n    chi_angles_mask = list(residue_constants.chi_angles_mask)\n    chi_angles_mask.append([0.0, 0.0, 0.0, 0.0])\n    chi_angles_mask = paddle.to_tensor(chi_angles_mask)\n\n    # Compute the chi angle mask. I.e. which chis angles exist according to the\n    # aatype. Shape [batch, num_temp, num_res, chis=4].\n    chis_mask = batched_gather(\n        params=chi_angles_mask, indices=aatype, axis=0, batch_dims=0)\n    # Constrain the chis_mask to those chis, where the ground truth coordinates of\n    # all defining four atoms are available.\n    # Gather the chi angle atoms mask. Shape: [batch, num_temp, num_res, chis=4, atoms=4].\n    chi_angle_atoms_mask = batched_gather(\n        params=all_atom_mask, indices=atom_indices, axis=0, batch_dims=3)\n    # Check if all 4 chi angle atoms were set. 
Shape: [batch, num_temp, num_res, chis=4].\n    chi_angle_atoms_mask = paddle.prod(chi_angle_atoms_mask, axis=[-1])\n    chis_mask = chis_mask * chi_angle_atoms_mask\n\n    # Stack all torsion angle atom positions.\n    # Shape (B, T, N, torsions=7, atoms=4, xyz=3)\n    torsions_atom_pos = paddle.concat(\n        [\n            pre_omega_atom_pos.unsqueeze(axis=-3),  # [:, :, :, None, :, :]\n            phi_atom_pos.unsqueeze(axis=-3),  # [:, :, :, None, :, :]\n            psi_atom_pos.unsqueeze(axis=-3),  # [:, :, :, None, :, :]\n            chis_atom_pos\n        ],\n        axis=3)\n\n    # Stack up masks for all torsion angles.\n    # shape (B, T, N, torsions=7)\n    torsion_angles_mask = paddle.concat(\n        [\n            pre_omega_mask.unsqueeze(axis=-1),  # [..., None]\n            phi_mask.unsqueeze(axis=-1),  # [..., None]\n            psi_mask.unsqueeze(axis=-1),  # [..., None]\n            chis_mask\n        ],\n        axis=-1)\n\n    # Create a frame from the first three atoms:\n    # First atom: point on x-y-plane\n    # Second atom: point on negative x-axis\n    # Third atom: origin\n    # r3.Rigids (B, T, N, torsions=7)\n    torsion_frames = r3.rigids_from_3_points_vecs(\n        point_on_neg_x_axis=r3.Vecs(torsions_atom_pos[..., 1, :]),\n        origin=r3.Vecs(torsions_atom_pos[..., 2, :]),\n        point_on_xy_plane=r3.Vecs(torsions_atom_pos[..., 0, :]))\n\n    # Compute the position of the forth atom in this frame (y and z coordinate\n    # define the chi angle)\n    # r3.Vecs (B, T, N, torsions=7)\n    forth_atom_rel_pos = r3.rigids_mul_vecs(\n        r3.invert_rigids(torsion_frames),\n        r3.vecs_from_tensor(torsions_atom_pos[..., 3, :]))\n\n    # Normalize to have the sin and cos of the torsion angle.\n    # paddle.Tensor (B, T, N, torsions=7, sincos=2)\n    torsion_angles_sin_cos = paddle.stack(\n        [forth_atom_rel_pos.z, forth_atom_rel_pos.y], axis=-1)\n    torsion_angles_sin_cos /= paddle.sqrt(\n        
paddle.sum(paddle.square(torsion_angles_sin_cos),\n                   axis=-1,\n                   keepdim=True) + 1e-8)\n\n    # Mirror psi, because we computed it from the Oxygen-atom.\n    torsion_angles_sin_cos *= paddle.to_tensor(\n        [1., 1., -1., 1., 1., 1., 1.]).reshape(\n            [1, 1, 1, 7, 1])  # [None, None, None, :, None]\n\n    # Create alternative angles for ambiguous atom names.\n    chi_is_ambiguous = batched_gather(\n        paddle.to_tensor(residue_constants.chi_pi_periodic), aatype)\n    # chi_is_ambiguous (B, T, N, torsions=4)\n    mirror_torsion_angles = paddle.concat(\n        [\n            paddle.ones([num_batch, num_temp, num_res, 3]),\n            1.0 - 2.0 * chi_is_ambiguous\n        ],\n        axis=-1)\n    # mirror_torsion_angles (B, T, N, torsions=7)\n    alt_torsion_angles_sin_cos = torsion_angles_sin_cos * mirror_torsion_angles.unsqueeze(\n        axis=-1)  # [:, :, :, :, None]\n\n    if placeholder_for_undefined:\n        # Add placeholder torsions in place of undefined torsion angles\n        # (e.g. N-terminus pre-omega)\n        placeholder_torsions = paddle.stack(\n            [\n                paddle.ones(torsion_angles_sin_cos.shape[:-1]),\n                paddle.zeros(torsion_angles_sin_cos.shape[:-1])\n            ],\n            axis=-1)\n        torsion_angles_sin_cos = torsion_angles_sin_cos * torsion_angles_mask.unsqueeze(\n            axis=-1) + placeholder_torsions * (\n                1 - torsion_angles_mask.unsqueeze(axis=-1))\n        alt_torsion_angles_sin_cos = alt_torsion_angles_sin_cos * torsion_angles_mask.unsqueeze(\n            axis=-1) + placeholder_torsions * (\n                1 - torsion_angles_mask.unsqueeze(axis=-1))\n\n    return {\n        'torsion_angles_sin_cos': torsion_angles_sin_cos,  # (B, T, N, 7, 2)\n        'alt_torsion_angles_sin_cos':\n        alt_torsion_angles_sin_cos,  # (B, T, N, 7, 2)\n        'torsion_angles_mask': torsion_angles_mask  # (B, T, N, 7)\n    }\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/attentions.py",
    "content": "\"\"\"attentions.py.\"\"\"\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport gc\nimport numpy as np\nimport paddle\nimport paddle.nn as nn\n\ntry:\n    from paddle import _legacy_C_ops as _C_ops\nexcept:\n    from paddle import _C_ops\n\nfrom ppfleetx.distributed.protein_folding import dap\n\nfrom .common import (\n    init_gate_linear,\n    init_final_linear,\n    mask_mean,\n    subbatch, )\n\n\nclass Attention(nn.Layer):\n    \"\"\"Multihead attention.\"\"\"\n\n    def __init__(self, config, global_config, q_dim, kv_dim, output_dim):\n        super(Attention, self).__init__()\n        self.config = config\n        self.global_config = global_config\n\n        num_head = self.config.num_head\n        key_dim = self.config.get('key_dim', q_dim)\n        value_dim = self.config.get('value_dim', kv_dim)\n\n        # TODO(GuoxiaWang): delete non fuse_attention related code on dcu\n        self.fuse_attention = self.global_config.fuse_attention\n        self.use_flash_attn = self.global_config.use_flash_attn\n        self.merge_qkv = (q_dim == kv_dim)\n\n        assert key_dim % num_head == 0\n        assert value_dim % num_head == 0\n        key_dim = key_dim // num_head\n        value_dim = value_dim // num_head\n\n        self.key_dim = key_dim\n        self.value_dim = value_dim\n\n        self.qkv_w = None\n        self.query_w = None\n        self.key_w = None\n       
 self.value_w = None\n        if self.merge_qkv and self.fuse_attention:\n            self.qkv_w = paddle.create_parameter(\n                [3, num_head, key_dim, q_dim],\n                'float32',\n                default_initializer=nn.initializer.XavierUniform())\n        else:\n            self.query_w = paddle.create_parameter(\n                [q_dim, num_head, key_dim],\n                'float32',\n                default_initializer=nn.initializer.XavierUniform())\n            self.key_w = paddle.create_parameter(\n                [kv_dim, num_head, key_dim],\n                'float32',\n                default_initializer=nn.initializer.XavierUniform())\n            self.value_w = paddle.create_parameter(\n                [kv_dim, num_head, value_dim],\n                'float32',\n                default_initializer=nn.initializer.XavierUniform())\n\n        self.gating_w = None\n        self.gating_b = None\n        if self.config.gating:\n            self.gating_w = paddle.create_parameter(\n                [q_dim, num_head, value_dim],\n                'float32',\n                default_initializer=nn.initializer.Constant(0.0))\n            self.gating_b = paddle.create_parameter(\n                [num_head, value_dim],\n                'float32',\n                default_initializer=nn.initializer.Constant(1.0))\n\n        if self.global_config.zero_init:\n            init = nn.initializer.Constant(0.0)\n        else:\n            init = nn.initializer.XavierUniform()\n\n        self.output_w = paddle.create_parameter(\n            [num_head, value_dim, output_dim],\n            'float32',\n            default_initializer=init)\n        self.output_b = paddle.create_parameter(\n            [output_dim],\n            'float32',\n            default_initializer=nn.initializer.Constant(0.0))\n\n    def forward(self, q_data, m_data, bias, nonbatched_bias=None):\n        \"\"\"Builds Attention module.\n        \n        Args:\n            q_data (float): 
A tensor of queries, shape [batch, row_size, N_queries, q_channels].\n            m_data (float): A tensor of memories from which the keys and values are\n                projected, shape [batch, row_size, N_keys, m_channels].\n            bias (float): A bias for the attention, shape [batch, row_size, num_head, N_queries, N_keys].\n            nonbatched_bias (float): Shared bias, shape [N_queries, N_keys].\n\n        Returns:\n            A float32 tensor of shape [batch_size, row_size, N_queries, output_dim].\n        \"\"\"\n        if self.fuse_attention:\n            if nonbatched_bias is not None:\n                nonbatched_bias = paddle.unsqueeze(nonbatched_bias, axis=1)\n\n            import paddle.incubate.nn.functional as F\n            output = F.fused_gate_attention(\n                query=q_data,\n                key=m_data,\n                query_weight=self.query_w,\n                key_weight=self.key_w,\n                value_weight=self.value_w,\n                qkv_weight=self.qkv_w,\n                gate_linear_weight=self.gating_w,\n                gate_linear_bias=self.gating_b,\n                out_linear_weight=self.output_w,\n                out_linear_bias=self.output_b,\n                nonbatched_bias=nonbatched_bias,\n                attn_mask=bias,\n                has_gating=self.config.gating,\n                merge_qkv=self.merge_qkv,\n                use_flash_attn=self.use_flash_attn, )\n        else:\n            c = self.key_dim**(-0.5)\n            q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) * c\n            k = paddle.einsum('nbka,ahc->nbkhc', m_data, self.key_w)\n            v = paddle.einsum('nbka,ahc->nbkhc', m_data, self.value_w)\n            logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q, k) + bias\n\n            if nonbatched_bias is not None:\n                logits += paddle.unsqueeze(nonbatched_bias, axis=1)\n\n            weights = nn.functional.softmax(logits)\n            weighted_avg = 
paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v)\n\n            if self.config.gating:\n                gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data,\n                                            self.gating_w) + self.gating_b\n                gate_values = nn.functional.sigmoid(gate_values)\n                weighted_avg *= gate_values\n\n            output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg,\n                                   self.output_w) + self.output_b\n        return output\n\n\nclass GlobalAttention(nn.Layer):\n    \"\"\"Global attention.\n\n    Jumper et al. (2021) Suppl. Alg. 19 \"MSAColumnGlobalAttention\" lines 2-7\n    \"\"\"\n\n    def __init__(self, config, global_config, q_dim, kv_dim, output_dim):\n        super(GlobalAttention, self).__init__()\n        self.config = config\n        self.global_config = global_config\n\n        num_head = self.config.num_head\n        key_dim = self.config.get('key_dim', q_dim)\n        value_dim = self.config.get('value_dim', kv_dim)\n\n        assert key_dim % num_head == 0\n        assert value_dim % num_head == 0\n        key_dim = key_dim // num_head\n        value_dim = value_dim // num_head\n\n        self.key_dim = key_dim\n        self.value_dim = value_dim\n\n        self.query_w = paddle.create_parameter(\n            [q_dim, num_head, key_dim],\n            'float32',\n            default_initializer=nn.initializer.XavierUniform())\n        self.key_w = paddle.create_parameter(\n            [kv_dim, key_dim],\n            'float32',\n            default_initializer=nn.initializer.XavierUniform())\n        self.value_w = paddle.create_parameter(\n            [kv_dim, value_dim],\n            'float32',\n            default_initializer=nn.initializer.XavierUniform())\n\n        if self.config.gating:\n            self.gating_w = paddle.create_parameter(\n                [q_dim, num_head, value_dim],\n                'float32',\n                
default_initializer=nn.initializer.Constant(0.0))\n            self.gating_b = paddle.create_parameter(\n                [num_head, value_dim],\n                'float32',\n                default_initializer=nn.initializer.Constant(1.0))\n\n        if self.global_config.zero_init:\n            init = nn.initializer.Constant(0.0)\n        else:\n            init = nn.initializer.XavierUniform()\n\n        self.output_w = paddle.create_parameter(\n            [num_head, value_dim, output_dim],\n            'float32',\n            default_initializer=init)\n        self.output_b = paddle.create_parameter(\n            [output_dim],\n            'float32',\n            default_initializer=nn.initializer.Constant(0.0))\n\n    def forward(self, q_data, m_data, q_mask):\n        \"\"\"Builds Attention module.\n        \n        Args:\n            q_data (float): A tensor of queries, shape [batch, row_size, N_queries, q_channels].\n            m_data (float): A tensor of memories from which the keys and values are\n                projected, shape [batch, row_size, N_keys, m_channels].\n            q_mask (float): A tensor of mask.\n\n        Returns:\n            A float32 tensor of output.\n        \"\"\"\n\n        k = paddle.einsum('nbka,ac->nbkc', m_data, self.key_w)\n        v = paddle.einsum('nbka,ac->nbkc', m_data, self.value_w)\n\n        # NOTE: differ from non-global version using q_avg for attn\n        q_avg = mask_mean(q_mask, q_data, axis=2)\n        c = self.key_dim**(-0.5)\n        q = paddle.einsum('nba,ahc->nbhc', q_avg, self.query_w) * c\n\n        q_mask_ = paddle.unsqueeze(q_mask, axis=2)[..., 0]\n        bias = 1e9 * (q_mask_ - 1.)\n\n        logits = paddle.einsum('nbhc,nbkc->nbhk', q, k) + bias\n        weights = nn.functional.softmax(logits)\n        weighted_avg = paddle.einsum('nbhk,nbkc->nbhc', weights, v)\n\n        if self.config.gating:\n            gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data,\n                                     
   self.gating_w) + self.gating_b\n            gate_values = nn.functional.sigmoid(gate_values)\n            weighted_avg = paddle.unsqueeze(weighted_avg, axis=2)\n            weighted_avg *= gate_values\n\n            output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg,\n                                   self.output_w) + self.output_b\n        else:\n            output = paddle.einsum('nbhc,hco->nbo', weighted_avg,\n                                   self.output_w) + self.output_b\n            output = paddle.unsqueeze(output, axis=-1)\n\n        return output\n\n\nclass MSARowAttentionWithPairBias(nn.Layer):\n    \"\"\"MSA per-row attention biased by the pair representation.\n\n    Jumper et al. (2021) Suppl. Alg. 7 \"MSARowAttentionWithPairBias\"\n    \"\"\"\n\n    def __init__(self, channel_num, config, global_config, is_extra_msa):\n        super(MSARowAttentionWithPairBias, self).__init__()\n        self.channel_num = channel_num\n        self.config = config\n        self.global_config = global_config\n        self.is_extra_msa = is_extra_msa\n        assert config.orientation == 'per_row'\n\n        if is_extra_msa:\n            self.query_norm = nn.LayerNorm(channel_num['extra_msa_channel'])\n        else:\n            self.query_norm = nn.LayerNorm(channel_num['msa_channel'])\n\n        self.feat_2d_norm = nn.LayerNorm(channel_num['pair_channel'])\n        self.feat_2d_weights = paddle.create_parameter(\n            [channel_num['pair_channel'], self.config.num_head],\n            'float32',\n            default_initializer=nn.initializer.Normal(\n                std=1. 
/ np.sqrt(channel_num['pair_channel'])))\n\n        if is_extra_msa:\n            extra_msa_channel = channel_num['extra_msa_channel']\n            self.attention = Attention(self.config, self.global_config,\n                                       extra_msa_channel, extra_msa_channel,\n                                       extra_msa_channel)\n        else:\n            msa_channel = channel_num['msa_channel']\n            self.attention = Attention(self.config, self.global_config,\n                                       msa_channel, msa_channel, msa_channel)\n\n    def forward(self, msa_act, msa_mask, pair_act):\n        \"\"\"MSARowAttention with masks.\n        \n        Args:\n            msa_act (float): A tensor of msa_act.\n            msa_mask (float): A tensor of msa_mask.\n            pair_act (float): A tensor of pair_act.\n\n        Returns:\n            A float32 tensor of msa_act.\n        \"\"\"\n\n        pair_act = self.feat_2d_norm(pair_act)\n\n        # [B, N_res//dap_size, N_res, cz], [cz, head] => [B, head, N_res//dap_size, N_res]\n        nonbatched_bias_before = paddle.einsum('nqkc,ch->nhqk', pair_act,\n                                               self.feat_2d_weights)\n\n        # [B, head, N_res//dap_size, N_res] => [B, head, N_res, N_res]\n        nonbatched_bias = dap.all_gather(nonbatched_bias_before, axis=2)\n        # if not self.training:\n        if not self.training and self.global_config.low_memory is True:\n            del nonbatched_bias_before\n            gc.collect()\n        nonbatched_bias = dap.all_gather_opp(nonbatched_bias, axis=2)\n\n        # [B, N_seq, N_res] => [B, N_seq//dap_size, N_res]\n        msa_mask = dap.scatter(msa_mask, axis=1)\n\n        bias = 1e9 * (msa_mask - 1.)\n        # [B, N_seq//dap_size, N_res] => [B, N_seq//dap_size, 1, 1, N_res]\n        bias = paddle.unsqueeze(bias, axis=[2, 3])\n        msa_act = self.query_norm(msa_act)\n\n        if not self.training or (self.is_extra_msa and\n             
                    self.config.use_subbatch):\n            # low memory mode using subbatch\n            subbatch_size = self.config.subbatch_size\n            if not self.training:\n                subbatch_size = self.global_config.subbatch_size\n            sb_attn = subbatch(\n                self.attention, [0, 1, 2], [1, 1, 1],\n                subbatch_size,\n                1,\n                same_arg_idx={1: 0})\n            msa_act = sb_attn(msa_act, msa_act, bias, nonbatched_bias)\n        else:\n            msa_act = self.attention(msa_act, msa_act, bias, nonbatched_bias)\n\n        return msa_act\n\n\nclass MSAColumnGlobalAttention(nn.Layer):\n    \"\"\"MSA per-column global attention.\n\n    Jumper et al. (2021) Suppl. Alg. 19 \"MSAColumnGlobalAttention\"\n    \"\"\"\n\n    def __init__(self, channel_num, config, global_config):\n        super(MSAColumnGlobalAttention, self).__init__()\n        self.channel_num = channel_num\n        self.config = config\n        self.global_config = global_config\n        assert config.orientation == 'per_column'\n\n        extra_msa_channel = channel_num['extra_msa_channel']\n        self.query_norm = nn.LayerNorm(extra_msa_channel)\n        self.attention = GlobalAttention(self.config, self.global_config,\n                                         extra_msa_channel, extra_msa_channel,\n                                         extra_msa_channel)\n\n    def forward(self, msa_act, msa_mask):\n        \"\"\"MSAColumnGlobalAttention.\n        \n        Args:\n            msa_act (float): A tensor of msa_act.\n            msa_mask (float): A tensor of msa_mask.\n\n        Returns:\n            A float32 tensor of msa_act.\n        \"\"\"\n\n        # scatter if using dap, otherwise do nothing\n        # [B, N_seq, N_res] => [B, N_seq, N_res//dap_size]\n        msa_mask = dap.scatter(msa_mask, axis=2)\n\n        msa_act = paddle.transpose(msa_act, [0, 2, 1, 3])\n        msa_mask = paddle.transpose(msa_mask, [0, 2, 
1])\n\n        bias = 1e9 * (msa_mask - 1.)\n        bias = paddle.unsqueeze(bias, axis=[2, 3])\n\n        msa_mask = paddle.unsqueeze(msa_mask, axis=-1)\n        msa_act = self.query_norm(msa_act)\n\n        if not self.training:\n            # low memory mode using subbatch\n            sb_attn = subbatch(\n                self.attention, [0, 1, 2], [1, 1, 1],\n                self.global_config.subbatch_size,\n                1,\n                same_arg_idx={1: 0})\n            msa_act = sb_attn(msa_act, msa_act, msa_mask)\n        else:\n            msa_act = self.attention(msa_act, msa_act, msa_mask)\n\n        msa_act = paddle.transpose(msa_act, [0, 2, 1, 3])\n        return msa_act\n\n\nclass MSAColumnAttention(nn.Layer):\n    \"\"\"MSA per-column attention.\n\n    Jumper et al. (2021) Suppl. Alg. 8 \"MSAColumnAttention\"\n    \"\"\"\n\n    def __init__(self, channel_num, config, global_config):\n        super(MSAColumnAttention, self).__init__()\n        self.channel_num = channel_num\n        self.config = config\n        self.global_config = global_config\n        assert config.orientation == 'per_column'\n\n        msa_channel = channel_num['msa_channel']\n        self.query_norm = nn.LayerNorm(msa_channel)\n        self.attention = Attention(self.config, self.global_config,\n                                   msa_channel, msa_channel, msa_channel)\n\n    def forward(self, msa_act, msa_mask):\n        \"\"\"MSAColumnAttention.\n        \n        Args:\n            msa_act (float): A tensor of msa_act.\n            msa_mask (float): A tensor of msa_mask.\n\n        Returns:\n            A float32 tensor of msa_act.\n        \"\"\"\n\n        # scatter if using dap, otherwise do nothing\n        # [B, N_seq, N_res] => [B, N_seq, N_res//dap_size]\n        msa_mask = dap.scatter(msa_mask, axis=2)\n\n        msa_act = paddle.transpose(msa_act, [0, 2, 1, 3])\n        msa_mask = paddle.transpose(msa_mask, [0, 2, 1])\n\n        bias = 1e9 * (msa_mask - 1.)\n    
    bias = paddle.unsqueeze(bias, axis=[2, 3])\n\n        msa_act = self.query_norm(msa_act)\n        if not self.training:\n            # low memory mode using subbatch\n            sb_attn = subbatch(\n                self.attention, [0, 1, 2], [1, 1, 1],\n                self.global_config.subbatch_size,\n                1,\n                same_arg_idx={1: 0})\n            msa_act = sb_attn(msa_act, msa_act, bias)\n        else:\n            msa_act = self.attention(msa_act, msa_act, bias)\n\n        msa_act = paddle.transpose(msa_act, [0, 2, 1, 3])\n        return msa_act\n\n\nclass TriangleAttention(nn.Layer):\n    \"\"\"Triangle Attention.\n\n    Jumper et al. (2021) Suppl. Alg. 13 \"TriangleAttentionStartingNode\"\n    Jumper et al. (2021) Suppl. Alg. 14 \"TriangleAttentionEndingNode\"\n    \"\"\"\n\n    def __init__(self,\n                 channel_num,\n                 config,\n                 global_config,\n                 name='triangle_attention'):\n        super(TriangleAttention, self).__init__()\n        self.channel_num = channel_num\n        self.config = config\n        self.global_config = global_config\n\n        assert config.orientation in ['per_row', 'per_column']\n\n        self.query_norm = nn.LayerNorm(\n            channel_num['pair_channel'], name='query_norm')\n        self.feat_2d_weights = paddle.create_parameter(\n            [channel_num['pair_channel'], self.config.num_head],\n            'float32',\n            default_initializer=nn.initializer.Normal(\n                std=1. 
/ np.sqrt(channel_num['pair_channel'])))\n\n        self.attention = Attention(\n            self.config, self.global_config, channel_num['pair_channel'],\n            channel_num['pair_channel'], channel_num['pair_channel'])\n\n    def forward(self, pair_act, pair_mask):\n        \"\"\"Builds TriangleAttention module.\n\n        Args:\n            pair_act (float): [batch, N_res, N_res, c_z] pair activations tensor\n            pair_mask (float): [batch, N_res, N_res] mask of non-padded regions in the tensor.\n\n        Returns:\n            Update to pair_act, shape [batch, N_res, N_res, c_z].\n        \"\"\"\n        if self.config.orientation == 'per_column':\n            pair_act = pair_act.transpose([0, 2, 1, 3])\n            pair_mask = pair_mask.transpose([0, 2, 1])\n\n        # [B, N_res//dap_size, N_res]\n        bias = 1e9 * (pair_mask - 1.)\n        # [B, N_res//dap_size, 1, 1, N_res]\n        bias = paddle.unsqueeze(bias, axis=[2, 3])\n\n        pair_act = self.query_norm(pair_act)\n\n        # [B, N_res//dap_size, N_res, cz], [cz, head] => [B, head, N_res//dap_size, N_res]\n        nonbatched_bias_before = paddle.einsum('bqkc,ch->bhqk', pair_act,\n                                               self.feat_2d_weights)\n\n        # # [B, head, N_res//dap_size, N_res] => [B, head, N_res, N_res]\n        nonbatched_bias = dap.all_gather(nonbatched_bias_before, axis=2)\n        # if not self.training:\n        if not self.training and self.global_config.low_memory is True:\n            del nonbatched_bias_before\n            gc.collect()\n        nonbatched_bias = dap.all_gather_opp(nonbatched_bias, axis=2)\n\n        if not self.training:\n            # low memory mode using subbatch\n            sb_attn = subbatch(\n                self.attention, [0, 1, 2], [1, 1, 1],\n                self.global_config.subbatch_size,\n                1,\n                same_arg_idx={1: 0})\n            pair_act = sb_attn(pair_act, pair_act, bias, nonbatched_bias)\n      
  else:\n            pair_act = self.attention(pair_act, pair_act, bias,\n                                      nonbatched_bias)\n\n        if self.config.orientation == 'per_column':\n            pair_act = pair_act.transpose([0, 2, 1, 3])\n\n        return pair_act\n\n\nclass TriangleMultiplication(nn.Layer):\n    \"\"\"Triangle multiplication layer (\"outgoing\" or \"incoming\").\n\n    Jumper et al. (2021) Suppl. Alg. 11 \"TriangleMultiplicationOutgoing\"\n    Jumper et al. (2021) Suppl. Alg. 12 \"TriangleMultiplicationIncoming\"\n    \"\"\"\n\n    def __init__(self,\n                 channel_num,\n                 config,\n                 global_config,\n                 name='triangle_multiplication'):\n        super(TriangleMultiplication, self).__init__()\n        self.channel_num = channel_num\n        self.config = config\n        self.global_config = global_config\n\n        Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear\n\n        self.layer_norm_input = nn.LayerNorm(\n            self.channel_num['pair_channel'], name='layer_norm_input')\n        self.left_projection = Linear(\n            self.channel_num['pair_channel'],\n            self.config.num_intermediate_channel,\n            name='left_projection')\n        self.right_projection = Linear(\n            self.channel_num['pair_channel'],\n            self.config.num_intermediate_channel,\n            name='right_projection')\n        self.left_gate = Linear(\n            self.channel_num['pair_channel'],\n            self.config.num_intermediate_channel,\n            name='left_gate')\n        init_gate_linear(self.left_gate)\n        self.right_gate = Linear(\n            self.channel_num['pair_channel'],\n            self.config.num_intermediate_channel,\n            name='right_gate')\n        init_gate_linear(self.right_gate)\n\n        # line 4\n        self.center_layer_norm = nn.LayerNorm(\n            
self.config.num_intermediate_channel, name='center_layer_norm')\n        self.output_projection = Linear(\n            self.config.num_intermediate_channel,\n            self.channel_num['pair_channel'],\n            name='output_projection')\n        init_final_linear(self.output_projection)\n        # line 3\n        self.gating_linear = Linear(\n            self.channel_num['pair_channel'],\n            self.channel_num['pair_channel'],\n            name='output_projection')\n        init_gate_linear(self.gating_linear)\n\n    def forward(self, act, mask):\n        \"\"\"Builds TriangleMultiplication module.\n\n        Args:\n            act (float): Pair activations, shape [batch, N_res, N_res, c_z]\n            mask (float): Pair mask, shape [batch, N_res, N_res].\n\n        Returns:\n            Outputs, same shape/type as act.\n        \"\"\"\n        # Outgoing [batch, N_res//dap_size, N_res] => [batch, N_res//dap_size, N_res, 1]\n        # Incoming [batch, N_res, N_res//dap_size] => [batch, N_res, N_res//dap_size, 1] \n        mask = paddle.unsqueeze(mask, axis=-1)  # [batch, N_res, N_res, 1]\n\n        # Outgoing [B, N_res//dap_size, N_res, c_z]\n        # Incoming [B, N_res, N_res//dap_size, c_z]\n        act = self.layer_norm_input(act)  # line 1\n\n        # if not self.training:\n        if not self.training and self.global_config.low_memory is True:\n            # Note(GuoxiaWang): using inplace version to save memory(low_mem=True).\n            left_proj_act = self.left_gate(act)\n            left_proj_act.sigmoid_()\n            left_proj_act.multiply_(self.left_projection(act))\n            left_proj_act.multiply_(mask)\n\n            right_proj_act_before = self.right_gate(act)\n            right_proj_act_before.sigmoid_()\n            right_proj_act_before.multiply_(self.right_projection(act))\n            right_proj_act_before.multiply_(mask)\n\n        else:\n            # Outgoing [B, N_res//dap_size, N_res, c_z] => [B, N_res//dap_size, 
N_res, num_intermediate_channel]\n            # Incoming [B, N_res, N_res//dap_size, c_z] => [B, N_res, N_res//dap_size, num_intermediate_channel]\n            left_proj_act = mask * self.left_projection(act)\n            right_proj_act = mask * self.right_projection(act)\n\n            # Outgoing [B, N_res//dap_size, N_res, c_z] => [B, N_res//dap_size, N_res, num_intermediate_channel]\n            # Incoming [B, N_res, N_res//dap_size, c_z] => [B, N_res, N_res//dap_size, num_intermediate_channel]\n            left_gate_values = nn.functional.sigmoid(self.left_gate(act))\n            right_gate_values = nn.functional.sigmoid(self.right_gate(act))\n\n            # Outgoing [B, N_res//dap_size, N_res, num_intermediate_channel]\n            # Incoming [B, N_res, N_res//dap_size, num_intermediate_channel]\n            left_proj_act = left_proj_act * left_gate_values\n            right_proj_act_before = right_proj_act * right_gate_values\n\n        # \"Outgoing\" edges equation: 'ikc,jkc->ijc'\n        # \"Incoming\" edges equation: 'kjc,kic->ijc'\n        # Note on the Suppl. Alg. 
11 & 12 notation:\n        # For the \"outgoing\" edges, a = left_proj_act and b = right_proj_act\n        # For the \"incoming\" edges, it's swapped:\n        #   b = left_proj_act and a = right_proj_act\n\n        if self.config.equation == 'ikc,jkc->ijc':\n            # Outgoing\n            # [B, N_res//dap_size, N_res, num_intermediate_channel] => [B, N_res, N_res, num_intermediate_channel]\n            right_proj_act = dap.all_gather(right_proj_act_before, axis=1)\n            # if not self.training:\n            if not self.training and self.global_config.low_memory is True:\n                del right_proj_act_before\n                gc.collect()\n        elif self.config.equation == 'kjc,kic->ijc':\n            # Incoming\n            # [B, N_res, N_res//dap_size, num_intermediate_channel] => [B, N_res, N_res, num_intermediate_channel]\n            right_proj_act = dap.all_gather(right_proj_act_before, axis=2)\n            # if not self.training:\n            if not self.training and self.global_config.low_memory is True:\n                del right_proj_act_before\n                gc.collect()\n        else:\n            raise ValueError('unknown equation.')\n\n        # Outgoing [B, N_res//dap_size, N_res, c_z]\n        # Incoming [B, N_res, N_res//dap_size, c_z]        \n\n        # if not self.training:\n        if not self.training and self.global_config.low_memory is True:\n            gate_values = self.gating_linear(act).sigmoid_()  # line 3\n        else:\n            gate_values = nn.functional.sigmoid(\n                self.gating_linear(act))  # line 3\n\n        if self.config.equation == 'ikc,jkc->ijc':\n            # Outgoing\n            dim, out_idx = 1, 1\n            equation = 'bikc,bjkc->bijc'\n\n            # [B, N_res, N_res, num_intermediate_channel]\n            right_proj_act_after = dap.all_gather_opp(right_proj_act, axis=1)\n        elif self.config.equation == 'kjc,kic->ijc':\n            # Incoming\n            dim, out_idx = 2, 
2\n            equation = 'bkjc,bkic->bijc'\n\n            # [B, N_res, N_res, num_intermediate_channel]\n            right_proj_act_after = dap.all_gather_opp(right_proj_act, axis=2)\n        else:\n            raise ValueError('unknown equation.')\n\n        if not self.training:\n            einsum_fn = subbatch(paddle.einsum, [1], [dim],\n                                 self.global_config.subbatch_size, out_idx)\n            act = einsum_fn(equation, left_proj_act, right_proj_act_after)\n        else:\n            # Outgoing equation = 'bikc,bjkc->bijc'\n            # [B, N_res//dap_size, N_res, num_intermediate_channel], [B, N_res, N_res, num_intermediate_channel]\n            # => [B, N_res//dap_size, N_res, num_intermediate_channel]\n\n            # Incoming equation = 'bkjc,bkic->bijc'\n            # [B, N_res, N_res//dap_size, num_intermediate_channel], [B, N_res, N_res, num_intermediate_channel]\n            # => [B, N_res, N_res//dap_size, num_intermediate_channel]\n            act = paddle.einsum(equation, left_proj_act, right_proj_act_after)\n\n        act = self.center_layer_norm(act)\n        act = self.output_projection(act)\n\n        act = act * gate_values\n\n        return act\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/common.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport functools\nimport numbers\nimport collections\nimport paddle\nimport paddle.nn as nn\nfrom paddle.distributed.fleet.utils import recompute\n\ntry:\n    from paddle import _legacy_C_ops as _C_ops\nexcept:\n    from paddle import _C_ops\n\n\ndef set_tensor_constant(tensor, constant):\n    tensor.set_value(paddle.full_like(tensor, constant))\n\n\ndef init_gate_linear(linear):\n    set_tensor_constant(linear.weight, 0)\n    set_tensor_constant(linear.bias, 1)\n\n\ndef init_final_linear(linear):\n    set_tensor_constant(linear.weight, 0)\n\n\ndef recompute_wrapper(func, *args, is_recompute=True):\n    \"\"\"Function wrapper for recompute\"\"\"\n    if is_recompute:\n        return recompute(func, *args)\n    else:\n        return func(*args)\n\n\ndef subbatch(f, arg_idx, dim, bs, out_idx, same_arg_idx={}):\n    \"\"\" Converts a function to one that applies to subbatch of an input\n    dimension.\n    Args:\n        f(Callable): original function.\n        arg_idx([int]): indices of the inputs to be subbatched.\n        dim([int]): index of the dimension to be subbatched.\n        bs(int): subbatch size.\n        out_idx(int): index of the output dimension that needs stacking\n        same_arg_idx(dict), optional: index of same arg mapping. 
e.g {1: 0} means arg[1] == arg[0],\n                            we assign _args[1] = _args[0] avoiding slice repeatly.\n    Returns:\n        converted function.\n    \"\"\"\n\n    @functools.wraps(f)\n    def wrapper(*args, **kwargs):\n\n        assert len(arg_idx) == len(\n            dim\n        ), f'Number of batching args and number of batching dims should match.'\n\n        inps = [args[i] for i in arg_idx]\n        dim_width = [inp.shape[d] for inp, d in zip(inps, dim)]\n        assert len(set(dim_width)) == 1, f'Batch sizes should be kept equal.'\n\n        inp_dim = {inp: d for inp, d in zip(inps, dim)}\n\n        dim_width = dim_width[0]\n        if dim_width < bs:\n            return f(*args, **kwargs)\n\n        outs = []\n        for slice_at in np.arange(0, dim_width, bs):\n            _args = []\n            for i, inp in enumerate(args):\n                if i in same_arg_idx:\n                    assert i > same_arg_idx[\n                        i], f\"expect i > same_arg_idx[i], but got i: {i} and same_arg_idx[i]: {same_arg_idx[i]}\"\n                    _args.append(_args[same_arg_idx[i]])\n                elif i in arg_idx:\n                    inp = inp.slice([inp_dim[inp]], [slice_at],\n                                    [slice_at + bs])\n                    _args.append(inp)\n                else:\n                    _args.append(inp)\n            outs.append(f(*_args, **kwargs))\n\n        return paddle.concat(outs, out_idx)\n\n    return wrapper\n\n\ndef batched_gather(params, indices, axis=0, batch_dims=0):\n    # Implement gather with batching, like tensorflow:\n    # https://www.tensorflow.org/api_docs/python/tf/gather#batching\n    # print(params.shape, indices.shape, axis)\n    p, i = params, indices\n    rank = len(p.shape)\n    axis = (rank + axis) % rank\n    # The stride of axis\n    stride = p.shape[batch_dims + axis]\n\n    if batch_dims == 0 and len(i.shape) == 1:\n        return paddle.gather(p, i, axis=axis)\n\n    elif 
batch_dims == 0:\n        flat_i = i.reshape([-1])\n        gathered = paddle.gather(p, flat_i, axis=axis)\n        shape = p.shape[:axis] + i.shape\n        if axis < rank - 1:\n            shape += params.shape[axis + 1:]\n        return gathered.reshape(shape)\n\n    b = batch_dims\n    a = axis\n    assert p.shape[:b] == i.shape[:b]\n    bn = np.prod(p.shape[:b])\n\n    # Shift batch dimensions right to bundle with axis\n    if a > 0:\n        perm = list(range(rank))\n        perm = perm[b:(b + a)] + perm[:b] + perm[(b + a):]\n        p = p.transpose(perm)\n\n    # Merge params' batch+axis\n    p = p.reshape(p.shape[:a] + [-1] + p.shape[(b + a + 1):])\n\n    # indices = [Batch..., Index...]\n    # Expand the index values across batch elements\n    strides = paddle.arange(bn, dtype=\"int64\").unsqueeze(-1) * stride\n    i = i.reshape([bn, -1])\n    flat_i = paddle.flatten(i + strides)\n\n    # Do gather\n    gathered = paddle.gather(p, flat_i, axis=axis)\n\n    # Unbundle batch and index dimensions\n    unbundled_shape = p.shape[:a] + indices.shape + p.shape[a + 1:]\n    gathered = gathered.reshape(unbundled_shape)\n\n    # Shift batch dimensions back to the left\n    if a > 0:\n        perm = list(range(len(unbundled_shape)))\n        perm = perm[a:(a + b)] + perm[:a] + perm[(a + b):]\n        gathered = gathered.transpose(perm)\n\n    return gathered\n\n\ndef mask_mean(mask, value, axis=None, drop_mask_channel=False, eps=1e-10):\n    if drop_mask_channel:\n        mask = mask[:, 0]\n\n    mask_shape = mask.shape\n    value_shape = value.shape\n    assert len(mask_shape) == len(value_shape)\n\n    if isinstance(axis, numbers.Integral):\n        axis = [axis]\n    elif axis is None:\n        axis = list(range(len(mask_shape)))\n\n    assert isinstance(axis, collections.abc.Iterable), \\\n        'axis needs to be either an iterable, integer or \"None\"'\n\n    broadcast_factor = 1.\n    for axis_ in axis:\n        value_size = value_shape[axis_]\n        
mask_size = mask_shape[axis_]\n        if mask_size == 1:\n            broadcast_factor *= value_size\n        else:\n            assert mask_size == value_size\n\n    return (paddle.sum(mask * value, axis=axis) /\n            (paddle.sum(mask, axis=axis) * broadcast_factor + eps))\n\n\nclass Transition(nn.Layer):\n    \"\"\"Transition layer.\n\n    Jumper et al. (2021) Suppl. Alg. 9 \"MSATransition\"\n    Jumper et al. (2021) Suppl. Alg. 15 \"PairTransition\"\n    \"\"\"\n\n    def __init__(self, channel_num, config, global_config, is_extra_msa,\n                 transition_type):\n        super(Transition, self).__init__()\n        assert transition_type in ['msa_transition', 'pair_transition']\n        self.channel_num = channel_num\n        self.config = config\n        self.global_config = global_config\n        self.is_extra_msa = is_extra_msa\n        self.transition_type = transition_type\n\n        Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear\n\n        if transition_type == 'msa_transition' and is_extra_msa:\n            in_dim = channel_num['extra_msa_channel']\n        elif transition_type == 'msa_transition' and not is_extra_msa:\n            in_dim = channel_num['msa_channel']\n        elif transition_type == 'pair_transition':\n            in_dim = channel_num['pair_channel']\n\n        self.input_layer_norm = nn.LayerNorm(in_dim)\n        self.transition1 = Linear(\n            in_dim,\n            int(in_dim * self.config.num_intermediate_factor),\n            weight_attr=paddle.ParamAttr(\n                initializer=nn.initializer.KaimingNormal()))\n\n        if self.global_config.zero_init:\n            last_init = nn.initializer.Constant(0.0)\n        else:\n            last_init = nn.initializer.TruncatedNormal()\n\n        self.transition2 = Linear(\n            int(in_dim * self.config.num_intermediate_factor),\n            in_dim,\n            
weight_attr=paddle.ParamAttr(initializer=last_init))\n\n    def forward(self, act, mask):\n        act = self.input_layer_norm(act)\n\n        def transition_module(x):\n            x = self.transition1(x)\n            x = nn.functional.relu(x)\n            x = self.transition2(x)\n            return x\n\n        if not self.training:\n            # low memory mode using subbatch\n            sb_transition = subbatch(transition_module, [0], [1],\n                                     self.global_config.subbatch_size, 1)\n            act = sb_transition(act)\n        else:\n            act = transition_module(act)\n\n        return act\n\n\nclass Dropout(nn.Layer):\n    def __init__(self, p=0.5, axis=None, mode=\"upscale_in_train\", name=None):\n        super(Dropout, self).__init__()\n\n        if not isinstance(p, (float, int)):\n            raise TypeError(\"p argument should be a number\")\n        if p < 0 or p > 1:\n            raise ValueError(\"p argument should between 0 and 1\")\n\n        mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer\n        if mode not in ('downscale_in_infer', 'upscale_in_train'):\n            raise ValueError(\n                \"mode argument should be 'downscale_in_infer' or 'upscale_in_train'\"\n            )\n\n        if axis and not isinstance(axis, (int, list, tuple)):\n            raise TypeError(\"datatype of axis argument should be int or list\")\n\n        self.p = p\n        self.axis = axis\n        self.mode = mode\n        self.name = name\n\n    def forward(self, input):\n        # fast return for p == 0\n        if self.p == 0:\n            return input\n\n        if self.axis == None:\n            out = nn.functional.dropout(\n                input,\n                p=self.p,\n                axis=self.axis,\n                training=self.training,\n                mode=self.mode,\n                name=self.name)\n        else:\n            seed = None\n            drop_axes 
= [self.axis] if isinstance(self.axis,\n                                                  int) else list(self.axis)\n            if paddle.static.default_main_program().random_seed != 0:\n                seed = paddle.static.default_main_program().random_seed\n\n            out, mask = _C_ops.dropout_nd(\n                input, 'dropout_prob', self.p, 'is_test', not self.training,\n                'fix_seed', seed is not None, 'seed', seed if seed is not None\n                else 0, 'dropout_implementation', self.mode, 'axis', drop_axes)\n\n        return out\n\n    def extra_repr(self):\n        name_str = ', name={}'.format(self.name) if self.name else ''\n        return 'p={}, axis={}, mode={}{}'.format(self.p, self.axis, self.mode,\n                                                 name_str)\n\n\ndef dgram_from_positions(positions, num_bins, min_bin, max_bin):\n    lower_breaks = paddle.linspace(min_bin, max_bin, num_bins)\n    lower_breaks = paddle.square(lower_breaks)\n    upper_breaks = paddle.concat([\n        lower_breaks[1:], paddle.full(\n            shape=[1], fill_value=1e8, dtype='float32')\n    ])\n\n    def _squared_difference(x, y):\n        return paddle.square(x - y)\n\n    dist2 = paddle.sum(_squared_difference(\n        paddle.unsqueeze(\n            positions, axis=-2),\n        paddle.unsqueeze(\n            positions, axis=-3)),\n                       axis=-1,\n                       keepdim=True)\n\n    dgram = ((dist2 > lower_breaks.astype(dist2.dtype)).astype('float32') *\n             (dist2 < upper_breaks.astype(dist2.dtype)).astype('float32'))\n    return dgram\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/evoformer.py",
    "content": "\"\"\"evoformer.py.\"\"\"\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport gc\nimport paddle\nimport paddle.nn as nn\n\nfrom ppfleetx.distributed.protein_folding import bp, dap\n\nfrom .attentions import (\n    MSARowAttentionWithPairBias,\n    MSAColumnGlobalAttention,\n    MSAColumnAttention,\n    TriangleMultiplication,\n    TriangleAttention, )\n\nfrom .common import (\n    Transition,\n    Dropout,\n    recompute_wrapper,\n    dgram_from_positions, )\n\nfrom .template import (TemplateEmbedding, )\nfrom .outer_product_mean import (OuterProductMean, )\n\nfrom . import (\n    residue_constants,\n    all_atom, )\n\n\nclass EvoformerIteration(nn.Layer):\n    \"\"\"Single iteration (block) of Evoformer stack.\n\n    Jumper et al. (2021) Suppl. Alg. 
6 \"EvoformerStack\" lines 2-10\n    \"\"\"\n\n    def __init__(self, channel_num, config, global_config, is_extra_msa=False):\n        super(EvoformerIteration, self).__init__()\n        self.channel_num = channel_num\n        self.config = config\n        self.global_config = global_config\n        self.is_extra_msa = is_extra_msa\n\n        assert self.global_config.outer_product_mean_position in [\n            'origin', 'middle', 'first', 'end'\n        ]\n\n        # Row-wise Gated Self-attention with Pair Bias\n        self.msa_row_attention_with_pair_bias = MSARowAttentionWithPairBias(\n            channel_num, self.config.msa_row_attention_with_pair_bias,\n            self.global_config, is_extra_msa)\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.msa_row_attention_with_pair_bias)\n        self.msa_row_attn_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        if self.is_extra_msa:\n            self.msa_column_global_attention = MSAColumnGlobalAttention(\n                channel_num, config.msa_column_attention, global_config)\n            dropout_rate, dropout_axis = self._parse_dropout_params(\n                self.msa_column_global_attention)\n            self.msa_col_attn_dropout = nn.Dropout(\n                dropout_rate, axis=dropout_axis) \\\n                    if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n        else:\n            self.msa_column_attention = MSAColumnAttention(\n                channel_num, config.msa_column_attention, global_config)\n            dropout_rate, dropout_axis = self._parse_dropout_params(\n                self.msa_column_attention)\n            self.msa_col_attn_dropout = nn.Dropout(\n                dropout_rate, axis=dropout_axis) \\\n                    if not self.global_config.use_dropout_nd else Dropout(dropout_rate, 
axis=dropout_axis)\n\n        self.msa_transition = Transition(\n            channel_num, self.config.msa_transition, self.global_config,\n            is_extra_msa, 'msa_transition')\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.msa_transition)\n        self.msa_transition_dropout = nn.Dropout(\n            dropout_rate, axis=dropout_axis) \\\n                if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        # OuterProductMean\n        self.outer_product_mean = OuterProductMean(\n            channel_num,\n            self.config.outer_product_mean,\n            self.global_config,\n            self.is_extra_msa,\n            name='outer_product_mean')\n\n        # Dropout\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.outer_product_mean)\n        self.outer_product_mean_dropout = nn.Dropout(\n            dropout_rate, axis=dropout_axis) \\\n                if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        # Triangle Multiplication.\n        self.triangle_multiplication_outgoing = TriangleMultiplication(\n            channel_num,\n            self.config.triangle_multiplication_outgoing,\n            self.global_config,\n            name='triangle_multiplication_outgoing')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.triangle_multiplication_outgoing)\n        self.triangle_outgoing_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        self.triangle_multiplication_incoming = TriangleMultiplication(\n            channel_num,\n            self.config.triangle_multiplication_incoming,\n            self.global_config,\n            name='triangle_multiplication_incoming')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            
self.triangle_multiplication_incoming)\n        self.triangle_incoming_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        # TriangleAttention.\n        self.triangle_attention_starting_node = TriangleAttention(\n            channel_num,\n            self.config.triangle_attention_starting_node,\n            self.global_config,\n            name='triangle_attention_starting_node')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.triangle_attention_starting_node)\n        self.triangle_starting_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        self.triangle_attention_ending_node = TriangleAttention(\n            channel_num,\n            self.config.triangle_attention_ending_node,\n            self.global_config,\n            name='triangle_attention_ending_node')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.triangle_attention_ending_node)\n        self.triangle_ending_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        # Pair transition.\n        self.pair_transition = Transition(\n            channel_num, self.config.pair_transition, self.global_config,\n            is_extra_msa, 'pair_transition')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.pair_transition)\n        self.pair_transition_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n    def _parse_dropout_params(self, module):\n        \"\"\"tbd.\"\"\"\n\n        dropout_rate = 0.0 if self.global_config.deterministic else \\\n            
module.config.dropout_rate\n        dropout_axis = None\n        if module.config.shared_dropout:\n            dropout_axis = {\n                'per_row': [0, 2, 3],\n                'per_column': [0, 1, 3],\n            }[module.config.orientation]\n\n        return dropout_rate, dropout_axis\n\n    def outer_product_mean_origin(self, msa_act, pair_act, masks):\n        \"\"\"tbd.\"\"\"\n\n        assert bp.get_world_size(\n        ) == 1, \"Branch Parallel degree must be 1 for outer_product_mean_origin\"\n\n        msa_mask, pair_mask = masks['msa'], masks['pair']\n\n        # [B, N_seq//dap_size, N_res, c_m]\n        residual = self.msa_row_attention_with_pair_bias(msa_act, msa_mask,\n                                                         pair_act)\n        residual = self.msa_row_attn_dropout(residual)\n        msa_act = msa_act + residual\n\n        # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res//dap_size, c_m]\n        msa_act = dap.row_to_col(msa_act)\n\n        if self.is_extra_msa:\n            # [B, N_seq, N_res//dap_size, c_m]\n            residual = self.msa_column_global_attention(msa_act, msa_mask)\n            residual = self.msa_col_attn_dropout(residual)\n            msa_act = msa_act + residual\n\n            # [B, N_seq, N_res//dap_size, c_m]\n            residual = self.msa_transition(msa_act, msa_mask)\n            residual = self.msa_transition_dropout(residual)\n            msa_act = msa_act + residual\n\n        else:\n            # [B, N_seq, N_res//dap_size, c_m]\n            residual = self.msa_column_attention(msa_act, msa_mask)\n            residual = self.msa_col_attn_dropout(residual)\n            msa_act = msa_act + residual\n\n            # [B, N_seq, N_res//dap_size, c_m]\n            residual = self.msa_transition(msa_act, msa_mask)\n            residual = self.msa_transition_dropout(residual)\n            msa_act = msa_act + residual\n\n        # [B, N_res//dap_size, N_res, c_z]\n        residual = 
self.outer_product_mean(msa_act, msa_mask)\n        outer_product_mean = self.outer_product_mean_dropout(residual)\n        # if not self.training:  # for inference\n        if not self.training and self.global_config.low_memory is True:\n            pair_act.add_(outer_product_mean)\n            del outer_product_mean\n            gc.collect()\n        else:\n            pair_act = pair_act + outer_product_mean\n\n        # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq//dap_size, N_res, c_m]\n        msa_act = dap.col_to_row(msa_act)\n\n        # scatter if using dap, otherwise do nothing\n        pair_mask_row = dap.scatter(pair_mask, axis=1)\n        pair_mask_col = dap.scatter(pair_mask, axis=2)\n\n        # [B, N_res//dap_size, N_res, c_z]\n        # TODO(GuoxiaWang): why have diffrence whether remove pair_act = pair_act.clone()\n        # pair_act = pair_act.clone()\n        residual = self.triangle_multiplication_outgoing(pair_act,\n                                                         pair_mask_row)\n        residual = self.triangle_outgoing_dropout(residual)\n        # if not self.training:  # for inference\n        if not self.training and self.global_config.low_memory is True:\n            pair_act.add_(residual)\n            del residual\n            gc.collect()\n        else:\n            pair_act = pair_act + residual\n\n        # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z]\n        pair_act = dap.row_to_col(pair_act)\n        # [B, N_res, N_res//dap_size, c_z]\n        residual = self.triangle_multiplication_incoming(pair_act,\n                                                         pair_mask_col)\n        residual = self.triangle_incoming_dropout(residual)\n        # if not self.training:  # for inference\n        if not self.training and self.global_config.low_memory is True:\n            pair_act.add_(residual)\n            del residual\n            gc.collect()\n        else:\n            pair_act = pair_act + 
residual\n\n        # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z]\n        pair_act = dap.col_to_row(pair_act)\n        # [B, N_res//dap_size, N_res, c_z]\n        residual = self.triangle_attention_starting_node(pair_act,\n                                                         pair_mask_row)\n        residual = self.triangle_starting_dropout(residual)\n        # if not self.training:  # for inference\n        if not self.training and self.global_config.low_memory is True:\n            pair_act.add_(residual)\n            del residual\n            gc.collect()\n        else:\n            pair_act = pair_act + residual\n\n        # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z]\n        pair_act = dap.row_to_col(pair_act)\n        # [B, N_res, N_res//dap_size, c_z]\n        residual = self.triangle_attention_ending_node(pair_act, pair_mask_col)\n        residual = self.triangle_ending_dropout(residual)\n        # if not self.training:  # for inference\n        if not self.training and self.global_config.low_memory is True:\n            pair_act.add_(residual)\n            del residual\n            gc.collect()\n        else:\n            pair_act = pair_act + residual\n\n        residual = self.pair_transition(pair_act, pair_mask)\n        residual = self.pair_transition_dropout(residual)\n        # if not self.training:  # for inference\n        if not self.training and self.global_config.low_memory is True:\n            pair_act.add_(residual)\n            del residual\n            gc.collect()\n        else:\n            pair_act = pair_act + residual\n\n        # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z]\n        pair_act = dap.col_to_row(pair_act)\n\n        return msa_act, pair_act\n\n    def outer_product_mean_first(self, msa_act, pair_act, masks):\n        \"\"\"tbd.\"\"\"\n\n        raise NotImplementedError(\n            \"BP or DAP does not support outer_product_mean_first\")\n\n  
  def outer_product_mean_end(self, msa_act, pair_act, masks):\n        \"\"\"tbd.\"\"\"\n\n        msa_mask, pair_mask = masks['msa'], masks['pair']\n\n        if bp.get_world_size() > 1:\n            # Note(GuoxiaWang): add zeros trigger the status of stop_gradient=False within recompute context.\n            pair_act = pair_act + paddle.zeros_like(pair_act)\n\n            # Note(GuoxiaWang): reduce the pair_act's gradient from msa branch and pair branch\n            if not pair_act.stop_gradient:\n                pair_act._register_grad_hook(bp.all_reduce)\n\n            if bp.get_rank_in_group() == 0:\n                # [B, N_seq//dap_size, N_res, c_m]\n                residual = self.msa_row_attention_with_pair_bias(\n                    msa_act, msa_mask, pair_act)\n                residual = self.msa_row_attn_dropout(residual)\n                msa_act = msa_act + residual\n\n                # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res//dap_size, c_m]\n                msa_act = dap.row_to_col(msa_act)\n\n                if self.is_extra_msa:\n                    # [B, N_seq, N_res//dap_size, c_m]\n                    residual = self.msa_column_global_attention(msa_act,\n                                                                msa_mask)\n                    residual = self.msa_col_attn_dropout(residual)\n                    msa_act = msa_act + residual\n\n                    # [B, N_seq, N_res//dap_size, c_m]\n                    residual = self.msa_transition(msa_act, msa_mask)\n                    residual = self.msa_transition_dropout(residual)\n                    msa_act = msa_act + residual\n\n                else:\n                    # [B, N_seq, N_res//dap_size, c_m]\n                    residual = self.msa_column_attention(msa_act, msa_mask)\n                    residual = self.msa_col_attn_dropout(residual)\n                    msa_act = msa_act + residual\n\n                    # [B, N_seq, N_res//dap_size, c_m]\n                    
residual = self.msa_transition(msa_act, msa_mask)\n                    residual = self.msa_transition_dropout(residual)\n                    msa_act = msa_act + residual\n\n                # [B, N_res//dap_size, N_res, c_z]\n                residual = self.outer_product_mean(msa_act, msa_mask)\n                outer_product_mean = self.outer_product_mean_dropout(residual)\n\n                # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq//dap_size, N_res, c_m]\n                msa_act = dap.col_to_row(msa_act)\n\n            if bp.get_rank_in_group() == 1:\n                # scatter if using dap, otherwise do nothing\n                pair_mask_row = dap.scatter(pair_mask, axis=1)\n                pair_mask_col = dap.scatter(pair_mask, axis=2)\n\n                # [B, N_res//dap_size, N_res, c_z]\n                residual = self.triangle_multiplication_outgoing(pair_act,\n                                                                 pair_mask_row)\n                residual = self.triangle_outgoing_dropout(residual)\n                pair_act = pair_act + residual\n\n                # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z]\n                pair_act = dap.row_to_col(pair_act)\n                # [B, N_res, N_res//dap_size, c_z]\n                residual = self.triangle_multiplication_incoming(pair_act,\n                                                                 pair_mask_col)\n                residual = self.triangle_incoming_dropout(residual)\n                pair_act = pair_act + residual\n\n                # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z]\n                pair_act = dap.col_to_row(pair_act)\n                # [B, N_res//dap_size, N_res, c_z]\n                residual = self.triangle_attention_starting_node(pair_act,\n                                                                 pair_mask_row)\n                residual = self.triangle_starting_dropout(residual)\n                pair_act = 
pair_act + residual\n\n                # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z]\n                pair_act = dap.row_to_col(pair_act)\n                # [B, N_res, N_res//dap_size, c_z]\n                residual = self.triangle_attention_ending_node(pair_act,\n                                                               pair_mask_col)\n                residual = self.triangle_ending_dropout(residual)\n                pair_act = pair_act + residual\n\n                residual = self.pair_transition(pair_act, pair_mask)\n                residual = self.pair_transition_dropout(residual)\n                pair_act = pair_act + residual\n\n                # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z]\n                pair_act = dap.col_to_row(pair_act)\n\n                outer_product_mean = paddle.zeros_like(pair_act)\n                outer_product_mean.stop_gradient = pair_act.stop_gradient\n\n            # TODO(GuoxiaWang): fix PyLayer ctx illegal access\n            msa_act = paddle.assign(msa_act)\n            pair_act = paddle.assign(pair_act)\n\n            msa_act, pair_act = bp.sync_evoformer_results(outer_product_mean,\n                                                          msa_act, pair_act)\n            # TODO(GuoxiaWang): fix PyLayer ctx illegal access\n            pair_act = paddle.assign(pair_act)\n            return msa_act, pair_act\n\n        else:\n            # [B, N_seq//dap_size, N_res, c_m]\n            residual = self.msa_row_attention_with_pair_bias(msa_act, msa_mask,\n                                                             pair_act)\n            residual = self.msa_row_attn_dropout(residual)\n            msa_act = msa_act + residual\n\n            # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res//dap_size, c_m]\n            msa_act = dap.row_to_col(msa_act)\n\n            if self.is_extra_msa:\n                # [B, N_seq, N_res//dap_size, c_m]\n                residual = 
self.msa_column_global_attention(msa_act, msa_mask)\n                residual = self.msa_col_attn_dropout(residual)\n                msa_act = msa_act + residual\n\n                # [B, N_seq, N_res//dap_size, c_m]\n                residual = self.msa_transition(msa_act, msa_mask)\n                residual = self.msa_transition_dropout(residual)\n                msa_act = msa_act + residual\n\n            else:\n                # [B, N_seq, N_res//dap_size, c_m]\n                residual = self.msa_column_attention(msa_act, msa_mask)\n                residual = self.msa_col_attn_dropout(residual)\n                msa_act = msa_act + residual\n\n                # [B, N_seq, N_res//dap_size, c_m]\n                residual = self.msa_transition(msa_act, msa_mask)\n                residual = self.msa_transition_dropout(residual)\n                msa_act = msa_act + residual\n\n            # [B, N_res//dap_size, N_res, c_z]\n            residual = self.outer_product_mean(msa_act, msa_mask)\n            outer_product_mean = self.outer_product_mean_dropout(residual)\n\n            # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq//dap_size, N_res, c_m]\n            msa_act = dap.col_to_row(msa_act)\n\n            # scatter if using dap, otherwise do nothing\n            pair_mask_row = dap.scatter(pair_mask, axis=1)\n            pair_mask_col = dap.scatter(pair_mask, axis=2)\n\n            # [B, N_res//dap_size, N_res, c_z]\n            # TODO(GuoxiaWang): why have diffrence whether remove pair_act = pair_act.clone()\n            # pair_act = pair_act.clone()\n            residual = self.triangle_multiplication_outgoing(pair_act,\n                                                             pair_mask_row)\n            residual = self.triangle_outgoing_dropout(residual)\n            pair_act = pair_act + residual\n\n            # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z]\n            pair_act = dap.row_to_col(pair_act)\n            # [B, N_res, 
N_res//dap_size, c_z]\n            residual = self.triangle_multiplication_incoming(pair_act,\n                                                             pair_mask_col)\n            residual = self.triangle_incoming_dropout(residual)\n            pair_act = pair_act + residual\n\n            # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z]\n            pair_act = dap.col_to_row(pair_act)\n            # [B, N_res//dap_size, N_res, c_z]\n            residual = self.triangle_attention_starting_node(pair_act,\n                                                             pair_mask_row)\n            residual = self.triangle_starting_dropout(residual)\n            pair_act = pair_act + residual\n\n            # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z]\n            pair_act = dap.row_to_col(pair_act)\n            # [B, N_res, N_res//dap_size, c_z]\n            residual = self.triangle_attention_ending_node(pair_act,\n                                                           pair_mask_col)\n            residual = self.triangle_ending_dropout(residual)\n            pair_act = pair_act + residual\n\n            residual = self.pair_transition(pair_act, pair_mask)\n            residual = self.pair_transition_dropout(residual)\n            pair_act = pair_act + residual\n\n            # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z]\n            pair_act = dap.col_to_row(pair_act)\n\n            pair_act = pair_act + outer_product_mean\n\n            return msa_act, pair_act\n\n    def forward(self, msa_act, pair_act, masks):\n        \"\"\"tbd.\"\"\"\n\n        if self.global_config.outer_product_mean_position in [\n                'origin', 'middle'\n        ]:\n            msa_act, pair_act = self.outer_product_mean_origin(msa_act,\n                                                               pair_act, masks)\n\n        elif self.global_config.outer_product_mean_position == 'first':\n            
msa_act, pair_act = self.outer_product_mean_first(msa_act,\n                                                              pair_act, masks)\n\n        elif self.global_config.outer_product_mean_position == 'end':\n            msa_act, pair_act = self.outer_product_mean_end(msa_act, pair_act,\n                                                            masks)\n\n        else:\n            raise Error(\n                \"Only support outer_product_mean_position in ['origin', 'middle', ''first', 'end'] now!\"\n            )\n\n        return msa_act, pair_act\n\n\nclass DistEmbeddingsAndEvoformer(nn.Layer):\n    \"\"\"Embeds the input data and runs Evoformer.\n\n    Produces the MSA, single and pair representations.\n    Jumper et al. (2021) Suppl. Alg. 2 \"Inference\" line 5-18\n    \"\"\"\n\n    def __init__(self, channel_num, config, global_config):\n        super(DistEmbeddingsAndEvoformer, self).__init__()\n        self.channel_num = channel_num\n        self.config = config\n        self.global_config = global_config\n\n        Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear\n\n        # InputEmbedder\n        # Jumper et al. (2021) Suppl. Alg. 2 \"Inference\" line 5\n        # Jumper et al. (2021) Suppl. Alg. 3 \"InputEmbedder\"\n        self.preprocess_1d = Linear(\n            channel_num['target_feat'],\n            self.config.msa_channel,\n            name='preprocess_1d')\n        self.preprocess_msa = Linear(\n            channel_num['msa_feat'],\n            self.config.msa_channel,\n            name='preprocess_msa')\n        self.left_single = Linear(\n            channel_num['target_feat'],\n            self.config.pair_channel,\n            name='left_single')\n        self.right_single = Linear(\n            channel_num['target_feat'],\n            self.config.pair_channel,\n            name='right_single')\n\n        # RecyclingEmbedder\n        # Jumper et al. (2021) Suppl. Alg. 
2 \"Inference\" line 6\n        # Jumper et al. (2021) Suppl. Alg. 32 \"RecyclingEmbedder\"\n        if self.config.recycle_pos:\n            self.prev_pos_linear = Linear(self.config.prev_pos.num_bins,\n                                          self.config.pair_channel)\n\n        # RelPosEmbedder\n        # Jumper et al. (2021) Suppl. Alg. 4 \"relpos\"\n        # Jumper et al. (2021) Suppl. Alg. 5 \"one_hot\"\n        if self.config.max_relative_feature:\n            self.pair_activiations = Linear(\n                2 * self.config.max_relative_feature + 1,\n                self.config.pair_channel)\n\n        if self.config.recycle_features:\n            self.prev_msa_first_row_norm = nn.LayerNorm(\n                self.config.msa_channel)\n            self.prev_pair_norm = nn.LayerNorm(self.config.pair_channel)\n\n        # Embed templates into the pair activations.\n        # Jumper et al. (2021) Suppl. Alg. 2 \"Inference\" lines 9-13\n        if self.config.template.enabled:\n            self.channel_num['template_angle'] = 57\n            self.channel_num['template_pair'] = 88\n            self.template_embedding = TemplateEmbedding(\n                self.channel_num, self.config.template, self.global_config)\n\n        # ExtraMSAEmbedder\n        # Jumper et al. (2021) Suppl. Alg. 2 \"Inference\" lines 14-16\n        self.extra_msa_activations = Linear(\n            25,  # 23 (20aa+unknown+gap+mask) + 1 (has_del) + 1 (del_val)\n            self.config.extra_msa_channel)\n\n        # Extra MSA Stack.\n        # Jumper et al. (2021) Suppl. Alg. 
18 \"ExtraMsaStack\"\n        self.extra_msa_stack = nn.LayerList()\n        for _ in range(self.config.extra_msa_stack_num_block):\n            self.extra_msa_stack.append(\n                EvoformerIteration(\n                    self.channel_num,\n                    self.config.evoformer,\n                    self.global_config,\n                    is_extra_msa=True))\n\n        # Embed templates torsion angles\n        if self.config.template.enabled and self.config.template.embed_torsion_angles:\n            c = self.config.msa_channel\n            self.template_single_embedding = Linear(\n                self.channel_num['template_angle'], c)\n            self.template_projection = Linear(c, c)\n\n        # Main trunk of the network\n        # Jumper et al. (2021) Suppl. Alg. 2 \"Inference\" lines 17-18\n        self.evoformer_iteration = nn.LayerList()\n        for _ in range(self.config.evoformer_num_block):\n            self.evoformer_iteration.append(\n                EvoformerIteration(\n                    self.channel_num,\n                    self.config.evoformer,\n                    self.global_config,\n                    is_extra_msa=False))\n\n        self.single_activations = Linear(self.config.msa_channel,\n                                         self.config.seq_channel)\n\n    def _pseudo_beta_fn(self, aatype, all_atom_positions, all_atom_masks):\n        \"\"\"tbd.\"\"\"\n\n        gly_id = paddle.ones_like(aatype) * residue_constants.restype_order[\n            'G']\n        is_gly = paddle.equal(aatype, gly_id)\n\n        ca_idx = residue_constants.atom_order['CA']\n        cb_idx = residue_constants.atom_order['CB']\n\n        n = len(all_atom_positions.shape)\n        pseudo_beta = paddle.where(\n            paddle.tile(\n                paddle.unsqueeze(\n                    is_gly, axis=-1), [1] * len(is_gly.shape) + [3]),\n            paddle.squeeze(\n                all_atom_positions.slice([n - 2], [ca_idx], [ca_idx + 1]),\n      
          axis=-2),\n            paddle.squeeze(\n                all_atom_positions.slice([n - 2], [cb_idx], [cb_idx + 1]),\n                axis=-2))\n\n        if all_atom_masks is not None:\n            m = len(all_atom_masks)\n            pseudo_beta_mask = paddle.where(\n                is_gly,\n                paddle.squeeze(\n                    all_atom_masks.slice([m - 1], [ca_idx], [ca_idx + 1]),\n                    axis=-1),\n                paddle.squeeze(\n                    all_atom_masks.slice([m - 1], [cb_idx], [cb_idx + 1]),\n                    axis=-1))\n            pseudo_beta_mask = paddle.squeeze(pseudo_beta_mask, axis=-1)\n            return pseudo_beta, pseudo_beta_mask\n        else:\n            return pseudo_beta\n\n    def _create_extra_msa_feature(self, batch):\n        \"\"\"tbd.\"\"\"\n\n        # 23: 20aa + unknown + gap + bert mask\n        msa_1hot = nn.functional.one_hot(batch['extra_msa'], 23)\n        msa_feat = [\n            msa_1hot, paddle.unsqueeze(\n                batch['extra_has_deletion'], axis=-1), paddle.unsqueeze(\n                    batch['extra_deletion_value'], axis=-1)\n        ]\n        return paddle.concat(msa_feat, axis=-1)\n\n    def forward(self, batch):\n        \"\"\"tbd.\"\"\"\n\n        # InputEmbedder\n        # Jumper et al. (2021) Suppl. Alg. 2 \"Inference\" line 5\n        # Jumper et al. (2021) Suppl. Alg. 
3 \"InputEmbedder\"\n        preprocess_1d = self.preprocess_1d(batch['target_feat'])\n        # preprocess_msa = self.preprocess_msa(batch['msa_feat'])\n        msa_activations = paddle.unsqueeze(preprocess_1d, axis=1) + \\\n                    self.preprocess_msa(batch['msa_feat'])\n\n        right_single = self.right_single(\n            batch['target_feat'])  # 1, n_res, 22 -> 1, n_res, 128\n        right_single = paddle.unsqueeze(\n            right_single, axis=1)  # 1, n_res, 128 -> 1, 1, n_res, 128\n        left_single = self.left_single(\n            batch['target_feat'])  # 1, n_res, 22 -> 1, n_res, 128\n        left_single = paddle.unsqueeze(\n            left_single, axis=2)  # 1, n_res, 128 -> 1, n_res, 1, 128\n        pair_activations = left_single + right_single\n\n        if not self.training and self.global_config.low_memory is True:\n            del left_single\n            del right_single\n            gc.collect()\n\n            # [B, N_res, N_res, c_z] => [B, N_res//dap_size, N_res, c_z]\n            pair_activations = dap.scatter(pair_activations, axis=1)\n\n        mask_2d = paddle.unsqueeze(\n            batch['seq_mask'], axis=1) * paddle.unsqueeze(\n                batch['seq_mask'], axis=2)\n\n        # Inject previous outputs for recycling.\n        # Jumper et al. (2021) Suppl. Alg. 2 \"Inference\" line 6\n        # Jumper et al. (2021) Suppl. Alg. 
32 \"RecyclingEmbedder\"\n        if self.config.recycle_pos and 'prev_pos' in batch:\n            prev_pseudo_beta = self._pseudo_beta_fn(batch['aatype'],\n                                                    batch['prev_pos'], None)\n            dgram = dgram_from_positions(prev_pseudo_beta,\n                                         **self.config.prev_pos)\n            if not self.training and self.global_config.low_memory is True:\n                dgram = dap.scatter(dgram, axis=1)\n                pair_activations += self.prev_pos_linear(dgram)\n                del dgram\n                del prev_pseudo_beta\n                gc.collect()\n            else:\n                pair_activations += self.prev_pos_linear(dgram)\n\n        if self.config.recycle_features:\n            if 'prev_msa_first_row' in batch:\n                prev_msa_first_row = self.prev_msa_first_row_norm(batch[\n                    'prev_msa_first_row'])\n\n                # A workaround for `jax.ops.index_add`\n                msa_first_row = paddle.squeeze(\n                    msa_activations[:, 0, :], axis=1)\n                msa_first_row += prev_msa_first_row\n                msa_first_row = paddle.unsqueeze(msa_first_row, axis=1)\n                msa_activations = paddle.concat(\n                    [msa_first_row, msa_activations[:, 1:, :]], axis=1)\n                # if not self.training:  # for inference\n                if not self.training and self.global_config.low_memory is True:\n                    del prev_msa_first_row\n                    del msa_first_row\n                    gc.collect()\n\n            if 'prev_pair' in batch:\n                # if not self.training:  # for inference\n                if not self.training and self.global_config.low_memory is True:\n                    prev_pair = batch['prev_pair']\n                    prev_pair_gpu = prev_pair.cuda()\n                    prev_pair_gpu = dap.scatter(prev_pair_gpu, axis=1)\n                    
pair_activations += self.prev_pair_norm(prev_pair_gpu)\n                    del prev_pair_gpu\n                    gc.collect()\n                else:\n                    pair_activations += self.prev_pair_norm(batch['prev_pair'])\n\n        # RelPosEmbedder\n        # Jumper et al. (2021) Suppl. Alg. 4 \"relpos\"\n        # Jumper et al. (2021) Suppl. Alg. 5 \"one_hot\"\n        if self.config.max_relative_feature:\n            pos = batch['residue_index']  # [bs, N_res]\n            offset = paddle.unsqueeze(pos, axis=[-1]) - \\\n                paddle.unsqueeze(pos, axis=[-2])\n            rel_pos = nn.functional.one_hot(\n                paddle.clip(\n                    offset + self.config.max_relative_feature,\n                    min=0,\n                    max=2 * self.config.max_relative_feature),\n                2 * self.config.max_relative_feature + 1)\n\n            if not self.training and self.global_config.low_memory is True:\n                rel_pos = dap.scatter(rel_pos, axis=1)\n                rel_pos_bias = self.pair_activiations(rel_pos)\n                pair_activations += rel_pos_bias\n                del rel_pos\n                del rel_pos_bias\n                gc.collect()\n            else:\n                rel_pos_bias = self.pair_activiations(rel_pos)\n                pair_activations += rel_pos_bias\n\n        # TemplateEmbedder\n        # Jumper et al. (2021) Suppl. Alg. 2 \"Inference\" lines 9-13\n        if self.config.template.enabled:\n            template_batch = {\n                k: batch[k]\n                for k in batch if k.startswith('template_')\n            }\n            template_pair_repr = self.template_embedding(\n                pair_activations, template_batch, mask_2d)\n            pair_activations += template_pair_repr\n\n        # ExtraMSAEmbedder\n        # Jumper et al. (2021) Suppl. Alg. 
2 \"Inference\" lines 14-16\n        extra_msa_feat = self._create_extra_msa_feature(batch)\n        extra_msa_activations = self.extra_msa_activations(extra_msa_feat)\n        # if not self.training:  # for inference\n        if not self.training and self.global_config.low_memory is True:\n            del extra_msa_feat\n            gc.collect()\n\n        # ==================================================\n        #  Extra MSA Stack\n        # Jumper et al. (2021) Suppl. Alg. 18 \"ExtraMsaStack\"\n        # ==================================================\n\n        if not self.training and self.global_config.low_memory is True:\n            # scatter if using dap, otherwise do nothing\n            # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m]\n            extra_msa_activations = dap.scatter(extra_msa_activations, axis=1)\n            # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m]\n            msa_activations = dap.scatter(msa_activations, axis=1)\n\n        extra_msa_stack_input = {\n            'msa': extra_msa_activations,\n            'pair': pair_activations,\n        }\n\n        if not self.training and self.global_config.low_memory is True:\n            del pair_activations\n            gc.collect()\n\n        if bp.get_world_size() > 1:\n            extra_msa_stack_input['msa'] = bp.broadcast_grad_for_backward(\n                extra_msa_stack_input['msa'], 0)\n\n        if not self.training and self.global_config.low_memory is True:\n            pass\n        else:\n            # scatter if using dap, otherwise do nothing\n            # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m]\n            extra_msa_stack_input['msa'] = dap.scatter(\n                extra_msa_stack_input['msa'], axis=1)\n            # [B, N_res, N_res, c_z] => [B, N_res//dap_size, N_res, c_z]\n            extra_msa_stack_input['pair'] = dap.scatter(\n                extra_msa_stack_input['pair'], axis=1)\n\n        for idx, 
extra_msa_stack_iteration in enumerate(self.extra_msa_stack):\n            extra_msa_act, extra_pair_act = recompute_wrapper(\n                extra_msa_stack_iteration,\n                extra_msa_stack_input['msa'],\n                extra_msa_stack_input['pair'],\n                {'msa': batch['extra_msa_mask'],\n                 'pair': mask_2d},\n                is_recompute=self.training and\n                idx >= self.config.extra_msa_stack_recompute_start_block_index)\n            extra_msa_stack_output = {\n                'msa': extra_msa_act,\n                'pair': extra_pair_act\n            }\n            extra_msa_stack_input = {\n                'msa': extra_msa_stack_output['msa'],\n                'pair': extra_msa_stack_output['pair']\n            }\n\n        if not self.training and self.global_config.low_memory is True:\n            pass\n        else:\n            # gather if using dap, otherwise do nothing\n            # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res, c_z]\n            extra_msa_stack_output['pair'] = dap.gather(\n                extra_msa_stack_output['pair'], axis=1)\n\n        evoformer_input = {\n            'msa': msa_activations,\n            'pair': extra_msa_stack_output['pair'],\n        }\n\n        evoformer_masks = {\n            'msa': batch['msa_mask'],\n            'pair': mask_2d,\n        }\n\n        if not self.training and self.global_config.low_memory is True:\n            del extra_msa_stack_input\n            del extra_msa_stack_output\n            gc.collect()\n\n        # ==================================================\n        #  Template angle feat\n        # Jumper et al. (2021) Suppl. Alg. 
2 \"Inference\" lines 7-8\n        # ==================================================\n        if self.config.template.enabled and self.config.template.embed_torsion_angles:\n            num_templ, num_res = batch['template_aatype'].shape[1:]\n\n            aatype_one_hot = nn.functional.one_hot(batch['template_aatype'],\n                                                   22)\n            # Embed the templates aatype, torsion angles and masks.\n            # Shape (templates, residues, msa_channels)\n            ret = all_atom.atom37_to_torsion_angles(\n                aatype=batch['template_aatype'],\n                all_atom_pos=batch['template_all_atom_positions'],\n                all_atom_mask=batch['template_all_atom_masks'],\n                # Ensure consistent behaviour during testing:\n                placeholder_for_undefined=not self.global_config.zero_init)\n\n            template_features = paddle.concat(\n                [\n                    aatype_one_hot,\n                    paddle.reshape(ret['torsion_angles_sin_cos'],\n                                   [-1, num_templ, num_res, 14]),\n                    paddle.reshape(ret['alt_torsion_angles_sin_cos'],\n                                   [-1, num_templ, num_res, 14]),\n                    ret['torsion_angles_mask']\n                ],\n                axis=-1)\n\n            template_activations = self.template_single_embedding(\n                template_features)\n            template_activations = nn.functional.relu(template_activations)\n            template_activations = self.template_projection(\n                template_activations)\n\n            # Concatenate the templates to the msa.\n            evoformer_input['msa'] = paddle.concat(\n                [evoformer_input['msa'], template_activations], axis=1)\n\n            # Concatenate templates masks to the msa masks.\n            # Use mask from the psi angle, as it only depends on the backbone atoms\n            # from a single 
residue.\n            torsion_angle_mask = ret['torsion_angles_mask'][..., 2]\n            torsion_angle_mask = torsion_angle_mask.astype(evoformer_masks[\n                'msa'].dtype)\n            evoformer_masks['msa'] = paddle.concat(\n                [evoformer_masks['msa'], torsion_angle_mask], axis=1)\n\n        if bp.get_world_size() > 1:\n            evoformer_input['msa'] = bp.broadcast_grad_for_backward(\n                evoformer_input['msa'], 0)\n\n        # if self.training:\n        if not self.training and self.global_config.low_memory is True:\n            pass\n        else:\n            # scatter if using dap, otherwise do nothing\n            # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m]\n            evoformer_input['msa'] = dap.scatter(\n                evoformer_input['msa'], axis=1)\n            # [B, N_res, N_res, c_z] => [B, N_res//dap_size, N_res, c_z]\n            evoformer_input['pair'] = dap.scatter(\n                evoformer_input['pair'], axis=1)\n\n        # ==================================================\n        #  Main MSA Stack\n        # Jumper et al. (2021) Suppl. Alg. 
2 \"Inference\" lines 17-18\n        # ==================================================\n        for idx, evoformer_block in enumerate(self.evoformer_iteration):\n            msa_act, pair_act = recompute_wrapper(\n                evoformer_block,\n                evoformer_input['msa'],\n                evoformer_input['pair'],\n                evoformer_masks,\n                is_recompute=self.training and\n                idx >= self.config.evoformer_recompute_start_block_index)\n            evoformer_output = {'msa': msa_act, 'pair': pair_act}\n            evoformer_input = {\n                'msa': evoformer_output['msa'],\n                'pair': evoformer_output['pair'],\n            }\n\n        # gather if using dap, otherwise do nothing\n        # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res, c_m]\n        evoformer_output['msa'] = dap.gather(evoformer_output['msa'], axis=1)\n        # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res, c_z]\n        evoformer_output['pair'] = dap.gather(evoformer_output['pair'], axis=1)\n\n        msa_activations = evoformer_output['msa']\n        pair_activations = evoformer_output['pair']\n\n        if not self.training and self.global_config.low_memory is True:\n            pair_activations_cpu = pair_activations.cpu()\n            del pair_activations\n        single_activations = self.single_activations(msa_activations[:, 0])\n\n        # if not self.training and self.global_config.low_memory is True:\n        #     pair_act_out = pair_activations_cpu\n        # else:\n        #     pair_act_out = pair_activations\n\n        num_seq = batch['msa_feat'].shape[1]\n        output = {\n            'single': single_activations,\n            'pair': pair_activations_cpu if not self.training and\n            self.global_config.low_memory is True else pair_activations,\n            # Crop away template rows such that they are not used\n            # in MaskedMsaHead.\n            'msa': msa_activations[:, 
:num_seq],\n            'msa_first_row': msa_activations[:, 0],\n        }\n\n        return output\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/outer_product_mean.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\n\nfrom ppfleetx.distributed.protein_folding import dap\n\nfrom .common import subbatch\n\n\nclass OuterProductMean(nn.Layer):\n    \"\"\"Computes mean outer product.\n\n    Jumper et al. (2021) Suppl. Alg. 10 \"OuterProductMean\"\n    \"\"\"\n\n    def __init__(self,\n                 channel_num,\n                 config,\n                 global_config,\n                 is_extra_msa,\n                 name='outer_product_mean'):\n        super(OuterProductMean, self).__init__()\n        self.channel_num = channel_num\n        self.config = config\n        self.global_config = global_config\n\n        Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear\n\n        if is_extra_msa:\n            c_m = channel_num['extra_msa_channel']\n        else:\n            c_m = channel_num['msa_channel']\n\n        self.layer_norm_input = nn.LayerNorm(c_m, name='layer_norm_input')\n        self.left_projection = Linear(\n            c_m, self.config.num_outer_channel, name='left_projection')\n        self.right_projection = Linear(\n            c_m, self.config.num_outer_channel, name='right_projection')\n\n        if self.global_config.zero_init:\n            init_w = nn.initializer.Constant(value=0.0)\n        else:\n            init_w = 
nn.initializer.KaimingNormal()\n\n        self.output_w = paddle.create_parameter(\n            [\n                self.config.num_outer_channel, self.config.num_outer_channel,\n                channel_num['pair_channel']\n            ],\n            'float32',\n            default_initializer=init_w)\n        self.output_b = paddle.create_parameter(\n            [channel_num['pair_channel']],\n            'float32',\n            default_initializer=nn.initializer.Constant(value=0.0))\n\n    def forward(self, act, mask):\n        \"\"\"Builds OuterProductMean module.\n\n        Arguments:\n        act: MSA representation, shape [batch, N_seq, N_res, c_m].\n        mask: MSA mask, shape [batch, N_seq, N_res].\n\n        Returns:\n        Update to pair representation, shape [batch, N_res, N_res, c_z].\n        \"\"\"\n        # [B, N_seq, N_res//dap_size, c_m]\n        act = self.layer_norm_input(act)\n        # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq, N_res//dap_size, num_outer_channel]\n        right_act_before = self.right_projection(act)\n        # [B, N_seq, N_res//dap_size, num_outer_channel] => [B, N_seq, N_res, num_outer_channel]\n        right_act = dap.all_gather(right_act_before, axis=2)\n\n        # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq, N_res//dap_size, num_outer_channel]\n        left_act = self.left_projection(act)\n        # [B, N_seq, N_res] => [B, N_seq, N_res, 1]\n        mask = paddle.unsqueeze(mask, axis=-1)\n        # [B, N_seq, N_res, 1] => [B, N_seq, N_res//dap_size, 1]\n        mask_col = dap.scatter(mask, axis=2)\n        left_act = mask_col * left_act\n\n        # [B, N_seq, N_res//dap_size, 1], [B, N_seq, N_res, 1] => [B, N_res//dap_size, N_res, 1]\n        epsilon = 1e-3\n        norm = paddle.einsum('nabc,nadc->nbdc', mask_col, mask) + epsilon\n\n        def fast_einsum(equation, left_act, right_act):\n            assert equation == \"nacb,nade->ndceb\"\n            tmp = paddle.matmul(\n                x=paddle.reshape(\n 
                   right_act,\n                    [right_act.shape[0], right_act.shape[1], -1]),  # na(de)\n                y=paddle.reshape(\n                    left_act,\n                    [left_act.shape[0], left_act.shape[1], -1]),  # na(cb)\n                transpose_x=True,\n                transpose_y=False)  # n(de)(cb)\n            tmp = paddle.reshape(tmp, [\n                left_act.shape[0], right_act.shape[2], right_act.shape[3],\n                left_act.shape[2], left_act.shape[3]\n            ])\n            out = paddle.transpose(tmp, perm=[0, 1, 3, 2, 4])\n            return out\n\n        def compute_chunk(left_act, right_act):\n            # This is equivalent to\n            #\n            # act = jnp.einsum('abc,ade->dceb', left_act, right_act)\n            # act = jnp.einsum('dceb,cef->bdf', act, output_w) + output_b\n            #\n            # but faster. maybe for subbatch inference?\n\n            # [B, N_seq, N_res//dap_size, num_outer_channel] => [B, N_seq, num_outer_channel, N_res//dap_size]\n            left_act = left_act.transpose([0, 1, 3, 2])\n            # wait if using async communication and dap, otherwise do nothing\n            right_act_after = dap.all_gather_opp(right_act, axis=2)\n            # [B, N_seq, num_outer_channel, N_res//dap_size], [B, N_seq, N_res, num_outer_channel]\n            # => [B, N_res, num_outer_channel, num_outer_channel, N_res//dap_size]\n            act = fast_einsum('nacb,nade->ndceb', left_act, right_act_after)\n\n            # [B, N_res, num_outer_channel, num_outer_channel, N_res//dap_size], [num_outer_channel, num_outer_channel, c_z]\n            # => [B, N_res, N_res//dap_size, c_z]\n            act = paddle.einsum('ndceb,cef->ndbf', act,\n                                self.output_w) + self.output_b\n            # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z]\n            return act.transpose([0, 2, 1, 3])\n\n        if not self.training:\n            # low memory 
mode using subbatch\n            sb_chunk = subbatch(compute_chunk, [0], [2],\n                                self.config.chunk_size, 1)\n            act = sb_chunk(left_act, right_act)\n        else:\n            act = compute_chunk(left_act, right_act)\n\n        act = act / norm\n\n        return act\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/quat_affine.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Quaternion geometry modules.\n\nThis introduces a representation of coordinate frames that is based around a\n‘QuatAffine’ object. This object describes an array of coordinate frames.\nIt consists of vectors corresponding to the\norigin of the frames as well as orientations which are stored in two\nways, as unit quaternions as well as a rotation matrices.\nThe rotation matrices are derived from the unit quaternions and the two are kept\nin sync.\nFor an explanation of the relation between unit quaternions and rotations see\nhttps://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation\n\nThis representation is used in the model for the backbone frames.\n\nOne important thing to note here, is that while we update both representations\nthe jit compiler is going to ensure that only the parts that are\nactually used are executed.\n\"\"\"\n\nimport paddle\nimport functools\nimport numpy as np\nfrom typing import Tuple\n\nQUAT_TO_ROT = np.zeros((4, 4, 3, 3), dtype=np.float32)\n\nQUAT_TO_ROT[0, 0] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]  # rr\nQUAT_TO_ROT[1, 1] = [[1, 0, 0], [0, -1, 0], [0, 0, -1]]  # ii\nQUAT_TO_ROT[2, 2] = [[-1, 0, 0], [0, 1, 0], [0, 0, -1]]  # jj\nQUAT_TO_ROT[3, 3] = [[-1, 0, 0], [0, -1, 0], [0, 0, 1]]  # kk\n\nQUAT_TO_ROT[1, 2] = [[0, 2, 0], [2, 0, 0], [0, 0, 0]]  # ij\nQUAT_TO_ROT[1, 3] = [[0, 0, 2], [0, 0, 0], [2, 
0, 0]]  # ik\nQUAT_TO_ROT[2, 3] = [[0, 0, 0], [0, 0, 2], [0, 2, 0]]  # jk\n\nQUAT_TO_ROT[0, 1] = [[0, 0, 0], [0, 0, -2], [0, 2, 0]]  # ir\nQUAT_TO_ROT[0, 2] = [[0, 0, 2], [0, 0, 0], [-2, 0, 0]]  # jr\nQUAT_TO_ROT[0, 3] = [[0, -2, 0], [2, 0, 0], [0, 0, 0]]  # kr\n\nQUAT_MULTIPLY = np.zeros((4, 4, 4), dtype=np.float32)\nQUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0],\n                          [0, 0, 0, -1]]\n\nQUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1],\n                          [0, 0, -1, 0]]\n\nQUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0],\n                          [0, 1, 0, 0]]\n\nQUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0],\n                          [1, 0, 0, 0]]\n\nQUAT_MULTIPLY_BY_VEC = QUAT_MULTIPLY[:, 1:, :]\n\n\ndef rot_to_quat(rot):\n    \"\"\"Convert rotation matrix to quaternion.\n\n    Note that this function calls self_adjoint_eig which is extremely expensive on\n    the GPU. If at all possible, this function should run on the CPU.\n\n    Args:\n        rot: rotation matrix (see below for format). rotation matrix should be shape (..., 3, 3)\n\n    Returns:\n        Quaternion as (..., 4) tensor.\n    \"\"\"\n    rot = [[rot[..., i, j] for j in range(3)] for i in range(3)]\n    [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = rot\n\n    # pylint: disable=bad-whitespace\n    k = [[\n        xx + yy + zz,\n        zy - yz,\n        xz - zx,\n        yx - xy,\n    ], [\n        zy - yz,\n        xx - yy - zz,\n        xy + yx,\n        xz + zx,\n    ], [\n        xz - zx,\n        xy + yx,\n        yy - xx - zz,\n        yz + zy,\n    ], [\n        yx - xy,\n        xz + zx,\n        yz + zy,\n        zz - xx - yy,\n    ]]\n\n    k = (1. / 3.) 
* paddle.stack(\n        [paddle.stack(\n            x, axis=-1) for x in k], axis=-2)\n\n    # Get eigenvalues in non-decreasing order and associated.\n    _, qs = paddle.linalg.eigh(k)\n    return qs[..., -1]\n\n\ndef quat_to_rot(normalized_quat):\n    \"\"\"Convert a normalized quaternion to a rotation matrix. Quat (..., 4)\"\"\"\n\n    mat = paddle.unsqueeze(normalized_quat,\n                           [-1, -3])  # normalized_quat[..., None, :, None]\n    rot_tensor = paddle.sum(\n        paddle.to_tensor(np.reshape(QUAT_TO_ROT, (4, 4, 9))) *\n        normalized_quat[..., :, None, None] * mat,\n        axis=(-3, -2))  # (..., 4, 4, 9) -> (..., 9)\n    t_shape = rot_tensor.shape[:-1]\n    t_shape.extend([3, 3])\n    rot = paddle.reshape(rot_tensor, t_shape)  # Unstack. (..., 3, 3)\n    return rot\n\n\ndef quat_multiply_by_vec(quat, vec):\n    \"\"\"Multiply a quaternion by a pure-vector quaternion.\"\"\"\n    mat = paddle.unsqueeze(vec, [-1, -3])  # vec[..., None, :, None]\n    return paddle.sum(paddle.to_tensor(QUAT_MULTIPLY_BY_VEC) *\n                      quat[..., :, None, None] * mat,\n                      axis=(-3, -2))\n\n\ndef quat_multiply(quat1, quat2):\n    \"\"\"Multiply a quaternion by another quaternion.\"\"\"\n    mat = paddle.unsqueeze(quat2, [-1, -3])  # quat2[..., None, :, None]\n    return paddle.sum(paddle.to_tensor(QUAT_MULTIPLY) *\n                      quat1[..., :, None, None] * mat,\n                      axis=(-3, -2))\n\n\ndef apply_rot_to_vec(rot, vec, unstack=False):\n    \"\"\"Multiply rotation matrix by a vector. 
vec is a list.\n    Returns: a list of 3 tensors of the points\n    \"\"\"\n    if unstack:\n        x, y, z = [vec[..., i] for i in range(3)]\n    else:\n        x, y, z = vec\n    return [\n        rot[..., 0, 0] * x + rot[..., 0, 1] * y + rot[..., 0, 2] * z,\n        rot[..., 1, 0] * x + rot[..., 1, 1] * y + rot[..., 1, 2] * z,\n        rot[..., 2, 0] * x + rot[..., 2, 1] * y + rot[..., 2, 2] * z\n    ]\n\n\ndef apply_rot_to_vec_np(rot, vec, unstack=False):\n    \"\"\"Multiply rotation matrix by a vector. vec is a list.\n    Returns: a list of 3 tensors of the points\n    \"\"\"\n    if unstack:\n        x, y, z = [vec[..., i] for i in range(3)]\n    else:\n        x, y, z = vec\n    return [\n        rot[0][0] * x + rot[0][1] * y + rot[0][2] * z,\n        rot[1][0] * x + rot[1][1] * y + rot[1][2] * z,\n        rot[2][0] * x + rot[2][1] * y + rot[2][2] * z\n    ]\n\n\ndef apply_inverse_rot_to_vec(rot, vec):\n    \"\"\"Multiply the inverse of a rotation matrix by a vector. vec is a list.\n    Returns: a list of 3 tensors of the points\n    \"\"\"\n    # Inverse rotation is just transpose\n    x, y, z = vec\n    return [\n        rot[..., 0, 0] * x + rot[..., 1, 0] * y + rot[..., 2, 0] * z,\n        rot[..., 0, 1] * x + rot[..., 1, 1] * y + rot[..., 2, 1] * z,\n        rot[..., 0, 2] * x + rot[..., 1, 2] * y + rot[..., 2, 2] * z\n    ]\n\n\nclass QuatAffine(object):\n    \"\"\"Affine transformation represented by quaternion and vector.\"\"\"\n\n    def __init__(self,\n                 quaternion: paddle.Tensor,\n                 translation: paddle.Tensor,\n                 rotation=None,\n                 normalize=True):\n        \"\"\"Initialize from quaternion and translation.\n\n        Args:\n        quaternion: Rotation represented by a quaternion, to be applied\n            before translation.  Must be a unit quaternion unless normalize==True.\n            shape (batch, N_res, 4)\n        translation: Translation represented as a vector. 
(batch, N_res, 3)\n        rotation: Same rotation as the quaternion, represented as a (batch, N_res, 3, 3)\n            tensor.  If None, rotation will be calculated from the quaternion.\n        normalize: If True, l2 normalize the quaternion on input.\n        \"\"\"\n\n        if quaternion is not None:\n            assert quaternion.shape[-1] == 4\n\n        if normalize and quaternion is not None:\n            q_length = paddle.norm(quaternion, axis=-1)\n            quaternion = quaternion / q_length[..., None]\n\n        if rotation is None:\n            rotation = quat_to_rot(quaternion)\n\n        self.quaternion = quaternion\n        self.rotation = rotation\n        self.translation = translation\n\n        assert rotation.shape[-1] == 3 and rotation.shape[-2] == 3\n        assert translation.shape[-1] == 3\n\n    def to_tensor(self):\n        return paddle.concat([self.quaternion, self.translation], axis=-1)\n\n    def stop_rot_gradient(self):\n        \"\"\"\n            stop the gradient of rotations\n        \"\"\"\n        quat = self.quaternion\n        if not quat is None:\n            quat = quat.detach()\n        return QuatAffine(\n            quaternion=quat,\n            translation=self.translation,\n            rotation=self.rotation.detach(),\n            normalize=False)\n\n    def scale_translation(self, position_scale):\n        \"\"\"Return a new quat affine with a different scale for translation.\"\"\"\n\n        return QuatAffine(\n            self.quaternion,\n            position_scale * self.translation,\n            rotation=self.rotation,\n            normalize=False)\n\n    @classmethod\n    def from_tensor(cls, tensor, normalize=False):\n        assert tensor.shape[-1] == 7\n        quaternion = tensor[..., 0:4]\n        translation = tensor[..., 4:7]\n        return cls(quaternion, translation, normalize=normalize)\n\n    def pre_compose(self, update):\n        \"\"\"Return a new QuatAffine which applies the transformation 
update first.\n\n        Args:\n        update: Length-6 vector. 3-vector of x, y, and z such that the quaternion\n            update is (1, x, y, z) and zero for the 3-vector is the identity\n            quaternion. 3-vector for translation concatenated.\n\n        Returns:\n        New QuatAffine object.\n        \"\"\"\n        vector_quaternion_update = update[..., 0:3]\n        trans_update = [update[..., 3], update[..., 4], update[..., 5]]\n\n        new_quaternion = (self.quaternion + quat_multiply_by_vec(\n            self.quaternion, vector_quaternion_update))\n\n        trans_update = apply_rot_to_vec(self.rotation, trans_update)\n        trans_update = paddle.stack(trans_update, axis=-1)\n        new_translation = self.translation + trans_update\n\n        return QuatAffine(new_quaternion, new_translation)\n\n    def apply_to_point(self, point, extra_dims=0):\n        \"\"\"Apply affine to a point.\n\n        Args:\n        point: List of 3 tensors to apply affine.\n            each with shape [batch_size, num_residues, num_head*num_point_qk]\n        extra_dims:  Number of dimensions at the end of the transformed_point\n            shape that are not present in the rotation and translation.  
The most\n            common use is rotation N points at once with extra_dims=1 for use in a\n            network.\n\n        Returns:\n        Transformed point after applying affine.\n        \"\"\"\n        rotation = self.rotation  # [batch_size, num_residues, 3, 3]\n        translation = self.translation  # [batch_size, num_residues, 3]\n        for _ in range(extra_dims):\n            translation = paddle.unsqueeze(translation, axis=-2)\n            rotation = paddle.unsqueeze(rotation, axis=-3)\n\n        rot_point = apply_rot_to_vec(rotation, point)\n        return [\n            rot_point[0] + translation[..., 0],\n            rot_point[1] + translation[..., 1],\n            rot_point[2] + translation[..., 2]\n        ]\n\n    def invert_point(self, transformed_point, extra_dims=0):\n        \"\"\"Apply inverse of transformation to a point.\n\n        Args:\n        transformed_point: List of 3 tensors to apply affine\n        extra_dims:  Number of dimensions at the end of the transformed_point\n            shape that are not present in the rotation and translation.  
The most\n            common use is rotation N points at once with extra_dims=1 for use in a\n            network.\n\n        Returns:\n        Transformed point after applying affine.\n        \"\"\"\n        rotation = self.rotation\n        translation = self.translation\n        for _ in range(extra_dims):\n            translation = paddle.unsqueeze(translation, axis=-2)\n            rotation = paddle.unsqueeze(rotation, axis=-3)\n\n        rot_point = [\n            transformed_point[0] - translation[..., 0],\n            transformed_point[1] - translation[..., 1],\n            transformed_point[2] - translation[..., 2]\n        ]\n\n        return apply_inverse_rot_to_vec(rotation, rot_point)\n\n    def invert(self):\n        \"\"\"Return a new quat affine of the invert transformation.\"\"\"\n        pass  # TODO\n\n\n######Paddle Implementation\ndef _multiply(a, b):\n    a1 = a[..., 0, 0]\n    a2 = a[..., 0, 1]\n    a3 = a[..., 0, 2]\n    a11 = a[..., 1, 0]\n    a12 = a[..., 1, 1]\n    a13 = a[..., 1, 2]\n    a21 = a[..., 2, 0]\n    a22 = a[..., 2, 1]\n    a23 = a[..., 2, 2]\n    b1 = b[..., 0, 0]\n    b2 = b[..., 1, 0]\n    b3 = b[..., 0, 1]\n    b11 = b[..., 1, 1]\n    b12 = b[..., 2, 0]\n    b13 = b[..., 0, 2]\n    b21 = b[..., 1, 2]\n    b22 = b[..., 2, 1]\n    b23 = b[..., 2, 2]\n    return paddle.stack(\n        [\n            paddle.stack(\n                [\n                    a1 * b1 + a2 * b2 + a3 * b12,\n                    a1 * b3 + a2 * b11 + a3 * b22,\n                    a1 * b13 + a2 * b21 + a3 * b23\n                ],\n                axis=-1), paddle.stack(\n                    [\n                        a11 * b1 + a12 * b2 + a13 * b12,\n                        a11 * b3 + a12 * b11 + a13 * b22,\n                        a11 * b13 + a12 * b21 + a13 * b23\n                    ],\n                    axis=-1), paddle.stack(\n                        [\n                            a21 * b1 + a22 * b2 + a23 * b12,\n                            
a21 * b3 + a22 * b11 + a23 * b22,\n                            a21 * b13 + a22 * b21 + a23 * b23\n                        ],\n                        axis=-1)\n        ],\n        axis=-2)\n\n\ndef make_canonical_transform(\n        n_xyz: paddle.Tensor, ca_xyz: paddle.Tensor,\n        c_xyz: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]:\n    \"\"\"Returns translation and rotation matrices to canonicalize residue atoms.\n\n    Note that this method does not take care of symmetries. If you provide the\n    atom positions in the non-standard way, the N atom will end up not at\n    [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You\n    need to take care of such cases in your code.\n\n    Args:\n        n_xyz: An array of shape [batch, n_res, 3] of nitrogen xyz coordinates.\n        ca_xyz: An array of shape [batch, n_res, 3] of carbon alpha xyz coordinates.\n        c_xyz: An array of shape [batch, n_res, 3] of carbon xyz coordinates.\n\n    Returns:\n        A tuple (translation, rotation) where:\n        translation is an array of shape [batch, n_res, 3] defining the translation.\n        rotation is an array of shape [batch, n_res, 3, 3] defining the rotation.\n        After applying the translation and rotation to all atoms in a residue:\n        * All atoms will be shifted so that CA is at the origin,\n        * All atoms will be rotated so that C is at the x-axis,\n        * All atoms will be shifted so that N is in the xy plane.\n    \"\"\"\n    assert len(n_xyz.shape) == 3, n_xyz.shape\n    assert n_xyz.shape[-1] == 3, n_xyz.shape\n    assert n_xyz.shape == ca_xyz.shape == c_xyz.shape, (\n        n_xyz.shape, ca_xyz.shape, c_xyz.shape)\n\n    # Place CA at the origin.\n    translation = -ca_xyz\n    n_xyz = n_xyz + translation\n    c_xyz = c_xyz + translation\n\n    # Place C on the x-axis.\n    c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)]\n    # Rotate by angle c1 in the x-y plane (around the z-axis).\n    norm = 
paddle.sqrt(c_x**2 + c_y**2 + 1e-20)\n    sin_c1 = -c_y / norm\n    cos_c1 = c_x / norm\n    zeros = paddle.zeros_like(sin_c1)\n    ones = paddle.ones_like(sin_c1)\n\n    c1_rot_matrix = paddle.stack(\n        [cos_c1, -sin_c1, zeros, sin_c1, cos_c1, zeros, zeros, zeros, ones],\n        axis=-1)\n    c1_rot_matrix = c1_rot_matrix.reshape(sin_c1.shape + [3, 3])\n\n    # Rotate by angle c2 in the x-z plane (around the y-axis).\n    # norm = paddle.sqrt(1e-20 + c_x ** 2 + c_y ** 2 + c_z ** 2)\n    norm = paddle.sqrt(paddle.sum(c_xyz**2, axis=-1)) + 1e-20\n    sin_c2 = c_z / norm\n    cos_c2 = paddle.sqrt(c_x**2 + c_y**2) / norm\n    c2_rot_matrix = paddle.stack(\n        [cos_c2, zeros, sin_c2, zeros, ones, zeros, -sin_c2, zeros, cos_c2],\n        axis=-1)\n    c2_rot_matrix = c2_rot_matrix.reshape(sin_c2.shape + [3, 3])\n\n    c_rot_matrix = _multiply(c2_rot_matrix, c1_rot_matrix)\n    n_xyz = paddle.stack(\n        apply_rot_to_vec(\n            c_rot_matrix, n_xyz, unstack=True), axis=-1)\n\n    # Place N in the x-y plane.\n    _, n_y, n_z = [n_xyz[..., i] for i in range(3)]\n    # Rotate by angle alpha in the y-z plane (around the x-axis).\n    norm = paddle.sqrt(n_y**2 + n_z**2 + 1e-20)\n    sin_n = -n_z / norm\n    cos_n = n_y / norm\n    n_rot_matrix = paddle.stack(\n        [ones, zeros, zeros, zeros, cos_n, -sin_n, zeros, sin_n, cos_n],\n        axis=-1)\n    n_rot_matrix = n_rot_matrix.reshape(sin_n.shape + [3, 3])\n    # pylint: enable=bad-whitespace\n\n    return (translation, _multiply(n_rot_matrix, c_rot_matrix))\n\n\ndef make_transform_from_reference(\n        n_xyz: paddle.Tensor, ca_xyz: paddle.Tensor,\n        c_xyz: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]:\n    \"\"\"Returns rotation and translation matrices to convert from reference.\n\n    Note that this method does not take care of symmetries. 
If you provide the\n    atom positions in the non-standard way, the N atom will end up not at\n    [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You\n    need to take care of such cases in your code.\n\n    Args:\n        n_xyz: An array of shape [batch, n_res, 3] of nitrogen xyz coordinates.\n        ca_xyz: An array of shape [batch, n_res, 3] of carbon alpha xyz coordinates.\n        c_xyz: An array of shape [batch, n_res, 3] of carbon xyz coordinates.\n\n    Returns:\n        A tuple (rotation, translation) where:\n        rotation is an array of shape [batch, n_res, 3, 3] defining the rotation.\n        translation is an array of shape [batch, n_res, 3] defining the translation.\n        After applying the translation and rotation to the reference backbone,\n        the coordinates will approximately equal to the input coordinates.\n\n        The order of translation and rotation differs from make_canonical_transform\n        because the rotation from this function should be applied before the\n        translation, unlike make_canonical_transform.\n    \"\"\"\n    translation, rotation = make_canonical_transform(n_xyz, ca_xyz, c_xyz)\n    return paddle.transpose(rotation, (0, 1, 3, 2)), -translation\n\n\n#######Numpy Implementation\ndef _multiply_np(a, b):\n    return np.stack([\n        np.array([\n            a[0][0] * b[0][0] + a[0][1] * b[1][0] + a[0][2] * b[2][0],\n            a[0][0] * b[0][1] + a[0][1] * b[1][1] + a[0][2] * b[2][1],\n            a[0][0] * b[0][2] + a[0][1] * b[1][2] + a[0][2] * b[2][2]\n        ]), np.array([\n            a[1][0] * b[0][0] + a[1][1] * b[1][0] + a[1][2] * b[2][0],\n            a[1][0] * b[0][1] + a[1][1] * b[1][1] + a[1][2] * b[2][1],\n            a[1][0] * b[0][2] + a[1][1] * b[1][2] + a[1][2] * b[2][2]\n        ]), np.array([\n            a[2][0] * b[0][0] + a[2][1] * b[1][0] + a[2][2] * b[2][0],\n            a[2][0] * b[0][1] + a[2][1] * b[1][1] + a[2][2] * b[2][1],\n            a[2][0] * 
b[0][2] + a[2][1] * b[1][2] + a[2][2] * b[2][2]\n        ])\n    ])\n\n\ndef make_canonical_transform_np(\n        n_xyz: np.ndarray, ca_xyz: np.ndarray,\n        c_xyz: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:\n    \"\"\"Returns translation and rotation matrices to canonicalize residue atoms.\n\n    Note that this method does not take care of symmetries. If you provide the\n    atom positions in the non-standard way, the N atom will end up not at\n    [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You\n    need to take care of such cases in your code.\n\n    Args:\n        n_xyz: An array of shape [batch, 3] of nitrogen xyz coordinates.\n        ca_xyz: An array of shape [batch, 3] of carbon alpha xyz coordinates.\n        c_xyz: An array of shape [batch, 3] of carbon xyz coordinates.\n\n    Returns:\n        A tuple (translation, rotation) where:\n        translation is an array of shape [batch, 3] defining the translation.\n        rotation is an array of shape [batch, 3, 3] defining the rotation.\n        After applying the translation and rotation to all atoms in a residue:\n        * All atoms will be shifted so that CA is at the origin,\n        * All atoms will be rotated so that C is at the x-axis,\n        * All atoms will be shifted so that N is in the xy plane.\n    \"\"\"\n    assert len(n_xyz.shape) == 2, n_xyz.shape\n    assert n_xyz.shape[-1] == 3, n_xyz.shape\n    assert n_xyz.shape == ca_xyz.shape == c_xyz.shape, (\n        n_xyz.shape, ca_xyz.shape, c_xyz.shape)\n\n    # Place CA at the origin.\n    translation = -ca_xyz\n    n_xyz = n_xyz + translation\n    c_xyz = c_xyz + translation\n\n    # Place C on the x-axis.\n    c_x, c_y, c_z = [c_xyz[:, i] for i in range(3)]\n    # Rotate by angle c1 in the x-y plane (around the z-axis).\n    sin_c1 = -c_y / np.sqrt(1e-20 + c_x**2 + c_y**2)\n    cos_c1 = c_x / np.sqrt(1e-20 + c_x**2 + c_y**2)\n    zeros = np.zeros_like(sin_c1)\n    ones = np.ones_like(sin_c1)\n    # pylint: 
disable=bad-whitespace\n    c1_rot_matrix = np.stack([\n        np.array([cos_c1, -sin_c1, zeros]), np.array([sin_c1, cos_c1, zeros]),\n        np.array([zeros, zeros, ones])\n    ])\n\n    # Rotate by angle c2 in the x-z plane (around the y-axis).\n    sin_c2 = c_z / np.sqrt(1e-20 + c_x**2 + c_y**2 + c_z**2)\n    cos_c2 = np.sqrt(c_x**2 + c_y**2) / np.sqrt(1e-20 + c_x**2 + c_y**2 + c_z**\n                                                2)\n    c2_rot_matrix = np.stack([\n        np.array([cos_c2, zeros, sin_c2]), np.array([zeros, ones, zeros]),\n        np.array([-sin_c2, zeros, cos_c2])\n    ])\n\n    c_rot_matrix = _multiply_np(c2_rot_matrix, c1_rot_matrix)\n    n_xyz = np.stack(apply_rot_to_vec_np(c_rot_matrix, n_xyz, unstack=True)).T\n\n    # Place N in the x-y plane.\n    _, n_y, n_z = [n_xyz[:, i] for i in range(3)]\n    # Rotate by angle alpha in the y-z plane (around the x-axis).\n    sin_n = -n_z / np.sqrt(1e-20 + n_y**2 + n_z**2)\n    cos_n = n_y / np.sqrt(1e-20 + n_y**2 + n_z**2)\n    n_rot_matrix = np.stack([\n        np.array([ones, zeros, zeros]), np.array([zeros, cos_n, -sin_n]),\n        np.array([zeros, sin_n, cos_n])\n    ])\n\n    return (translation, np.transpose(\n        _multiply_np(n_rot_matrix, c_rot_matrix), [2, 0, 1]))\n\n\ndef make_transform_from_reference_np(\n        n_xyz: np.ndarray, ca_xyz: np.ndarray,\n        c_xyz: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:\n    \"\"\"Returns rotation and translation matrices to convert from reference.\n\n  Note that this method does not take care of symmetries. If you provide the\n  atom positions in the non-standard way, the N atom will end up not at\n  [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. 
You\n  need to take care of such cases in your code.\n\n  Args:\n    n_xyz: An array of shape [batch, 3] of nitrogen xyz coordinates.\n    ca_xyz: An array of shape [batch, 3] of carbon alpha xyz coordinates.\n    c_xyz: An array of shape [batch, 3] of carbon xyz coordinates.\n\n  Returns:\n    A tuple (rotation, translation) where:\n      rotation is an array of shape [batch, 3, 3] defining the rotation.\n      translation is an array of shape [batch, 3] defining the translation.\n    After applying the translation and rotation to the reference backbone,\n    the coordinates will approximately equal to the input coordinates.\n\n    The order of translation and rotation differs from make_canonical_transform\n    because the rotation from this function should be applied before the\n    translation, unlike make_canonical_transform.\n  \"\"\"\n    translation, rotation = make_canonical_transform_np(n_xyz, ca_xyz, c_xyz)\n    return np.transpose(rotation, (0, 2, 1)), -translation\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/r3.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Transformations for 3D coordinates.\n\nThis Module contains objects for representing Vectors (Vecs), Rotation Matrices\n(Rots) and proper Rigid transformation (Rigids). These are represented as\nnamed tuples with arrays for each entry, for example a set of\n[N, M] points would be represented as a Vecs object with arrays of shape [N, M]\nfor x, y and z.\n\nThis is being done to improve readability by making it very clear what objects\nare geometric objects rather than relying on comments and array shapes.\nAnother reason for this is to avoid using matrix\nmultiplication primitives like matmul or einsum, on modern accelerator hardware\nthese can end up on specialized cores such as tensor cores on GPU or the MXU on\ncloud TPUs, this often involves lower computational precision which can be\nproblematic for coordinate geometry. Also these cores are typically optimized\nfor larger matrices than 3 dimensional, this code is written to avoid any\nunintended use of these cores on both GPUs and TPUs.\n\"\"\"\nimport paddle\nimport numpy as np\nimport collections\nfrom typing import List\n\nfrom . 
import (quat_affine, )\n\n# Array of rigid 3D transformations, stored as array of rotations and\n# array of translations.\nRigids = collections.namedtuple('Rigids', ['rot', 'trans'])\n\n\nclass Vecs:\n    def __init__(self, *args):\n\n        if len(args) == 1:\n            if type(args[0]) in [list, tuple] and len(args[0]) == 3:\n                self.translation = paddle.stack(args[0], axis=-1)\n            elif len(args[0]) == 1:\n                self.translation = args[0]\n            elif args[0].shape[-1] == 3:\n                self.translation = args[0]\n            else:\n                raise ValueError('Invalid number of inputs')\n        elif len(args) == 3:\n            self.translation = paddle.stack(args, axis=-1)\n        else:\n            raise ValueError('Invalid number of inputs')\n\n    def map(self, map_fn, *args):\n        result = []\n        for i in range(3):\n            r = map_fn(self.translation[..., i], *args)\n            result.append(r)\n\n        if result[0].shape[-1] == 1:\n            return Vecs(paddle.concat(result, axis=-1))\n        else:\n            return Vecs(paddle.stack(result, axis=-1))\n\n    @property\n    def shape(self):\n        return self.translation.shape\n\n    @property\n    def x(self):\n        return self.translation[..., 0]\n\n    @property\n    def y(self):\n        return self.translation[..., 1]\n\n    @property\n    def z(self):\n        return self.translation[..., 2]\n\n    def __getitem__(self, index):\n        return Vecs(self.translation[index])\n\n    def __str__(self):\n        return str(self.translation.shape)\n\n    def __repr__(self):\n        return str(self.translation.shape)\n\n    def reshape(self, *argv):\n        return self.translation.reshape(*argv)\n\n\nclass Rots:\n    def __init__(self, *args):\n        if len(args) == 1:\n            args = args[0]\n            if len(args) == 9:\n                rots = paddle.stack(args, axis=-1)\n                self.rotation = 
rots.reshape(rots.shape[:-1] + [3, 3])\n            else:\n                if args.shape[-1] == 3 and args.shape[-2] == 3:\n                    self.rotation = args\n                elif args.shape[-1] == 9:\n                    self.rotation = args.reshape(args.shape[:-1] + [3, 3])\n                else:\n                    raise ValueError('Invalid shape of input')\n        elif len(args) == 9:\n            rots = paddle.stack(args, axis=-1)\n            self.rotation = rots.reshape(rots.shape[:-1] + [3, 3])\n        else:\n            raise ValueError('Invalid number of inputs')\n\n    def map(self, map_fn, *args):\n        result_i = []\n        for i in range(3):\n            result_j = []\n            for j in range(3):\n                r = map_fn(self.rotation[..., i, j], *args)\n                result_j.append(r)\n\n            if result_j[0].shape[-1] == 1:\n                result_i.append(paddle.concat(result_j, axis=-1))\n            else:\n                result_i.append(paddle.stack(result_j, axis=-1))\n\n        return Rots(paddle.stack(result_i, axis=-2))\n\n    @property\n    def shape(self):\n        return self.rotation.shape\n\n    @property\n    def xx(self):\n        return self.rotation[..., 0, 0]\n\n    @property\n    def xy(self):\n        return self.rotation[..., 0, 1]\n\n    @property\n    def xz(self):\n        return self.rotation[..., 0, 2]\n\n    @property\n    def yx(self):\n        return self.rotation[..., 1, 0]\n\n    @property\n    def yy(self):\n        return self.rotation[..., 1, 1]\n\n    @property\n    def yz(self):\n        return self.rotation[..., 1, 2]\n\n    @property\n    def zx(self):\n        return self.rotation[..., 2, 0]\n\n    @property\n    def zy(self):\n        return self.rotation[..., 2, 1]\n\n    @property\n    def zz(self):\n        return self.rotation[..., 2, 2]\n\n    def __getitem__(self, index):\n        return Rots(self.rotation[index])\n\n    def __str__(self):\n        return 
str(self.rotation.shape)\n\n    def __repr__(self):\n        return str(self.rotation.shape)\n\n    def reshape(self, *argv):\n        return self.rotation.reshape(*argv)\n\n\ndef squared_difference(x, y):\n    return paddle.square(x - y)\n\n\ndef invert_rigids(r: Rigids) -> Rigids:\n    \"\"\"Computes group inverse of rigid transformations 'r'.\"\"\"\n    inv_rots = invert_rots(r.rot)\n    t = rots_mul_vecs(inv_rots, r.trans)\n    inv_trans = Vecs(-1 * t.translation)\n    return Rigids(inv_rots, inv_trans)\n\n\ndef invert_rots(m: Rots) -> Rots:\n    \"\"\"Computes inverse of rotations 'm'.\"\"\"\n    return Rots(m.xx, m.yx, m.zx, m.xy, m.yy, m.zy, m.xz, m.yz, m.zz)\n\n\ndef rigids_from_3_points_vecs(\n        point_on_neg_x_axis: Vecs,\n        origin: Vecs,\n        point_on_xy_plane: Vecs, ) -> Rigids:\n    \"\"\"Create Rigids from 3 points.\n\n  Jumper et al. (2021) Suppl. Alg. 21 \"rigidFrom3Points\"\n  This creates a set of rigid transformations from 3 points by Gram Schmidt\n  orthogonalization.\n\n  Args:\n    point_on_neg_x_axis: Vecs corresponding to points on the negative x axis\n    origin: Origin of resulting rigid transformations\n    point_on_xy_plane: Vecs corresponding to points in the xy plane\n  Returns:\n    Rigid transformations from global frame to local frames derived from\n    the input points.\n  \"\"\"\n    m = rots_from_two_vecs(\n        e0_unnormalized=vecs_sub(origin, point_on_neg_x_axis),\n        e1_unnormalized=vecs_sub(point_on_xy_plane, origin))\n\n    return Rigids(rot=m, trans=origin)\n\n\ndef rigids_from_3_points(point_on_neg_x_axis: paddle.Tensor,\n                         origin: paddle.Tensor,\n                         point_on_xy_plane: paddle.Tensor,\n                         eps: float=1e-8) -> Rigids:\n    \"\"\"Create Rigids from 3 points.\n\n    Jumper et al. (2021) Suppl. Alg. 
21 \"rigidFrom3Points\"\n    This creates a set of rigid transformations from 3 points by Gram Schmidt\n    orthogonalization.\n\n    Argss:\n        point_on_neg_x_axis: [*, 3] coordinates\n        origin: [*, 3] coordinates\n        point_on_xy_plane: [*, 3] coordinates\n        eps: small regularizer added to squared norm before taking square root.\n    Returns:\n        Rigids corresponding to transformations from global frame\n        to local frames derived from the input points.\n    \"\"\"\n    point_on_neg_x_axis = paddle.unbind(point_on_neg_x_axis, axis=-1)\n    origin = paddle.unbind(origin, axis=-1)\n    point_on_xy_plane = paddle.unbind(point_on_xy_plane, axis=-1)\n\n    e0 = [c1 - c2 for c1, c2 in zip(origin, point_on_neg_x_axis)]\n    e1 = [c1 - c2 for c1, c2 in zip(point_on_xy_plane, origin)]\n\n    norms = paddle.sqrt(\n        paddle.square(e0[0]) + paddle.square(e0[1]) + paddle.square(e0[2]) +\n        eps)\n    e0 = [c / norms for c in e0]\n    dot = sum((c1 * c2 for c1, c2 in zip(e0, e1)))\n    e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)]\n    norms = paddle.sqrt(\n        paddle.square(e1[0]) + paddle.square(e1[1]) + paddle.square(e1[2]) +\n        eps)\n    e1 = [c / norms for c in e1]\n    e2 = [\n        e0[1] * e1[2] - e0[2] * e1[1],\n        e0[2] * e1[0] - e0[0] * e1[2],\n        e0[0] * e1[1] - e0[1] * e1[0],\n    ]\n\n    rots = paddle.stack([c for tup in zip(e0, e1, e2) for c in tup], axis=-1)\n\n    return Rigids(Rots(rots), Vecs(origin))\n\n\ndef rigids_from_list(l: List[paddle.Tensor]) -> Rigids:\n    \"\"\"Converts flat list of arrays to rigid transformations.\"\"\"\n    assert len(l) == 12\n    return Rigids(Rots(*(l[:9])), Vecs(*(l[9:])))\n\n\ndef rigids_from_quataffine(a: quat_affine.QuatAffine) -> Rigids:\n    \"\"\"Converts QuatAffine object to the corresponding Rigids object.\"\"\"\n    return Rigids(Rots(a.rotation), Vecs(a.translation))\n\n\ndef rigids_from_tensor4x4(m: paddle.Tensor) -> Rigids:\n    \"\"\"Construct 
Rigids from an 4x4 array.\n\n    Here the 4x4 is representing the transformation in homogeneous coordinates.\n\n    Argss:\n        m: [*, 4, 4] homogenous transformation tensor\n    Returns:\n        Rigids corresponding to transformations m\n    \"\"\"\n    assert m.shape[-1] == 4\n    assert m.shape[-2] == 4\n    sliced_m = m[..., 0:3, :]  # shape is [..., 3, 4]\n    outs = paddle.split(sliced_m, num_or_sections=[3, 1], axis=-1)\n    return Rigids(Rots(outs[0]), Vecs(outs[1].squeeze_(axis=-1)))\n\n\ndef rigids_from_tensor_flat9(m: paddle.Tensor) -> Rigids:\n    \"\"\"Flat9 encoding: first two columns of rotation matrix + translation.\"\"\"\n    assert m.shape[-1] == 9\n    e0 = Vecs(m[..., 0], m[..., 1], m[..., 2])\n    e1 = Vecs(m[..., 3], m[..., 4], m[..., 5])\n    trans = Vecs(m[..., 6], m[..., 7], m[..., 8])\n    return Rigids(rot=rots_from_two_vecs(e0, e1), trans=trans)\n\n\ndef rigids_from_tensor_flat12(m: paddle.Tensor  # shape (..., 12)\n                              ) -> Rigids:  # shape (...)\n    \"\"\"Flat12 encoding: rotation matrix (9 floats) + translation (3 floats).\"\"\"\n    assert m.shape[-1] == 12\n    return Rigids(Rots(m[..., :9]), Vecs(m[..., 9:]))\n\n\ndef rigids_mul_rigids(a: Rigids, b: Rigids) -> Rigids:\n    \"\"\"Group composition of Rigids 'a' and 'b'.\"\"\"\n    return Rigids(\n        rots_mul_rots(a.rot, b.rot),\n        vecs_add(a.trans, rots_mul_vecs(a.rot, b.trans)))\n\n\ndef rigids_mul_rots(r: Rigids, m: Rots) -> Rigids:\n    \"\"\"Compose rigid transformations 'r' with rotations 'm'.\"\"\"\n    return Rigids(rots_mul_rots(r.rot, m), r.trans)\n\n\ndef rigids_mul_vecs(r: Rigids, v: Vecs) -> Vecs:\n    \"\"\"Apply rigid transforms 'r' to points 'v'.\"\"\"\n    return vecs_add(rots_mul_vecs(r.rot, v), r.trans)\n\n\ndef rigids_to_list(r: Rigids) -> List[paddle.Tensor]:\n    \"\"\"Turn Rigids into flat list, inverse of 'rigids_from_list'.\"\"\"\n    return list(r.rot) + list(r.trans)\n\n\ndef rigids_to_quataffine(r: Rigids) -> 
quat_affine.QuatAffine:\n    \"\"\"Convert Rigids r into QuatAffine, inverse of 'rigids_from_quataffine'.\"\"\"\n    return quat_affine.QuatAffine(\n        quaternion=None,\n        rotation=r.rot.rotation,\n        translation=r.trans.translation)\n\n\ndef rigids_to_tensor_flat9(r: Rigids) -> paddle.Tensor:  # shape (..., 9)\n    \"\"\"Flat9 encoding: first two columns of rotation matrix + translation.\"\"\"\n    return paddle.stack(\n        [r.rot.xx, r.rot.yx, r.rot.zx, r.rot.xy, r.rot.yy, r.rot.zy] +\n        list(r.trans),\n        axis=-1)\n\n\ndef rigids_to_tensor_flat12(r: Rigids  # shape (...)\n                            ) -> paddle.Tensor:  # shape (..., 12)\n    \"\"\"Flat12 encoding: rotation matrix (9 floats) + translation (3 floats).\"\"\"\n    return paddle.stack(\n        [\n            r.rot.xx, r.rot.yx, r.rot.zx, r.rot.xy, r.rot.yy, r.rot.zy,\n            r.rot.xz, r.rot.yz, r.rot.zz\n        ] + [r.trans.x, r.trans.y, r.trans.z],\n        axis=-1)\n\n\ndef rots_from_tensor3x3(\n        m: paddle.Tensor,  # shape (..., 3, 3)\n) -> Rots:  # shape (...)\n    \"\"\"Convert rotations represented as (3, 3) array to Rots.\"\"\"\n    assert m.shape[-1] == 3\n    assert m.shape[-2] == 3\n    return Rots(m)\n\n\ndef rots_from_two_vecs(e0_unnormalized: Vecs, e1_unnormalized: Vecs) -> Rots:\n    \"\"\"Create rotation matrices from unnormalized vectors for the x and y-axes.\n\n    This creates a rotation matrix from two vectors using Gram-Schmidt\n    orthogonalization.\n\n    Args:\n        e0_unnormalized: vectors lying along x-axis of resulting rotation\n        e1_unnormalized: vectors lying in xy-plane of resulting rotation\n    Returns:\n        Rotations resulting from Gram-Schmidt procedure.\n    \"\"\"\n    # Normalize the unit vector for the x-axis, e0.\n    e0 = vecs_robust_normalize(e0_unnormalized)\n\n    # make e1 perpendicular to e0.\n    c = vecs_dot_vecs(e1_unnormalized, e0)\n    e1 = Vecs(e1_unnormalized.translation - 
c.unsqueeze_(axis=-1) *\n              e0.translation)\n    e1 = vecs_robust_normalize(e1)\n\n    # Compute e2 as cross product of e0 and e1.\n    e2 = vecs_cross_vecs(e0, e1)\n\n    return Rots(\n        paddle.stack(\n            [e0.translation, e1.translation, e2.translation], axis=-1))\n\n\ndef broadcast_shape(x_shape, y_shape):\n    if x_shape == y_shape or len(x_shape) > len(y_shape):\n        out_shape = x_shape\n    elif len(y_shape) > len(x_shape):\n        out_shape = y_shape\n    else:\n        out_shape = []\n        for i in range(len(x_shape)):\n            if x_shape[i] == y_shape[i] or y_shape[i] == 1:\n                out_shape.append(x_shape[i])\n            elif x_shape[i] == 1:\n                out_shape.append(y_shape[i])\n            else:\n                raise ValueError(\"{} and {} cannot braodcast.\".format(x_shape,\n                                                                      y_shape))\n    return out_shape\n\n\ndef broadcast_to(x, broadcast_shape):\n    if x.shape == broadcast_shape:\n        return x\n    else:\n        return paddle.broadcast_to(x, broadcast_shape)\n\n\ndef rots_mul_rots(a: Rots, b: Rots) -> Rots:\n    \"\"\"Composition of rotations 'a' and 'b'.\"\"\"\n    out_shape = broadcast_shape(a.shape, b.shape)\n    broadcasted_a = broadcast_to(a.rotation, out_shape)\n    broadcasted_b = broadcast_to(b.rotation, out_shape)\n    return Rots(paddle.matmul(broadcasted_a, broadcasted_b))\n\n\ndef rots_mul_vecs(m: Rots, v: Vecs) -> Vecs:\n    \"\"\"Apply rotations 'm' to vectors 'v'.\"\"\"\n    if m.shape[:-2] == v.shape[:-1]:\n        broadcasted_m = m.rotation\n        broadcasted_v = v.translation\n    else:\n        out_shape = broadcast_shape(m.shape[:-2], v.shape[:-1])\n        broadcasted_m = broadcast_to(m.rotation, out_shape + [3, 3])\n        broadcasted_v = broadcast_to(v.translation, out_shape + [3])\n    return Vecs(\n        paddle.matmul(\n            broadcasted_m, 
broadcasted_v.unsqueeze(axis=-1)).squeeze_(axis=-1))\n\n\ndef vecs_add(v1: Vecs, v2: Vecs) -> Vecs:\n    \"\"\"Add two vectors 'v1' and 'v2'.\"\"\"\n    return Vecs(v1.translation + v2.translation)\n\n\ndef vecs_dot_vecs(v1: Vecs, v2: Vecs) -> paddle.Tensor:\n    \"\"\"Dot product of vectors 'v1' and 'v2'.\"\"\"\n    return v1.x * v2.x + v1.y * v2.y + v1.z * v2.z\n\n\ndef vecs_cross_vecs(v1: Vecs, v2: Vecs) -> Vecs:\n    \"\"\"Cross product of vectors 'v1' and 'v2'.\"\"\"\n    return Vecs(paddle.cross(v1.translation, v2.translation, axis=-1))\n\n\ndef vecs_from_tensor(x: paddle.Tensor  # shape (..., 3)\n                     ) -> Vecs:  # shape (...)\n    \"\"\"Converts from tensor of shape (3,) to Vecs.\"\"\"\n    assert x.shape[-1] == 3\n    return Vecs(x)\n\n\ndef vecs_robust_normalize(v: Vecs, epsilon: float=1e-8) -> Vecs:\n    \"\"\"Normalizes vectors 'v'.\n\n    Argss:\n        v: vectors to be normalized.\n        epsilon: small regularizer added to squared norm before taking square root.\n    Returns:\n        normalized vectors\n    \"\"\"\n    norms = vecs_robust_norm(v, epsilon)\n    return Vecs(v.translation / norms.unsqueeze_(axis=-1))\n\n\ndef vecs_robust_norm(v: Vecs, epsilon: float=1e-8) -> paddle.Tensor:\n    \"\"\"Computes norm of vectors 'v'.\n\n    Args:\n        v: vectors to be normalized.\n        epsilon: small regularizer added to squared norm before taking square root.\n    Returns:\n        norm of 'v'\n    \"\"\"\n    return paddle.sqrt(\n        paddle.square(v.x) + paddle.square(v.y) + paddle.square(v.z) + epsilon)\n\n\ndef vecs_sub(v1: Vecs, v2: Vecs) -> Vecs:\n    \"\"\"Computes v1 - v2.\"\"\"\n    return Vecs(v1.translation - v2.translation)\n\n\ndef vecs_squared_distance(v1: Vecs, v2: Vecs) -> paddle.Tensor:\n    \"\"\"Computes squared euclidean difference between 'v1' and 'v2'.\"\"\"\n    return (squared_difference(v1.x, v2.x) + squared_difference(v1.y, v2.y) +\n            squared_difference(v1.z, v2.z))\n\n\ndef vecs_to_tensor(v: 
Vecs  # shape (...)\n                   ) -> paddle.Tensor:  # shape(..., 3)\n    \"\"\"Converts 'v' to tensor with shape 3, inverse of 'vecs_from_tensor'.\"\"\"\n    return v.translation\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/residue_constants.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Constants used in AlphaFold.\"\"\"\n\nimport collections\nimport functools\nimport os\nfrom typing import List, Mapping, Tuple\n\nimport numpy as np\nimport tree\n\n# Internal import (35fd).\n\n# Distance from one CA to next CA [trans configuration: omega = 180].\nca_ca = 3.80209737096\n\n# Format: The list for each AA type contains chi1, chi2, chi3, chi4 in\n# this order (or a relevant subset from chi1 onwards). 
ALA and GLY don't have\n# chi angles so their chi angle lists are empty.\nchi_angles_atoms = {\n    'ALA': [],\n    # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.\n    'ARG': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],\n            ['CB', 'CG', 'CD', 'NE'], ['CG', 'CD', 'NE', 'CZ']],\n    'ASN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],\n    'ASP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],\n    'CYS': [['N', 'CA', 'CB', 'SG']],\n    'GLN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],\n            ['CB', 'CG', 'CD', 'OE1']],\n    'GLU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],\n            ['CB', 'CG', 'CD', 'OE1']],\n    'GLY': [],\n    'HIS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'ND1']],\n    'ILE': [['N', 'CA', 'CB', 'CG1'], ['CA', 'CB', 'CG1', 'CD1']],\n    'LEU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],\n    'LYS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],\n            ['CB', 'CG', 'CD', 'CE'], ['CG', 'CD', 'CE', 'NZ']],\n    'MET': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'SD'],\n            ['CB', 'CG', 'SD', 'CE']],\n    'PHE': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],\n    'PRO': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD']],\n    'SER': [['N', 'CA', 'CB', 'OG']],\n    'THR': [['N', 'CA', 'CB', 'OG1']],\n    'TRP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],\n    'TYR': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],\n    'VAL': [['N', 'CA', 'CB', 'CG1']],\n}\n\n# If chi angles given in fixed-length array, this matrix determines how to mask\n# them for each AA type. 
The order is as per restype_order (see below).\nchi_angles_mask = [\n    [0.0, 0.0, 0.0, 0.0],  # ALA\n    [1.0, 1.0, 1.0, 1.0],  # ARG\n    [1.0, 1.0, 0.0, 0.0],  # ASN\n    [1.0, 1.0, 0.0, 0.0],  # ASP\n    [1.0, 0.0, 0.0, 0.0],  # CYS\n    [1.0, 1.0, 1.0, 0.0],  # GLN\n    [1.0, 1.0, 1.0, 0.0],  # GLU\n    [0.0, 0.0, 0.0, 0.0],  # GLY\n    [1.0, 1.0, 0.0, 0.0],  # HIS\n    [1.0, 1.0, 0.0, 0.0],  # ILE\n    [1.0, 1.0, 0.0, 0.0],  # LEU\n    [1.0, 1.0, 1.0, 1.0],  # LYS\n    [1.0, 1.0, 1.0, 0.0],  # MET\n    [1.0, 1.0, 0.0, 0.0],  # PHE\n    [1.0, 1.0, 0.0, 0.0],  # PRO\n    [1.0, 0.0, 0.0, 0.0],  # SER\n    [1.0, 0.0, 0.0, 0.0],  # THR\n    [1.0, 1.0, 0.0, 0.0],  # TRP\n    [1.0, 1.0, 0.0, 0.0],  # TYR\n    [1.0, 0.0, 0.0, 0.0],  # VAL\n]\n\n# The following chi angles are pi periodic: they can be rotated by a multiple\n# of pi without affecting the structure.\nchi_pi_periodic = [\n    [0.0, 0.0, 0.0, 0.0],  # ALA\n    [0.0, 0.0, 0.0, 0.0],  # ARG\n    [0.0, 0.0, 0.0, 0.0],  # ASN\n    [0.0, 1.0, 0.0, 0.0],  # ASP\n    [0.0, 0.0, 0.0, 0.0],  # CYS\n    [0.0, 0.0, 0.0, 0.0],  # GLN\n    [0.0, 0.0, 1.0, 0.0],  # GLU\n    [0.0, 0.0, 0.0, 0.0],  # GLY\n    [0.0, 0.0, 0.0, 0.0],  # HIS\n    [0.0, 0.0, 0.0, 0.0],  # ILE\n    [0.0, 0.0, 0.0, 0.0],  # LEU\n    [0.0, 0.0, 0.0, 0.0],  # LYS\n    [0.0, 0.0, 0.0, 0.0],  # MET\n    [0.0, 1.0, 0.0, 0.0],  # PHE\n    [0.0, 0.0, 0.0, 0.0],  # PRO\n    [0.0, 0.0, 0.0, 0.0],  # SER\n    [0.0, 0.0, 0.0, 0.0],  # THR\n    [0.0, 0.0, 0.0, 0.0],  # TRP\n    [0.0, 1.0, 0.0, 0.0],  # TYR\n    [0.0, 0.0, 0.0, 0.0],  # VAL\n    [0.0, 0.0, 0.0, 0.0],  # UNK\n]\n\n# Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi,\n# psi and chi angles:\n# 0: 'backbone group',\n# 1: 'pre-omega-group', (empty)\n# 2: 'phi-group', (currently empty, because it defines only hydrogens)\n# 3: 'psi-group',\n# 4,5,6,7: 'chi1,2,3,4-group'\n# The atom positions are relative to the axis-end-atom of the corresponding\n# rotation axis. 
The x-axis is in direction of the rotation axis, and the y-axis\n# is defined such that the dihedral-angle-definiting atom (the last entry in\n# chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).\n# format: [atomname, group_idx, rel_position]\nrigid_group_atom_positions = {\n    'ALA': [\n        ['N', 0, (-0.525, 1.363, 0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.526, -0.000, -0.000)],\n        ['CB', 0, (-0.529, -0.774, -1.205)],\n        ['O', 3, (0.627, 1.062, 0.000)],\n    ],\n    'ARG': [\n        ['N', 0, (-0.524, 1.362, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.525, -0.000, -0.000)],\n        ['CB', 0, (-0.524, -0.778, -1.209)],\n        ['O', 3, (0.626, 1.062, 0.000)],\n        ['CG', 4, (0.616, 1.390, -0.000)],\n        ['CD', 5, (0.564, 1.414, 0.000)],\n        ['NE', 6, (0.539, 1.357, -0.000)],\n        ['NH1', 7, (0.206, 2.301, 0.000)],\n        ['NH2', 7, (2.078, 0.978, -0.000)],\n        ['CZ', 7, (0.758, 1.093, -0.000)],\n    ],\n    'ASN': [\n        ['N', 0, (-0.536, 1.357, 0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.526, -0.000, -0.000)],\n        ['CB', 0, (-0.531, -0.787, -1.200)],\n        ['O', 3, (0.625, 1.062, 0.000)],\n        ['CG', 4, (0.584, 1.399, 0.000)],\n        ['ND2', 5, (0.593, -1.188, 0.001)],\n        ['OD1', 5, (0.633, 1.059, 0.000)],\n    ],\n    'ASP': [\n        ['N', 0, (-0.525, 1.362, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.527, 0.000, -0.000)],\n        ['CB', 0, (-0.526, -0.778, -1.208)],\n        ['O', 3, (0.626, 1.062, -0.000)],\n        ['CG', 4, (0.593, 1.398, -0.000)],\n        ['OD1', 5, (0.610, 1.091, 0.000)],\n        ['OD2', 5, (0.592, -1.101, -0.003)],\n    ],\n    'CYS': [\n        ['N', 0, (-0.522, 1.362, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.524, 0.000, 0.000)],\n        ['CB', 0, (-0.519, -0.773, -1.212)],\n        ['O', 3, 
(0.625, 1.062, -0.000)],\n        ['SG', 4, (0.728, 1.653, 0.000)],\n    ],\n    'GLN': [\n        ['N', 0, (-0.526, 1.361, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.526, 0.000, 0.000)],\n        ['CB', 0, (-0.525, -0.779, -1.207)],\n        ['O', 3, (0.626, 1.062, -0.000)],\n        ['CG', 4, (0.615, 1.393, 0.000)],\n        ['CD', 5, (0.587, 1.399, -0.000)],\n        ['NE2', 6, (0.593, -1.189, -0.001)],\n        ['OE1', 6, (0.634, 1.060, 0.000)],\n    ],\n    'GLU': [\n        ['N', 0, (-0.528, 1.361, 0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.526, -0.000, -0.000)],\n        ['CB', 0, (-0.526, -0.781, -1.207)],\n        ['O', 3, (0.626, 1.062, 0.000)],\n        ['CG', 4, (0.615, 1.392, 0.000)],\n        ['CD', 5, (0.600, 1.397, 0.000)],\n        ['OE1', 6, (0.607, 1.095, -0.000)],\n        ['OE2', 6, (0.589, -1.104, -0.001)],\n    ],\n    'GLY': [\n        ['N', 0, (-0.572, 1.337, 0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.517, -0.000, -0.000)],\n        ['O', 3, (0.626, 1.062, -0.000)],\n    ],\n    'HIS': [\n        ['N', 0, (-0.527, 1.360, 0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.525, 0.000, 0.000)],\n        ['CB', 0, (-0.525, -0.778, -1.208)],\n        ['O', 3, (0.625, 1.063, 0.000)],\n        ['CG', 4, (0.600, 1.370, -0.000)],\n        ['CD2', 5, (0.889, -1.021, 0.003)],\n        ['ND1', 5, (0.744, 1.160, -0.000)],\n        ['CE1', 5, (2.030, 0.851, 0.002)],\n        ['NE2', 5, (2.145, -0.466, 0.004)],\n    ],\n    'ILE': [\n        ['N', 0, (-0.493, 1.373, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.527, -0.000, -0.000)],\n        ['CB', 0, (-0.536, -0.793, -1.213)],\n        ['O', 3, (0.627, 1.062, -0.000)],\n        ['CG1', 4, (0.534, 1.437, -0.000)],\n        ['CG2', 4, (0.540, -0.785, -1.199)],\n        ['CD1', 5, (0.619, 1.391, 0.000)],\n    ],\n    'LEU': [\n        ['N', 0, (-0.520, 1.363, 
0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.525, -0.000, -0.000)],\n        ['CB', 0, (-0.522, -0.773, -1.214)],\n        ['O', 3, (0.625, 1.063, -0.000)],\n        ['CG', 4, (0.678, 1.371, 0.000)],\n        ['CD1', 5, (0.530, 1.430, -0.000)],\n        ['CD2', 5, (0.535, -0.774, 1.200)],\n    ],\n    'LYS': [\n        ['N', 0, (-0.526, 1.362, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.526, 0.000, 0.000)],\n        ['CB', 0, (-0.524, -0.778, -1.208)],\n        ['O', 3, (0.626, 1.062, -0.000)],\n        ['CG', 4, (0.619, 1.390, 0.000)],\n        ['CD', 5, (0.559, 1.417, 0.000)],\n        ['CE', 6, (0.560, 1.416, 0.000)],\n        ['NZ', 7, (0.554, 1.387, 0.000)],\n    ],\n    'MET': [\n        ['N', 0, (-0.521, 1.364, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.525, 0.000, 0.000)],\n        ['CB', 0, (-0.523, -0.776, -1.210)],\n        ['O', 3, (0.625, 1.062, -0.000)],\n        ['CG', 4, (0.613, 1.391, -0.000)],\n        ['SD', 5, (0.703, 1.695, 0.000)],\n        ['CE', 6, (0.320, 1.786, -0.000)],\n    ],\n    'PHE': [\n        ['N', 0, (-0.518, 1.363, 0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.524, 0.000, -0.000)],\n        ['CB', 0, (-0.525, -0.776, -1.212)],\n        ['O', 3, (0.626, 1.062, -0.000)],\n        ['CG', 4, (0.607, 1.377, 0.000)],\n        ['CD1', 5, (0.709, 1.195, -0.000)],\n        ['CD2', 5, (0.706, -1.196, 0.000)],\n        ['CE1', 5, (2.102, 1.198, -0.000)],\n        ['CE2', 5, (2.098, -1.201, -0.000)],\n        ['CZ', 5, (2.794, -0.003, -0.001)],\n    ],\n    'PRO': [\n        ['N', 0, (-0.566, 1.351, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.527, -0.000, 0.000)],\n        ['CB', 0, (-0.546, -0.611, -1.293)],\n        ['O', 3, (0.621, 1.066, 0.000)],\n        ['CG', 4, (0.382, 1.445, 0.0)],\n        # ['CD', 5, (0.427, 1.440, 0.0)],\n        ['CD', 5, (0.477, 1.424, 0.0)],  # manually made angle 2 
degrees larger\n    ],\n    'SER': [\n        ['N', 0, (-0.529, 1.360, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.525, -0.000, -0.000)],\n        ['CB', 0, (-0.518, -0.777, -1.211)],\n        ['O', 3, (0.626, 1.062, -0.000)],\n        ['OG', 4, (0.503, 1.325, 0.000)],\n    ],\n    'THR': [\n        ['N', 0, (-0.517, 1.364, 0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.526, 0.000, -0.000)],\n        ['CB', 0, (-0.516, -0.793, -1.215)],\n        ['O', 3, (0.626, 1.062, 0.000)],\n        ['CG2', 4, (0.550, -0.718, -1.228)],\n        ['OG1', 4, (0.472, 1.353, 0.000)],\n    ],\n    'TRP': [\n        ['N', 0, (-0.521, 1.363, 0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.525, -0.000, 0.000)],\n        ['CB', 0, (-0.523, -0.776, -1.212)],\n        ['O', 3, (0.627, 1.062, 0.000)],\n        ['CG', 4, (0.609, 1.370, -0.000)],\n        ['CD1', 5, (0.824, 1.091, 0.000)],\n        ['CD2', 5, (0.854, -1.148, -0.005)],\n        ['CE2', 5, (2.186, -0.678, -0.007)],\n        ['CE3', 5, (0.622, -2.530, -0.007)],\n        ['NE1', 5, (2.140, 0.690, -0.004)],\n        ['CH2', 5, (3.028, -2.890, -0.013)],\n        ['CZ2', 5, (3.283, -1.543, -0.011)],\n        ['CZ3', 5, (1.715, -3.389, -0.011)],\n    ],\n    'TYR': [\n        ['N', 0, (-0.522, 1.362, 0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.524, -0.000, -0.000)],\n        ['CB', 0, (-0.522, -0.776, -1.213)],\n        ['O', 3, (0.627, 1.062, -0.000)],\n        ['CG', 4, (0.607, 1.382, -0.000)],\n        ['CD1', 5, (0.716, 1.195, -0.000)],\n        ['CD2', 5, (0.713, -1.194, -0.001)],\n        ['CE1', 5, (2.107, 1.200, -0.002)],\n        ['CE2', 5, (2.104, -1.201, -0.003)],\n        ['OH', 5, (4.168, -0.002, -0.005)],\n        ['CZ', 5, (2.791, -0.001, -0.003)],\n    ],\n    'VAL': [\n        ['N', 0, (-0.494, 1.373, -0.000)],\n        ['CA', 0, (0.000, 0.000, 0.000)],\n        ['C', 0, (1.527, -0.000, -0.000)],\n        
['CB', 0, (-0.533, -0.795, -1.213)],\n        ['O', 3, (0.627, 1.062, -0.000)],\n        ['CG1', 4, (0.540, 1.429, -0.000)],\n        ['CG2', 4, (0.533, -0.776, 1.203)],\n    ],\n}\n\n# A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.\nresidue_atoms = {\n    'ALA': ['C', 'CA', 'CB', 'N', 'O'],\n    'ARG': ['C', 'CA', 'CB', 'CG', 'CD', 'CZ', 'N', 'NE', 'O', 'NH1', 'NH2'],\n    'ASP': ['C', 'CA', 'CB', 'CG', 'N', 'O', 'OD1', 'OD2'],\n    'ASN': ['C', 'CA', 'CB', 'CG', 'N', 'ND2', 'O', 'OD1'],\n    'CYS': ['C', 'CA', 'CB', 'N', 'O', 'SG'],\n    'GLU': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O', 'OE1', 'OE2'],\n    'GLN': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'NE2', 'O', 'OE1'],\n    'GLY': ['C', 'CA', 'N', 'O'],\n    'HIS': ['C', 'CA', 'CB', 'CG', 'CD2', 'CE1', 'N', 'ND1', 'NE2', 'O'],\n    'ILE': ['C', 'CA', 'CB', 'CG1', 'CG2', 'CD1', 'N', 'O'],\n    'LEU': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'N', 'O'],\n    'LYS': ['C', 'CA', 'CB', 'CG', 'CD', 'CE', 'N', 'NZ', 'O'],\n    'MET': ['C', 'CA', 'CB', 'CG', 'CE', 'N', 'O', 'SD'],\n    'PHE':\n    ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O'],\n    'PRO': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O'],\n    'SER': ['C', 'CA', 'CB', 'N', 'O', 'OG'],\n    'THR': ['C', 'CA', 'CB', 'CG2', 'N', 'O', 'OG1'],\n    'TRP': [\n        'C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2',\n        'N', 'NE1', 'O'\n    ],\n    'TYR':\n    ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O', 'OH'],\n    'VAL': ['C', 'CA', 'CB', 'CG1', 'CG2', 'N', 'O']\n}\n\n# Naming swaps for ambiguous atom names.\n# Due to symmetries in the amino acids the naming of atoms is ambiguous in\n# 4 of the 20 amino acids.\n# (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities\n# in LEU, VAL and ARG can be resolved by using the 3d constellations of\n# the 'ambiguous' atoms and their neighbours)\nresidue_atom_renaming_swaps = {\n    'ASP': {\n        'OD1': 'OD2'\n 
   },\n    'GLU': {\n        'OE1': 'OE2'\n    },\n    'PHE': {\n        'CD1': 'CD2',\n        'CE1': 'CE2'\n    },\n    'TYR': {\n        'CD1': 'CD2',\n        'CE1': 'CE2'\n    },\n}\n\n# Van der Waals radii [Angstroem] of the atoms (from Wikipedia)\nvan_der_waals_radius = {\n    'C': 1.7,\n    'N': 1.55,\n    'O': 1.52,\n    'S': 1.8,\n}\n\nBond = collections.namedtuple(\n    'Bond', ['atom1_name', 'atom2_name', 'length', 'stddev'])\nBondAngle = collections.namedtuple(\n    'BondAngle',\n    ['atom1_name', 'atom2_name', 'atom3name', 'angle_rad', 'stddev'])\n\n\n@functools.lru_cache(maxsize=None)\ndef load_stereo_chemical_props() -> Tuple[Mapping[str, List[Bond]], Mapping[\n        str, List[Bond]], Mapping[str, List[BondAngle]]]:\n    \"\"\"Load stereo_chemical_props.txt into a nice structure.\n\n  Load literature values for bond lengths and bond angles and translate\n  bond angles into the length of the opposite edge of the triangle\n  (\"residue_virtual_bonds\").\n\n  Returns:\n    residue_bonds: Dict that maps resname -> list of Bond tuples.\n    residue_virtual_bonds: Dict that maps resname -> list of Bond tuples.\n    residue_bond_angles: Dict that maps resname -> list of BondAngle tuples.\n  \"\"\"\n    stereo_chemical_props_path = os.path.join(\n        os.path.dirname(os.path.abspath(__file__)),\n        'stereo_chemical_props.txt')\n    with open(stereo_chemical_props_path, 'rt') as f:\n        stereo_chemical_props = f.read()\n    lines_iter = iter(stereo_chemical_props.splitlines())\n    # Load bond lengths.\n    residue_bonds = {}\n    next(lines_iter)  # Skip header line.\n    for line in lines_iter:\n        if line.strip() == '-':\n            break\n        bond, resname, length, stddev = line.split()\n        atom1, atom2 = bond.split('-')\n        if resname not in residue_bonds:\n            residue_bonds[resname] = []\n        residue_bonds[resname].append(\n            Bond(atom1, atom2, float(length), float(stddev)))\n    
residue_bonds['UNK'] = []\n\n    # Load bond angles.\n    residue_bond_angles = {}\n    next(lines_iter)  # Skip empty line.\n    next(lines_iter)  # Skip header line.\n    for line in lines_iter:\n        if line.strip() == '-':\n            break\n        bond, resname, angle_degree, stddev_degree = line.split()\n        atom1, atom2, atom3 = bond.split('-')\n        if resname not in residue_bond_angles:\n            residue_bond_angles[resname] = []\n        residue_bond_angles[resname].append(\n            BondAngle(atom1, atom2, atom3,\n                      float(angle_degree) / 180. * np.pi,\n                      float(stddev_degree) / 180. * np.pi))\n    residue_bond_angles['UNK'] = []\n\n    def make_bond_key(atom1_name, atom2_name):\n        \"\"\"Unique key to lookup bonds.\"\"\"\n        return '-'.join(sorted([atom1_name, atom2_name]))\n\n    # Translate bond angles into distances (\"virtual bonds\").\n    residue_virtual_bonds = {}\n    for resname, bond_angles in residue_bond_angles.items():\n        # Create a fast lookup dict for bond lengths.\n        bond_cache = {}\n        for b in residue_bonds[resname]:\n            bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b\n        residue_virtual_bonds[resname] = []\n        for ba in bond_angles:\n            bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)]\n            bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)]\n\n            # Compute distance between atom1 and atom3 using the law of cosines\n            # c^2 = a^2 + b^2 - 2ab*cos(gamma).\n            gamma = ba.angle_rad\n            length = np.sqrt(bond1.length**2 + bond2.length**2 - 2 *\n                             bond1.length * bond2.length * np.cos(gamma))\n\n            # Propagation of uncertainty assuming uncorrelated errors.\n            dl_outer = 0.5 / length\n            dl_dgamma = (2 * bond1.length * bond2.length *\n                         np.sin(gamma)) * dl_outer\n            
dl_db1 = (\n                2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer\n            dl_db2 = (\n                2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer\n            stddev = np.sqrt((dl_dgamma * ba.stddev)**2 + (\n                dl_db1 * bond1.stddev)**2 + (dl_db2 * bond2.stddev)**2)\n            residue_virtual_bonds[resname].append(\n                Bond(ba.atom1_name, ba.atom3name, length, stddev))\n\n    return (residue_bonds, residue_virtual_bonds, residue_bond_angles)\n\n\n# Between-residue bond lengths for general bonds (first element) and for Proline\n# (second element).\nbetween_res_bond_length_c_n = [1.329, 1.341]\nbetween_res_bond_length_stddev_c_n = [0.014, 0.016]\n\n# Between-residue cos_angles.\nbetween_res_cos_angles_c_n_ca = [-0.5203, 0.0353]  # degrees: 121.352 +- 2.315\nbetween_res_cos_angles_ca_c_n = [-0.4473, 0.0311]  # degrees: 116.568 +- 1.995\n\n# This mapping is used when we need to store atom data in a format that requires\n# fixed atom data size for every residue (e.g. 
a numpy array).\natom_types = [\n    'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD',\n    'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3',\n    'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2',\n    'CZ3', 'NZ', 'OXT'\n]\natom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}\natom_type_num = len(atom_types)  # := 37.\n\n# A compact atom encoding with 14 columns\n# pylint: disable=line-too-long\n# pylint: disable=bad-whitespace\nrestype_name_to_atom14_names = {\n    'ALA': ['N', 'CA', 'C', 'O', 'CB', '', '', '', '', '', '', '', '', ''],\n    'ARG': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2', '',\n        '', ''\n    ],\n    'ASN':\n    ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'ND2', '', '', '', '', '', ''],\n    'ASP':\n    ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'OD2', '', '', '', '', '', ''],\n    'CYS': ['N', 'CA', 'C', 'O', 'CB', 'SG', '', '', '', '', '', '', '', ''],\n    'GLN':\n    ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'NE2', '', '', '', '', ''],\n    'GLU': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'OE2', '', '', '', '', ''\n    ],\n    'GLY': ['N', 'CA', 'C', 'O', '', '', '', '', '', '', '', '', '', ''],\n    'HIS': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2', '', '',\n        '', ''\n    ],\n    'ILE': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1', '', '', '', '', '', ''\n    ],\n    'LEU': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', '', '', '', '', '', ''\n    ],\n    'LYS': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', '', '', '', '', ''\n    ],\n    'MET': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'SD', 'CE', '', '', '', '', '', ''\n    ],\n    'PHE': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', '',\n        '', ''\n    ],\n    'PRO': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', '', '', '', '', '', '', 
''\n    ],\n    'SER': ['N', 'CA', 'C', 'O', 'CB', 'OG', '', '', '', '', '', '', '', ''],\n    'THR': [\n        'N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2', '', '', '', '', '', '', ''\n    ],\n    'TRP': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3',\n        'CZ2', 'CZ3', 'CH2'\n    ],\n    'TYR': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ',\n        'OH', '', ''\n    ],\n    'VAL': [\n        'N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '', '', '', '', '', '', ''\n    ],\n    'UNK': ['', '', '', '', '', '', '', '', '', '', '', '', '', ''],\n}\n# pylint: enable=line-too-long\n# pylint: enable=bad-whitespace\n\n# This is the standard residue order when coding AA type as a number.\n# Reproduce it by taking 3-letter AA codes and sorting them alphabetically.\nrestypes = [\n    'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P',\n    'S', 'T', 'W', 'Y', 'V'\n]\nrestype_order = {restype: i for i, restype in enumerate(restypes)}\nrestype_num = len(restypes)  # := 20.\nunk_restype_index = restype_num  # Catch-all index for unknown restypes.\n\nrestypes_with_x = restypes + ['X']\nrestype_order_with_x = {\n    restype: i\n    for i, restype in enumerate(restypes_with_x)\n}\n\n\ndef sequence_to_onehot(sequence: str,\n                       mapping: Mapping[str, int],\n                       map_unknown_to_x: bool=False) -> np.ndarray:\n    \"\"\"Maps the given sequence into a one-hot encoded matrix.\n\n  Args:\n    sequence: An amino acid sequence.\n    mapping: A dictionary mapping amino acids to integers.\n    map_unknown_to_x: If True, any amino acid that is not in the mapping will be\n      mapped to the unknown amino acid 'X'. If the mapping doesn't contain\n      amino acid 'X', an error will be thrown. 
If False, any amino acid not in\n      the mapping will throw an error.\n\n  Returns:\n    A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of\n    the sequence.\n\n  Raises:\n    ValueError: If the mapping doesn't contain values from 0 to\n      num_unique_aas - 1 without any gaps.\n  \"\"\"\n    num_entries = max(mapping.values()) + 1\n\n    if sorted(set(mapping.values())) != list(range(num_entries)):\n        raise ValueError(\n            'The mapping must have values from 0 to num_unique_aas-1 '\n            'without any gaps. Got: %s' % sorted(mapping.values()))\n\n    one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32)\n\n    for aa_index, aa_type in enumerate(sequence):\n        if map_unknown_to_x:\n            if aa_type.isalpha() and aa_type.isupper():\n                aa_id = mapping.get(aa_type, mapping['X'])\n            else:\n                raise ValueError(\n                    f'Invalid character in the sequence: {aa_type}')\n        else:\n            aa_id = mapping[aa_type]\n        one_hot_arr[aa_index, aa_id] = 1\n\n    return one_hot_arr\n\n\nrestype_1to3 = {\n    'A': 'ALA',\n    'R': 'ARG',\n    'N': 'ASN',\n    'D': 'ASP',\n    'C': 'CYS',\n    'Q': 'GLN',\n    'E': 'GLU',\n    'G': 'GLY',\n    'H': 'HIS',\n    'I': 'ILE',\n    'L': 'LEU',\n    'K': 'LYS',\n    'M': 'MET',\n    'F': 'PHE',\n    'P': 'PRO',\n    'S': 'SER',\n    'T': 'THR',\n    'W': 'TRP',\n    'Y': 'TYR',\n    'V': 'VAL',\n}\n\n# NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple\n# 1-to-1 mapping of 3 letter names to one letter names. 
The latter contains\n# many more, and less common, three letter names as keys and maps many of these\n# to the same one letter name (including 'X' and 'U' which we don't use here).\nrestype_3to1 = {v: k for k, v in restype_1to3.items()}\n\n# Define a restype name for all unknown residues.\nunk_restype = 'UNK'\n\nresnames = [restype_1to3[r] for r in restypes] + [unk_restype]\nresname_to_idx = {resname: i for i, resname in enumerate(resnames)}\n\n# The mapping here uses hhblits convention, so that B is mapped to D, J and O\n# are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the\n# remaining 20 amino acids are kept in alphabetical order.\n# There are 2 non-amino acid codes, X (representing any amino acid) and\n# \"-\" representing a missing amino acid in an alignment.  The id for these\n# codes is put at the end (20 and 21) so that they can easily be ignored if\n# desired.\nHHBLITS_AA_TO_ID = {\n    'A': 0,\n    'B': 2,\n    'C': 1,\n    'D': 2,\n    'E': 3,\n    'F': 4,\n    'G': 5,\n    'H': 6,\n    'I': 7,\n    'J': 20,\n    'K': 8,\n    'L': 9,\n    'M': 10,\n    'N': 11,\n    'O': 20,\n    'P': 12,\n    'Q': 13,\n    'R': 14,\n    'S': 15,\n    'T': 16,\n    'U': 1,\n    'V': 17,\n    'W': 18,\n    'X': 20,\n    'Y': 19,\n    'Z': 3,\n    '-': 21,\n}\n\n# Partial inversion of HHBLITS_AA_TO_ID.\nID_TO_HHBLITS_AA = {\n    0: 'A',\n    1: 'C',  # Also U.\n    2: 'D',  # Also B.\n    3: 'E',  # Also Z.\n    4: 'F',\n    5: 'G',\n    6: 'H',\n    7: 'I',\n    8: 'K',\n    9: 'L',\n    10: 'M',\n    11: 'N',\n    12: 'P',\n    13: 'Q',\n    14: 'R',\n    15: 'S',\n    16: 'T',\n    17: 'V',\n    18: 'W',\n    19: 'Y',\n    20: 'X',  # Includes J and O.\n    21: '-',\n}\n\nrestypes_with_x_and_gap = restypes + ['X', '-']\nMAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple(\n    restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i])\n    for i in range(len(restypes_with_x_and_gap)))\n\n\ndef _make_standard_atom_mask() -> np.ndarray:\n    \"\"\"Returns 
[num_res_types, num_atom_types] mask array.\"\"\"\n    # +1 to account for unknown (all 0s).\n    mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32)\n    for restype, restype_letter in enumerate(restypes):\n        restype_name = restype_1to3[restype_letter]\n        atom_names = residue_atoms[restype_name]\n        for atom_name in atom_names:\n            atom_type = atom_order[atom_name]\n            mask[restype, atom_type] = 1\n    return mask\n\n\nSTANDARD_ATOM_MASK = _make_standard_atom_mask()\n\n\n# A one hot representation for the first and second atoms defining the axis\n# of rotation for each chi-angle in each residue.\ndef chi_angle_atom(atom_index: int) -> np.ndarray:\n    \"\"\"Define chi-angle rigid groups via one-hot representations.\"\"\"\n    chi_angles_index = {}\n    one_hots = []\n\n    for k, v in chi_angles_atoms.items():\n        indices = [atom_types.index(s[atom_index]) for s in v]\n        indices.extend([-1] * (4 - len(indices)))\n        chi_angles_index[k] = indices\n\n    for r in restypes:\n        res3 = restype_1to3[r]\n        one_hot = np.eye(atom_type_num)[chi_angles_index[res3]]\n        one_hots.append(one_hot)\n\n    one_hots.append(np.zeros([4, atom_type_num]))  # Add zeros for residue `X`.\n    one_hot = np.stack(one_hots, axis=0)\n    one_hot = np.transpose(one_hot, [0, 2, 1])\n\n    return one_hot\n\n\nchi_atom_1_one_hot = chi_angle_atom(1)\nchi_atom_2_one_hot = chi_angle_atom(2)\n\n# An array like chi_angles_atoms but using indices rather than names.\nchi_angles_atom_indices = [chi_angles_atoms[restype_1to3[r]] for r in restypes]\nchi_angles_atom_indices = tree.map_structure(\n    lambda atom_name: atom_order[atom_name], chi_angles_atom_indices)\nchi_angles_atom_indices = np.array([\n    chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms)))\n    for chi_atoms in chi_angles_atom_indices\n])\n\n# Mapping from (res_name, atom_name) pairs to the atom's chi group index\n# and atom index within that 
group.\nchi_groups_for_atom = collections.defaultdict(list)\nfor res_name, chi_angle_atoms_for_res in chi_angles_atoms.items():\n    for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res):\n        for atom_i, atom in enumerate(chi_group):\n            chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i))\nchi_groups_for_atom = dict(chi_groups_for_atom)\n\n\ndef _make_rigid_transformation_4x4(ex, ey, translation):\n    \"\"\"Create a rigid 4x4 transformation matrix from two axes and transl.\"\"\"\n    # Normalize ex.\n    ex_normalized = ex / np.linalg.norm(ex)\n\n    # make ey perpendicular to ex\n    ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized\n    ey_normalized /= np.linalg.norm(ey_normalized)\n\n    # compute ez as cross product\n    eznorm = np.cross(ex_normalized, ey_normalized)\n    m = np.stack(\n        [ex_normalized, ey_normalized, eznorm, translation]).transpose()\n    m = np.concatenate([m, [[0., 0., 0., 1.]]], axis=0)\n    return m\n\n\n# create an array with (restype, atomtype) --> rigid_group_idx\n# and an array with (restype, atomtype, coord) for the atom positions\n# and compute affine transformation matrices (4,4) from one rigid group to the\n# previous group\nrestype_atom37_to_rigid_group = np.zeros([21, 37], dtype=np.int)\nrestype_atom37_mask = np.zeros([21, 37], dtype=np.float32)\nrestype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)\nrestype_atom14_to_rigid_group = np.zeros([21, 14], dtype=np.int)\nrestype_atom14_mask = np.zeros([21, 14], dtype=np.float32)\nrestype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)\nrestype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)\n\n\ndef _make_rigid_group_constants():\n    \"\"\"Fill the arrays above.\"\"\"\n    for restype, restype_letter in enumerate(restypes):\n        resname = restype_1to3[restype_letter]\n        for atomname, group_idx, atom_position in rigid_group_atom_positions[\n    
            resname]:\n            atomtype = atom_order[atomname]\n            restype_atom37_to_rigid_group[restype, atomtype] = group_idx\n            restype_atom37_mask[restype, atomtype] = 1\n            restype_atom37_rigid_group_positions[restype,\n                                                 atomtype, :] = atom_position\n\n            atom14idx = restype_name_to_atom14_names[resname].index(atomname)\n            restype_atom14_to_rigid_group[restype, atom14idx] = group_idx\n            restype_atom14_mask[restype, atom14idx] = 1\n            restype_atom14_rigid_group_positions[restype,\n                                                 atom14idx, :] = atom_position\n\n    for restype, restype_letter in enumerate(restypes):\n        resname = restype_1to3[restype_letter]\n        atom_positions = {\n            name: np.array(pos)\n            for name, _, pos in rigid_group_atom_positions[resname]\n        }\n\n        # backbone to backbone is the identity transform\n        restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4)\n\n        # pre-omega-frame to backbone (currently dummy identity matrix)\n        restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4)\n\n        # phi-frame to backbone\n        mat = _make_rigid_transformation_4x4(\n            ex=atom_positions['N'] - atom_positions['CA'],\n            ey=np.array([1., 0., 0.]),\n            translation=atom_positions['N'])\n        restype_rigid_group_default_frame[restype, 2, :, :] = mat\n\n        # psi-frame to backbone\n        mat = _make_rigid_transformation_4x4(\n            ex=atom_positions['C'] - atom_positions['CA'],\n            ey=atom_positions['CA'] - atom_positions['N'],\n            translation=atom_positions['C'])\n        restype_rigid_group_default_frame[restype, 3, :, :] = mat\n\n        # chi1-frame to backbone\n        if chi_angles_mask[restype][0]:\n            base_atom_names = chi_angles_atoms[resname][0]\n            base_atom_positions = 
[\n                atom_positions[name] for name in base_atom_names\n            ]\n            mat = _make_rigid_transformation_4x4(\n                ex=base_atom_positions[2] - base_atom_positions[1],\n                ey=base_atom_positions[0] - base_atom_positions[1],\n                translation=base_atom_positions[2])\n            restype_rigid_group_default_frame[restype, 4, :, :] = mat\n\n        # chi2-frame to chi1-frame\n        # chi3-frame to chi2-frame\n        # chi4-frame to chi3-frame\n        # luckily all rotation axes for the next frame start at (0,0,0) of the\n        # previous frame\n        for chi_idx in range(1, 4):\n            if chi_angles_mask[restype][chi_idx]:\n                axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2]\n                axis_end_atom_position = atom_positions[axis_end_atom_name]\n                mat = _make_rigid_transformation_4x4(\n                    ex=axis_end_atom_position,\n                    ey=np.array([-1., 0., 0.]),\n                    translation=axis_end_atom_position)\n                restype_rigid_group_default_frame[restype, 4 +\n                                                  chi_idx, :, :] = mat\n\n\n_make_rigid_group_constants()\n\n\ndef make_atom14_dists_bounds(overlap_tolerance=1.5,\n                             bond_length_tolerance_factor=15):\n    \"\"\"compute upper and lower bounds for bonds to assess violations.\"\"\"\n    restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32)\n    restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32)\n    restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32)\n    residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()\n    for restype, restype_letter in enumerate(restypes):\n        resname = restype_1to3[restype_letter]\n        atom_list = restype_name_to_atom14_names[resname]\n\n        # create lower and upper bounds for clashes\n        for atom1_idx, atom1_name in 
enumerate(atom_list):\n            if not atom1_name:\n                continue\n            atom1_radius = van_der_waals_radius[atom1_name[0]]\n            for atom2_idx, atom2_name in enumerate(atom_list):\n                if (not atom2_name) or atom1_idx == atom2_idx:\n                    continue\n                atom2_radius = van_der_waals_radius[atom2_name[0]]\n                lower = atom1_radius + atom2_radius - overlap_tolerance\n                upper = 1e10\n                restype_atom14_bond_lower_bound[restype, atom1_idx,\n                                                atom2_idx] = lower\n                restype_atom14_bond_lower_bound[restype, atom2_idx,\n                                                atom1_idx] = lower\n                restype_atom14_bond_upper_bound[restype, atom1_idx,\n                                                atom2_idx] = upper\n                restype_atom14_bond_upper_bound[restype, atom2_idx,\n                                                atom1_idx] = upper\n\n        # overwrite lower and upper bounds for bonds and angles\n        for b in residue_bonds[resname] + residue_virtual_bonds[resname]:\n            atom1_idx = atom_list.index(b.atom1_name)\n            atom2_idx = atom_list.index(b.atom2_name)\n            lower = b.length - bond_length_tolerance_factor * b.stddev\n            upper = b.length + bond_length_tolerance_factor * b.stddev\n            restype_atom14_bond_lower_bound[restype, atom1_idx,\n                                            atom2_idx] = lower\n            restype_atom14_bond_lower_bound[restype, atom2_idx,\n                                            atom1_idx] = lower\n            restype_atom14_bond_upper_bound[restype, atom1_idx,\n                                            atom2_idx] = upper\n            restype_atom14_bond_upper_bound[restype, atom2_idx,\n                                            atom1_idx] = upper\n            restype_atom14_bond_stddev[restype, atom1_idx,\n      
                                 atom2_idx] = b.stddev\n            restype_atom14_bond_stddev[restype, atom2_idx,\n                                       atom1_idx] = b.stddev\n    return {\n        'lower_bound': restype_atom14_bond_lower_bound,  # shape (21,14,14)\n        'upper_bound': restype_atom14_bond_upper_bound,  # shape (21,14,14)\n        'stddev': restype_atom14_bond_stddev,  # shape (21,14,14)\n    }\n"
  },
  {
    "path": "ppfleetx/models/protein_folding/template.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\n\nfrom ppfleetx.distributed.protein_folding import dap\n\nfrom .attentions import (\n    Attention,\n    TriangleMultiplication,\n    TriangleAttention, )\n\nfrom .common import (\n    Transition,\n    Dropout,\n    recompute_wrapper,\n    dgram_from_positions,\n    subbatch, )\n\nfrom . import (residue_constants, )\nfrom . import (quat_affine, )\n\n\nclass TemplatePair(nn.Layer):\n    \"\"\"Pair processing for the templates.\n\n    Jumper et al. (2021) Suppl. Alg. 
16 \"TemplatePairStack\" lines 2-6\n    \"\"\"\n\n    def __init__(self, channel_num, config, global_config):\n        super(TemplatePair, self).__init__()\n        self.config = config\n        self.global_config = global_config\n\n        channel_num = {}\n        channel_num[\n            'pair_channel'] = self.config.triangle_attention_ending_node.value_dim\n\n        self.triangle_attention_starting_node = TriangleAttention(\n            channel_num,\n            self.config.triangle_attention_starting_node,\n            self.global_config,\n            name='triangle_attention_starting_node')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.triangle_attention_starting_node)\n        self.triangle_starting_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        self.triangle_attention_ending_node = TriangleAttention(\n            channel_num,\n            self.config.triangle_attention_ending_node,\n            self.global_config,\n            name='triangle_attention_ending_node')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.triangle_attention_ending_node)\n        self.triangle_ending_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        self.triangle_multiplication_outgoing = TriangleMultiplication(\n            channel_num,\n            self.config.triangle_multiplication_outgoing,\n            self.global_config,\n            name='triangle_multiplication_outgoing')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.triangle_multiplication_outgoing)\n        self.triangle_outgoing_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, 
axis=dropout_axis)\n\n        self.triangle_multiplication_incoming = TriangleMultiplication(\n            channel_num,\n            self.config.triangle_multiplication_incoming,\n            self.global_config,\n            name='triangle_multiplication_incoming')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.triangle_multiplication_incoming)\n        self.triangle_incoming_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n        self.pair_transition = Transition(\n            channel_num,\n            self.config.pair_transition,\n            self.global_config,\n            is_extra_msa=False,\n            transition_type='pair_transition')\n\n        dropout_rate, dropout_axis = self._parse_dropout_params(\n            self.pair_transition)\n        self.pair_transition_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \\\n            if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis)\n\n    def _parse_dropout_params(self, module):\n        dropout_rate = 0.0 if self.global_config.deterministic else \\\n            module.config.dropout_rate\n        dropout_axis = None\n        if module.config.shared_dropout:\n            dropout_axis = {\n                'per_row': [0, 2, 3],\n                'per_column': [0, 1, 3],\n            }[module.config.orientation]\n\n        return dropout_rate, dropout_axis\n\n    def forward(self, pair_act, pair_mask):\n        \"\"\"Builds one block of TemplatePair module.\n\n        Arguments:\n        pair_act: Pair activations for single template, shape [batch, N_res, N_res, c_t].\n        pair_mask: Pair mask, shape [batch, N_res, N_res].\n\n        Returns:\n        Updated pair_act, shape [batch, N_res, N_res, c_t].\n        \"\"\"\n\n        pair_mask_row = dap.scatter(pair_mask, axis=1)\n        pair_mask_col = 
dap.scatter(pair_mask, axis=2)\n\n        residual = self.triangle_attention_starting_node(pair_act,\n                                                         pair_mask_row)\n        residual = self.triangle_starting_dropout(residual)\n        pair_act = pair_act + residual\n\n        pair_act = dap.row_to_col(pair_act)\n        residual = self.triangle_attention_ending_node(pair_act, pair_mask_col)\n        residual = self.triangle_ending_dropout(residual)\n        pair_act = pair_act + residual\n\n        pair_act = dap.col_to_row(pair_act)\n        residual = self.triangle_multiplication_outgoing(pair_act,\n                                                         pair_mask_row)\n        residual = self.triangle_outgoing_dropout(residual)\n        pair_act = pair_act + residual\n\n        pair_act = dap.row_to_col(pair_act)\n        residual = self.triangle_multiplication_incoming(pair_act,\n                                                         pair_mask_col)\n        residual = self.triangle_incoming_dropout(residual)\n        pair_act = pair_act + residual\n\n        residual = self.pair_transition(pair_act, pair_mask)\n        residual = self.pair_transition_dropout(residual)\n        pair_act = pair_act + residual\n\n        pair_act = dap.col_to_row(pair_act)\n\n        return pair_act\n\n\nclass SingleTemplateEmbedding(nn.Layer):\n    \"\"\"Embeds a single template.\n\n    Jumper et al. (2021) Suppl. Alg. 
2 \"Inference\" lines 9+11\n    \"\"\"\n\n    def __init__(self, channel_num, config, global_config):\n        super(SingleTemplateEmbedding, self).__init__()\n        self.config = config\n        self.channel_num = channel_num\n        self.global_config = global_config\n\n        Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear\n\n        self.embedding2d = Linear(channel_num['template_pair'],\n                                  self.config.template_pair_stack.\n                                  triangle_attention_ending_node.value_dim)\n\n        self.template_pair_stack = nn.LayerList()\n        for _ in range(self.config.template_pair_stack.num_block):\n            self.template_pair_stack.append(\n                TemplatePair(self.channel_num, self.config.template_pair_stack,\n                             self.global_config))\n\n        self.output_layer_norm = nn.LayerNorm(self.config.attention.key_dim)\n\n    def forward(self, query_embedding, batch, mask_2d):\n        \"\"\"Build the single template embedding.\n\n        Arguments:\n            query_embedding: Query pair representation, shape [batch, N_res, N_res, c_z].\n            batch: A batch of template features (note the template dimension has been\n                stripped out as this module only runs over a single template).\n            mask_2d: Padding mask (Note: this doesn't care if a template exists,\n                unlike the template_pseudo_beta_mask).\n\n        Returns:\n            A template embedding [N_res, N_res, c_z].\n        \"\"\"\n        assert mask_2d.dtype == query_embedding.dtype\n        dtype = query_embedding.dtype\n        num_res = batch['template_aatype'].shape[1]\n        template_mask = batch['template_pseudo_beta_mask']\n        # template_mask[..., None] * template_mask[..., None, :]\n        template_mask_2d = template_mask.unsqueeze(\n            axis=-1) * template_mask.unsqueeze(axis=-2)\n        template_mask_2d 
= template_mask_2d.astype(dtype)\n\n        template_dgram = dgram_from_positions(batch['template_pseudo_beta'],\n                                              **self.config.dgram_features)\n        template_dgram = template_dgram.astype(dtype)\n\n        aatype = nn.functional.one_hot(batch['template_aatype'], 22)\n        aatype = aatype.astype(dtype)\n\n        to_concat = [template_dgram, template_mask_2d.unsqueeze(axis=-1)]\n        to_concat.append(\n            paddle.tile(\n                aatype.unsqueeze(axis=-3),  # aatype[..., None, :, :]\n                [1, num_res, 1, 1]))\n        to_concat.append(\n            paddle.tile(\n                aatype.unsqueeze(axis=-2),  # aatype[..., None, :]\n                [1, 1, num_res, 1]))\n\n        n, ca, c = [residue_constants.atom_order[a] for a in ('N', 'CA', 'C')]\n        rot, trans = quat_affine.make_transform_from_reference(\n            n_xyz=batch['template_all_atom_positions'][..., n, :],\n            ca_xyz=batch['template_all_atom_positions'][..., ca, :],\n            c_xyz=batch['template_all_atom_positions'][..., c, :])\n        affines = quat_affine.QuatAffine(\n            quaternion=quat_affine.rot_to_quat(rot),\n            translation=trans,\n            rotation=rot)\n\n        points = [\n            paddle.unsqueeze(\n                x, axis=-2) for x in paddle.unstack(\n                    affines.translation, axis=-1)\n        ]\n        affine_vec = affines.invert_point(points, extra_dims=1)\n        inv_distance_scalar = paddle.rsqrt(1e-6 + sum(\n            [paddle.square(x) for x in affine_vec]))\n\n        # Backbone affine mask: whether the residue has C, CA, N\n        # (the template mask defined above only considers pseudo CB).\n        template_mask = (batch['template_all_atom_masks'][..., n] *\n                         batch['template_all_atom_masks'][..., ca] *\n                         batch['template_all_atom_masks'][..., c])\n        # template_mask[..., None] * 
template_mask[..., None, :]\n        template_mask_2d = template_mask.unsqueeze(\n            axis=-1) * template_mask.unsqueeze(axis=-2)\n        inv_distance_scalar *= template_mask_2d.astype(\n            inv_distance_scalar.dtype)\n\n        unit_vector = [(x * inv_distance_scalar).unsqueeze(axis=-1)\n                       for x in affine_vec]\n        unit_vector = [x.astype(dtype) for x in unit_vector]\n        if not self.config.use_template_unit_vector:\n            unit_vector = [paddle.zeros_like(x) for x in unit_vector]\n        to_concat.extend(unit_vector)\n\n        template_mask_2d = template_mask_2d.astype(dtype)\n        to_concat.append(template_mask_2d.unsqueeze(axis=-1))\n\n        act = paddle.concat(to_concat, axis=-1)\n        # Mask out non-template regions so we don't get arbitrary values in the\n        # distogram for these regions.\n        act *= template_mask_2d.unsqueeze(axis=-1)\n\n        act = self.embedding2d(act)\n\n        act = dap.scatter(act, axis=1)\n        for idx, pair_encoder in enumerate(self.template_pair_stack):\n            act = recompute_wrapper(\n                pair_encoder,\n                act,\n                mask_2d,\n                is_recompute=self.training and idx >=\n                self.config.template_pair_stack.recompute_start_block_index)\n        act = dap.gather(act, axis=1)\n\n        act = self.output_layer_norm(act)\n        return act\n\n\nclass TemplateEmbedding(nn.Layer):\n    \"\"\"Embeds a set of templates.\n\n        Jumper et al. (2021) Suppl. Alg. 2 \"Inference\" lines 9-12\n        Jumper et al. (2021) Suppl. Alg. 
17 \"TemplatePointwiseAttention\"\n    \"\"\"\n\n    def __init__(self, channel_num, config, global_config):\n        super(TemplateEmbedding, self).__init__()\n        self.config = config\n        self.global_config = global_config\n\n        self.single_template_embedding = SingleTemplateEmbedding(\n            channel_num, config, global_config)\n        self.attention = Attention(\n            config.attention, global_config, channel_num['pair_channel'],\n            config.attention.key_dim, channel_num['pair_channel'])\n\n    def forward(self, query_embedding, template_batch, mask_2d):\n        \"\"\"Build TemplateEmbedding module.\n\n        Arguments:\n            query_embedding: Query pair representation, shape [n_batch, N_res, N_res, c_z].\n            template_batch: A batch of template features.\n            mask_2d: Padding mask (Note: this doesn't care if a template exists,\n                unlike the template_pseudo_beta_mask).\n\n        Returns:\n            A template embedding [n_batch, N_res, N_res, c_z].\n        \"\"\"\n\n        num_templates = template_batch['template_mask'].shape[1]\n\n        num_channels = (self.config.template_pair_stack.\n                        triangle_attention_ending_node.value_dim)\n\n        num_res = query_embedding.shape[1]\n\n        dtype = query_embedding.dtype\n        template_mask = template_batch['template_mask']\n        template_mask = template_mask.astype(dtype)\n\n        query_channels = query_embedding.shape[-1]\n\n        outs = []\n        for i in range(num_templates):\n            # By default, num_templates = 4\n            batch0 = {\n                k: paddle.squeeze(\n                    v.slice([1], [i], [i + 1]), axis=1)\n                for k, v in template_batch.items()\n            }\n            outs.append(\n                self.single_template_embedding(query_embedding, batch0,\n                                               mask_2d))\n\n        template_pair_repr = 
paddle.stack(outs, axis=1)\n\n        flat_query = paddle.reshape(\n            query_embedding, [-1, num_res * num_res, 1, query_channels])\n        flat_templates = paddle.reshape(\n            paddle.transpose(template_pair_repr, [0, 2, 3, 1, 4]),\n            [-1, num_res * num_res, num_templates, num_channels])\n\n        bias = 1e9 * (template_mask[:, None, None, None, :] - 1.)\n\n        if not self.training:\n            sb_attn = subbatch(self.attention, [0, 1], [1, 1],\n                               self.config.subbatch_size, 1)\n            emb = sb_attn(flat_query, flat_templates, bias)\n\n        else:\n            emb = self.attention(flat_query, flat_templates, bias)\n\n        emb = paddle.reshape(emb, [-1, num_res, num_res, query_channels])\n\n        # No gradients if no templates.\n        emb *= (paddle.sum(template_mask) > 0.).astype(emb.dtype)\n        return emb\n"
  },
  {
    "path": "ppfleetx/models/vision_model/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/models/vision_model/factory.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport copy\nimport importlib\n\nfrom .vit import *\nfrom .loss import *\nfrom .metrics import *\nfrom .resnet import *\nfrom .moco import *\nfrom .layers import *\n\n__all__ = ['build', ]\n\n\ndef build(config):\n    if config is None:\n        return None\n    config = copy.deepcopy(config)\n    model_type = config.pop(\"name\")\n    mod = importlib.import_module(__name__)\n    model = getattr(mod, model_type)(**config)\n    return model\n"
  },
  {
    "path": "ppfleetx/models/vision_model/general_classification_module.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport copy\nimport importlib\nfrom collections import defaultdict\nimport numpy as np\n\nimport paddle\nfrom paddle import LazyGuard\nfrom paddle.static import InputSpec\nfrom ppfleetx.utils.log import logger\n\nfrom ppfleetx.core.module.basic_module import BasicModule\n\nfrom .factory import build\n\n\nclass GeneralClsModule(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        self.model_configs = copy.deepcopy(configs.Model)\n        self.model_configs.pop('module')\n\n        # must init before loss function\n        super(GeneralClsModule, self).__init__(configs)\n\n        assert 'train' in self.model_configs.loss\n        self.loss_fn = build(self.model_configs.loss.train)\n        self.eval_loss_fn = None\n        if 'eval' in self.model_configs.loss:\n            self.eval_loss_fn = build(self.model_configs.loss.eval)\n\n        if 'train' in self.model_configs.metric:\n            self.train_metric_fn = build(self.model_configs.metric.train)\n        if 'eval' in self.model_configs.metric:\n            self.eval_metric_fn = build(self.model_configs.metric.eval)\n\n        self.train_batch_size = None\n        self.eval_batch_size = None\n        self.best_metric = 0.0\n        self.acc_list = []\n\n    def get_model(self):\n        if not 
hasattr(self, 'model') or self.model is None:\n            self.model = build(self.model_configs.model)\n\n        return self.model\n\n    def qat_model(self):\n        self.quanter = paddleslim.dygraph.quant.QAT(config=self.qat_config)\n        self.quanter.quantize(self.model)\n\n    def forward(self, inputs):\n        return self.model(inputs)\n\n    def training_step(self, batch):\n        inputs, labels = batch\n\n        if self.train_batch_size is None:\n            self.train_batch_size = inputs.shape[\n                0] * paddle.distributed.get_world_size()\n\n        inputs.stop_gradient = True\n        labels.stop_gradient = True\n\n        logits = self(inputs)\n        loss = self.loss_fn(logits, labels)\n\n        return loss\n\n    def training_step_end(self, log_dict):\n        ips = self.train_batch_size / log_dict['train_cost']\n        logger.info(\n            \"[train] epoch: %d, step: [%d/%d], learning rate: %.7f, loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'],\n               log_dict['lr'], log_dict['loss'], log_dict['train_cost'], ips))\n\n    def validation_step(self, batch):\n        inputs, labels = batch\n\n        batch_size = inputs.shape[0]\n\n        inputs.stop_gradient = True\n        labels.stop_gradient = True\n\n        logits = self(inputs)\n        loss = self.eval_loss_fn(logits, labels)\n\n        if paddle.distributed.get_world_size() > 1:\n            label_list = []\n            paddle.distributed.all_gather(label_list, labels)\n            labels = paddle.concat(label_list, 0)\n\n            pred_list = []\n            paddle.distributed.all_gather(pred_list, logits)\n            logits = paddle.concat(pred_list, 0)\n\n        if self.eval_batch_size is None:\n            self.eval_batch_size = logits.shape[0]\n\n        acc = self.eval_metric_fn(logits, labels)\n        self.acc_list.append(acc)\n        return loss\n\n    def 
validation_step_end(self, log_dict):\n        ips = self.eval_batch_size / log_dict['eval_cost']\n        speed = self.configs['Engine']['logging_freq'] / log_dict['eval_cost']\n        logger.info(\n            \"[eval] epoch: %d, step: [%d/%d], loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'],\n               log_dict['loss'], log_dict['eval_cost'], ips))\n\n    def input_spec(self):\n        return [\n            InputSpec(\n                shape=[None, 3, 224, 224], name=\"images\", dtype='float32')\n        ]\n\n    def training_epoch_end(self, log_dict):\n        logger.info(\"[Training] epoch: %d, total time: %.5f sec\" %\n                    (log_dict['epoch'], log_dict['train_cost']))\n\n    def validation_epoch_end(self, log_dict):\n        msg = ''\n        if len(self.acc_list) > 0:\n            ret = defaultdict(list)\n\n            for item in self.acc_list:\n                for key, val in item.items():\n                    ret[key].append(val)\n\n            for k, v in ret.items():\n                ret[k] = np.mean(v)\n\n            if 'metric' in ret and ret['metric'] > self.best_metric:\n                self.best_metric = ret['metric']\n\n            if 'metric' in ret:\n                ret['best_metric'] = self.best_metric\n\n            msg = ', '\n            msg += \", \".join([f'{k} = {v:.6f}' for k, v in ret.items()])\n            self.acc_list.clear()\n\n        logger.info(\"[Eval] epoch: %d, total time: %.5f sec%s\" %\n                    (log_dict['epoch'], log_dict['eval_cost'], msg))\n\n\nclass GeneralClsModuleAuto(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        self.model_configs = copy.deepcopy(configs.Model)\n        self.model_configs.pop('module')\n\n        # must init before loss function\n        super(GeneralClsModuleAuto, self).__init__(configs)\n\n        assert 'loss' in 
self.model_configs\n        self.loss_fn = build(self.model_configs.loss)\n\n        if 'metric' in self.model_configs:\n            self.metric_fn = build(self.model_configs.metric)\n\n    def get_model(self):\n        with LazyGuard():\n            if not hasattr(self, 'model') or self.model is None:\n                self.model = build(self.model_configs.model)\n        return self.model\n\n    def input_spec(self):\n        return [\n            InputSpec(\n                shape=[None, 3, 224, 224], name=\"images\", dtype='float32')\n        ]\n"
  },
  {
    "path": "ppfleetx/models/vision_model/layers/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\n\nfrom .mlp import *\nfrom .identity import *\n"
  },
  {
    "path": "ppfleetx/models/vision_model/layers/attention.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom .initializer import xavier_uniform_, zeros_\n\n\nclass ViTAttention(nn.Layer):\n    def __init__(self,\n                 dim,\n                 num_heads=8,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 attn_drop=0.,\n                 proj_drop=0.):\n        super().__init__()\n        self.num_heads = num_heads\n        head_dim = dim // num_heads\n        self.scale = qk_scale or head_dim**-0.5\n\n        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)\n        self.attn_drop = nn.Dropout(attn_drop)\n        self.proj = nn.Linear(dim, dim)\n        self.proj_drop = nn.Dropout(proj_drop)\n\n        self.apply(self._init_weights)\n\n    def _init_weights(self, m):\n        if isinstance(m, nn.Linear):\n            xavier_uniform_(m.weight)\n            zeros_(m.bias)\n\n    def forward(self, x):\n        N, C = x.shape[1:]\n        qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C //\n                                   self.num_heads)).transpose((2, 0, 3, 1, 4))\n        q, k, v = qkv[0], qkv[1], qkv[2]\n\n        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale\n        attn = nn.functional.softmax(attn, axis=-1)\n        attn = self.attn_drop(attn)\n\n        x = (paddle.matmul(attn, 
v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))\n        x = self.proj(x)\n        x = self.proj_drop(x)\n        return x\n"
  },
  {
    "path": "ppfleetx/models/vision_model/layers/droppath.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\n\n\ndef drop_path(x, drop_prob=0., training=False):\n    \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).\n    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...\n    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...\n    \"\"\"\n    if drop_prob == 0. or not training:\n        return x\n    keep_prob = paddle.to_tensor(1 - drop_prob)\n    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)\n    if x.dtype == paddle.float16:\n        random_tensor = keep_prob + paddle.rand(\n            shape, dtype=paddle.float32).astype(x.dtype)\n    else:\n        random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)\n    random_tensor = paddle.floor(random_tensor)  # binarize\n    output = x.divide(keep_prob) * random_tensor\n    return output\n\n\nclass DropPath(nn.Layer):\n    \"\"\"Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).\n    \"\"\"\n\n    def __init__(self, drop_prob=None):\n        super(DropPath, self).__init__()\n        self.drop_prob = drop_prob\n\n    def forward(self, x):\n        return drop_path(x, self.drop_prob, self.training)\n"
  },
  {
    "path": "ppfleetx/models/vision_model/layers/embedding.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\n\n\nclass ViTPatchEmbed(nn.Layer):\n    \"\"\" Image to Patch Embedding\n    \"\"\"\n\n    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):\n        super().__init__()\n        img_size = img_size if isinstance(img_size, tuple) else (img_size,\n                                                                 img_size)\n        patch_size = patch_size if isinstance(patch_size, tuple) else (\n            patch_size, patch_size)\n        num_patches = (img_size[1] // patch_size[1]) * \\\n            (img_size[0] // patch_size[0])\n        self.img_size = img_size\n        self.patch_size = patch_size\n        self.num_patches = num_patches\n\n        self.proj = nn.Conv2D(\n            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)\n\n    def forward(self, x):\n        B, C, H, W = x.shape\n        assert H == self.img_size[0] and W == self.img_size[1], \\\n            f\"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).\"\n\n        x = self.proj(x).flatten(2).transpose((0, 2, 1))\n        return x\n"
  },
  {
    "path": "ppfleetx/models/vision_model/layers/identity.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\n\n__all__ = ['Identity', ]\n\n\nclass Identity(nn.Layer):\n    def __init__(self):\n        super(Identity, self).__init__()\n\n    def forward(self, input):\n        return input\n"
  },
  {
    "path": "ppfleetx/models/vision_model/layers/initializer.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport math\nfrom paddle.nn.initializer import Constant, Normal, XavierUniform, Uniform\n\nmlp_bias_normal_ = Normal(std=1e-6)\npos_normal_ = Normal(std=0.02)\nxavier_uniform_ = XavierUniform()\nzeros_ = Constant(value=0.)\nminus_tens_ = Constant(value=-10.)\nones_ = Constant(value=1.)\n\n\ndef xavier_uniform_2d_(param, axis=-1):\n    fan_in = int(np.prod(param.shape[:axis]))\n    fan_out = int(np.prod(param.shape[axis:]))\n    limit = math.sqrt(6.0 / (fan_in + fan_out))\n    uniform = Uniform(low=-limit, high=limit)\n    uniform(param)\n"
  },
  {
    "path": "ppfleetx/models/vision_model/layers/mlp.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nfrom .initializer import xavier_uniform_, mlp_bias_normal_\n\n__all__ = ['ViTMLP', ]\n\n\nclass ViTMLP(nn.Layer):\n    def __init__(self,\n                 in_features,\n                 hidden_features=None,\n                 out_features=None,\n                 act_layer=nn.GELU,\n                 drop=0.):\n        super().__init__()\n        out_features = out_features or in_features\n        hidden_features = hidden_features or in_features\n        self.fc1 = nn.Linear(in_features, hidden_features)\n        self.act = act_layer()\n        self.fc2 = nn.Linear(hidden_features, out_features)\n        self.drop = nn.Dropout(drop)\n\n        self.apply(self._init_weights)\n\n    def _init_weights(self, m):\n        if isinstance(m, nn.Linear):\n            xavier_uniform_(m.weight)\n            mlp_bias_normal_(m.bias)\n\n    def forward(self, x):\n        x = self.fc1(x)\n        x = self.act(x)\n        x = self.drop(x)\n        x = self.fc2(x)\n        x = self.drop(x)\n        return x\n"
  },
  {
    "path": "ppfleetx/models/vision_model/loss/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .cross_entropy import *\n"
  },
  {
    "path": "ppfleetx/models/vision_model/loss/cross_entropy.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\n\n__all__ = [\n    'ViTCELoss',\n    'CELoss',\n]\n\n\nclass CELoss(nn.Layer):\n    \"\"\"\n    Softmax Cross entropy loss\n    \"\"\"\n\n    def __init__(self, epsilon=None):\n        super().__init__()\n        if epsilon is not None:\n            assert epsilon >= 0 and epsilon <= 1, \"epsilon must be in [0, 1]\"\n        self.epsilon = epsilon\n\n    def _labelsmoothing(self, target, class_num):\n        if len(target.shape) == 1 or target.shape[-1] != class_num:\n            one_hot_target = F.one_hot(target, class_num)\n        else:\n            one_hot_target = target\n        soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon)\n        soft_target = paddle.reshape(soft_target, shape=[-1, class_num])\n        return soft_target\n\n    def forward(self, x, label):\n        if isinstance(x, dict):\n            x = x[\"logits\"]\n        if self.epsilon is not None:\n            class_num = x.shape[-1]\n            label = self._labelsmoothing(label, class_num)\n            x = -F.log_softmax(x, axis=-1)\n            loss = paddle.sum(x * label, axis=-1)\n        else:\n            if label.shape[-1] == x.shape[-1]:\n                loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1)\n            else:\n                if 
label.dtype == paddle.int32:\n                    label = paddle.cast(label, 'int64')\n                loss = F.cross_entropy(x, label=label, soft_label=False)\n        loss = loss.mean()\n        return loss\n\n\nclass ViTCELoss(nn.Layer):\n    \"\"\"\n    ViT style Sigmoid Cross entropy loss\n    \"\"\"\n\n    def __init__(self, epsilon=None):\n        super().__init__()\n        if epsilon is not None:\n            assert epsilon >= 0 and epsilon <= 1, \"epsilon must be in [0, 1]\"\n        self.epsilon = epsilon\n\n    def forward(self, x, label):\n        if isinstance(x, dict):\n            x = x[\"logits\"]\n        class_num = x.shape[-1]\n        if len(label.shape) == 1 or label.shape[-1] != class_num:\n            label = F.one_hot(label, class_num)\n            label = paddle.reshape(label, shape=[-1, class_num])\n        if self.epsilon is not None:\n            # vit style label smoothing\n            with paddle.no_grad():\n                label = label * (1.0 - self.epsilon) + self.epsilon\n\n        if x.dtype == paddle.float16:\n            x = paddle.cast(x, 'float32')\n        loss = F.binary_cross_entropy_with_logits(x, label, reduction='none')\n        loss = paddle.sum(loss, axis=-1)\n        loss = loss.mean()\n\n        return loss\n"
  },
  {
    "path": "ppfleetx/models/vision_model/metrics/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .accuracy import *\n"
  },
  {
    "path": "ppfleetx/models/vision_model/metrics/accuracy.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddle.nn as nn\n\n\nclass TopkAcc(nn.Layer):\n    def __init__(self, topk=(1, 5)):\n        super().__init__()\n        assert isinstance(topk, (int, list, tuple))\n        if isinstance(topk, int):\n            topk = [topk]\n        self.topk = topk\n\n    def forward(self, x, label):\n        if isinstance(x, dict):\n            x = x[\"logits\"]\n\n        if len(label.shape) == 1:\n            label = label.reshape([label.shape[0], -1])\n\n        if label.dtype == paddle.int32:\n            label = paddle.cast(label, 'int64')\n        metric_dict = dict()\n        for i, k in enumerate(self.topk):\n            acc = paddle.metric.accuracy(x, label, k=k).item()\n            metric_dict[\"top{}\".format(k)] = acc\n            if i == 0:\n                metric_dict[\"metric\"] = acc\n\n        return metric_dict\n"
  },
  {
    "path": "ppfleetx/models/vision_model/moco/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .moco import *\n"
  },
  {
    "path": "ppfleetx/models/vision_model/moco/moco.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections.abc import Callable\n\nimport os\nimport copy\nimport numpy as np\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.nn.initializer import Constant, Normal\n\nfrom ..layers.identity import Identity\n\n__all__ = [\n    'MoCo',\n    'MoCoV2Projector',\n    'MoCoClassifier',\n]\n\n\n@paddle.no_grad()\ndef concat_all_gather(tensor):\n    \"\"\"\n    Performs all_gather operation on the provided tensors.\n    \"\"\"\n    if paddle.distributed.get_world_size() < 2:\n        return tensor\n\n    tensors_gather = []\n    paddle.distributed.all_gather(tensors_gather, tensor)\n\n    output = paddle.concat(tensors_gather, axis=0)\n    return output\n\n\nclass MoCoV2Projector(nn.Layer):\n    def __init__(self, with_pool, in_dim, out_dim):\n        super().__init__()\n\n        self.with_pool = with_pool\n        if with_pool:\n            self.avgpool = nn.Sequential(\n                nn.AdaptiveAvgPool2D((1, 1)), nn.Flatten(start_axis=1))\n\n        self.mlp = nn.Sequential(nn.Linear(in_dim, out_dim), nn.ReLU())\n\n    def forward(self, x):\n\n        if self.with_pool:\n            x = self.avgpool(x)\n\n        x = self.mlp(x)\n        return x\n\n\nclass MoCoClassifier(nn.Layer):\n    def __init__(self, with_pool, num_features, num_classes):\n        super().__init__()\n\n        
self.with_pool = with_pool\n        if with_pool:\n            self.avgpool = nn.Sequential(\n                nn.AdaptiveAvgPool2D((1, 1)), nn.Flatten(start_axis=1))\n\n        self.fc = nn.Linear(num_features, num_classes)\n        normal_ = Normal(std=0.01)\n        zeros_ = Constant(value=0.)\n\n        normal_(self.fc.weight)\n        zeros_(self.fc.bias)\n\n    def forward(self, x):\n\n        if self.with_pool:\n            x = self.avgpool(x)\n        x = self.fc(x)\n        return x\n\n\nclass MoCo(nn.Layer):\n    \"\"\" MoCo v1, v2\n    \n    ref: https://github.com/facebookresearch/moco/blob/main/moco/builder.py\n    ref: https://github.com/PaddlePaddle/PASSL/blob/main/passl/modeling/architectures/moco.py\n    \"\"\"\n\n    def __init__(self,\n                 base_encoder,\n                 base_projector,\n                 base_classifier,\n                 momentum_encoder,\n                 momentum_projector,\n                 momentum_classifier,\n                 dim=128,\n                 K=65536,\n                 m=0.999,\n                 T=0.07,\n                 **kwargs):\n        super(MoCo, self).__init__()\n\n        self.m = m\n        self.T = T\n        self.K = K\n\n        self.base_encoder = nn.Sequential(base_encoder, base_projector,\n                                          base_classifier)\n        self.momentum_encoder = nn.Sequential(\n            momentum_encoder, momentum_projector, momentum_classifier)\n\n        for param_b, param_m in zip(self.base_encoder.parameters(),\n                                    self.momentum_encoder.parameters()):\n            param_m.copy_(param_b, False)  # initialize\n            param_m.stop_gradient = True  # not update by gradient\n\n        # create the queue\n        self.register_buffer(\"queue\", paddle.randn([dim, K]))\n        self.queue = F.normalize(self.queue, axis=0)\n\n        self.register_buffer(\"queue_ptr\", paddle.zeros([1], 'int64'))\n\n    @paddle.no_grad()\n    def 
_update_momentum_encoder(self):\n        \"\"\"Momentum update of the momentum encoder\"\"\"\n        #Note(GuoxiaWang): disable auto cast when use mix_precision\n        with paddle.amp.auto_cast(False):\n            for param_b, param_m in zip(self.base_encoder.parameters(),\n                                        self.momentum_encoder.parameters()):\n                paddle.assign((param_m * self.m + param_b * (1. - self.m)),\n                              param_m)\n                param_m.stop_gradient = True\n\n    @paddle.no_grad()\n    def _dequeue_and_enqueue(self, keys):\n        keys = concat_all_gather(keys)\n\n        batch_size = keys.shape[0]\n\n        ptr = int(self.queue_ptr[0])\n        assert self.K % batch_size == 0  # for simplicity\n\n        # replace the keys at ptr (dequeue and enqueue)\n        self.queue[:, ptr:ptr + batch_size] = keys.transpose([1, 0])\n        ptr = (ptr + batch_size) % self.K  # move pointer\n\n        self.queue_ptr[0] = ptr\n\n    @paddle.no_grad()\n    def _batch_shuffle_ddp(self, x):\n        \"\"\"\n        Batch shuffle, for making use of BatchNorm.\n        *** Only support DistributedDataParallel (DDP) model. 
***\n        \"\"\"\n        # gather from all gpus\n        batch_size_this = x.shape[0]\n        x_gather = concat_all_gather(x)\n        batch_size_all = x_gather.shape[0]\n\n        num_gpus = batch_size_all // batch_size_this\n\n        # random shuffle index\n        idx_shuffle = paddle.randperm(batch_size_all)\n\n        # broadcast to all gpus\n        if paddle.distributed.get_world_size() > 1:\n            paddle.distributed.broadcast(idx_shuffle, src=0)\n\n        # index for restoring\n        idx_unshuffle = paddle.argsort(idx_shuffle)\n\n        # shuffled index for this gpu\n        gpu_idx = paddle.distributed.get_rank()\n        idx_this = idx_shuffle.reshape([num_gpus, -1])[gpu_idx]\n        return paddle.gather(x_gather, idx_this, axis=0), idx_unshuffle\n\n    @paddle.no_grad()\n    def _batch_unshuffle_ddp(self, x, idx_unshuffle):\n        \"\"\"\n        Undo batch shuffle.\n        *** Only support DistributedDataParallel (DDP) model. ***\n        \"\"\"\n        # gather from all gpus\n        batch_size_this = x.shape[0]\n        x_gather = concat_all_gather(x)\n        batch_size_all = x_gather.shape[0]\n\n        num_gpus = batch_size_all // batch_size_this\n\n        # restored index for this gpu\n        gpu_idx = paddle.distributed.get_rank()\n        idx_this = idx_unshuffle.reshape([num_gpus, -1])[gpu_idx]\n\n        return paddle.gather(x_gather, idx_this, axis=0)\n\n    def forward(self, x1, x2):\n\n        # compute query features\n        q = self.base_encoder(x1)  # queries: NxC\n        q = F.normalize(q, axis=1)\n\n        # compute key features\n        with paddle.no_grad():  # no gradient\n            self._update_momentum_encoder()  # update the momentum encoder\n\n            # shuffle for making use of BN\n            k, idx_unshuffle = self._batch_shuffle_ddp(x2)\n\n            k = self.momentum_encoder(k)  # keys: NxC\n            k = F.normalize(k, axis=1)\n\n            # undo shuffle\n            k = 
self._batch_unshuffle_ddp(k, idx_unshuffle)\n\n        # compute logits\n        # Einstein sum is more intuitive\n        # positive logits: Nx1\n        l_pos = paddle.sum(q * k, axis=1).unsqueeze(-1)\n        # negative logits: NxK\n        l_neg = paddle.matmul(q, self.queue.clone().detach())\n\n        # logits: Nx(1+K)\n        logits = paddle.concat((l_pos, l_neg), axis=1)\n\n        # apply temperature\n        logits /= self.T\n\n        # labels: positive key indicators\n        labels = paddle.zeros([logits.shape[0]], dtype=paddle.int64)\n\n        # dequeue and enqueue\n        self._dequeue_and_enqueue(k)\n\n        return (logits, labels)\n"
  },
  {
    "path": "ppfleetx/models/vision_model/moco_module.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport copy\nimport datetime\nfrom collections import defaultdict\nimport numpy as np\n\nimport paddle\nimport paddle.nn as nn\nfrom ppfleetx.utils.log import logger\n\nfrom ppfleetx.core.module.basic_module import BasicModule\n\nfrom .factory import build\nfrom .moco import MoCo\n\n\nclass MOCOModule(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        self.model_configs = copy.deepcopy(configs.Model)\n        self.model_configs.pop('module')\n\n        # must init before loss function\n        super(MOCOModule, self).__init__(configs)\n\n        assert 'train' in self.model_configs.loss\n        self.loss_fn = build(self.model_configs.loss.train)\n\n        self.train_batch_size = None\n        self.best_metric = 0.0\n\n    def get_model(self):\n        if not hasattr(self, 'model') or self.model is None:\n            config = copy.deepcopy(self.model_configs.model)\n            base_encoder = build(self.model_configs.model.base_encoder)\n            base_projector = build(\n                self.model_configs.model.get('base_projector',\n                                             {\"name\": \"Identity\"}))\n            base_classifier = build(self.model_configs.model.base_classifier)\n            momentum_encoder = 
build(self.model_configs.model.momentum_encoder)\n            momentum_projector = build(\n                self.model_configs.model.get('momentum_projector',\n                                             {\"name\": \"Identity\"}))\n            momentum_classifier = build(\n                self.model_configs.model.momentum_classifier)\n\n            config['base_encoder'] = base_encoder\n            config['base_projector'] = base_projector\n            config['base_classifier'] = base_classifier\n            config['momentum_encoder'] = momentum_encoder\n            config['momentum_projector'] = momentum_projector\n            config['momentum_classifier'] = momentum_classifier\n\n            self.model = MoCo(**config)\n        return self.model\n\n    def forward(self, img_q, img_k):\n        return self.model(img_q, img_k)\n\n    def training_step(self, batch):\n        img_q, img_k = batch\n\n        # Note(GuoxiaWang)paddle.distributed.all_gather required CudaPlace\n        img_q = img_q.cuda()\n        img_k = img_k.cuda()\n\n        if self.train_batch_size is None:\n            self.train_batch_size = img_q.shape[\n                0] * paddle.distributed.get_world_size()\n\n        logits, labels = self(img_q, img_k)\n        loss = self.loss_fn(logits, labels)\n\n        return loss\n\n    def training_step_end(self, log_dict):\n        ips = self.train_batch_size / log_dict['train_cost']\n\n        total_step = log_dict['total_epoch'] * log_dict['total_batch']\n        cur_step = log_dict['epoch'] * log_dict['total_batch'] + log_dict[\n            'batch'] + 1\n        remained_step = total_step - cur_step\n        eta_sec = remained_step * log_dict['train_cost']\n        eta_msg = \"eta: {:s}\".format(\n            str(datetime.timedelta(seconds=int(eta_sec))))\n\n        logger.info(\n            \"[train] epoch: %d, step: [%d/%d], learning rate: %.7f, loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec, %s\"\n            % (log_dict['epoch'], 
log_dict['batch'], log_dict['total_batch'],\n               log_dict['lr'], log_dict['loss'], log_dict['train_cost'], ips,\n               eta_msg))\n\n    def input_spec(self):\n        return [\n            InputSpec(\n                shape=[None, 3, 224, 224], name=\"images\", dtype='float32')\n        ]\n\n    def training_epoch_end(self, log_dict):\n        logger.info(\"[Training] epoch: %d, total time: %.5f sec\" %\n                    (log_dict['epoch'], log_dict['train_cost']))\n\n\nclass MOCOClsModule(BasicModule):\n    def __init__(self, configs):\n        self.nranks = paddle.distributed.get_world_size()\n        self.model_configs = copy.deepcopy(configs.Model)\n        self.model_configs.pop('module')\n\n        # must init before loss function\n        super(MOCOClsModule, self).__init__(configs)\n\n        assert 'train' in self.model_configs.loss\n        self.loss_fn = build(self.model_configs.loss.train)\n        self.eval_loss_fn = None\n        if 'eval' in self.model_configs.loss:\n            self.eval_loss_fn = build(self.model_configs.loss.eval)\n\n        if 'train' in self.model_configs.metric:\n            self.train_metric_fn = build(self.model_configs.metric.train)\n        if 'eval' in self.model_configs.metric:\n            self.eval_metric_fn = build(self.model_configs.metric.eval)\n\n        self.train_batch_size = None\n        self.eval_batch_size = None\n        self.best_metric = 0.0\n        self.acc_list = []\n\n    def _freeze_backbone(self, layer):\n        for param in layer.parameters():\n            param.trainable = False\n\n        def freeze_norm(layer):\n            if isinstance(layer, (nn.layer.norm._BatchNormBase)):\n                layer._use_global_stats = True\n\n        layer.apply(freeze_norm)\n\n    def get_model(self):\n        if not hasattr(self, 'model') or self.model is None:\n            pretrained_path = self.model_configs.model.base_encoder.pop(\n                \"pretrained\")\n            
base_encoder = build(self.model_configs.model.base_encoder)\n            self._freeze_backbone(base_encoder)\n\n            pretrained_path = pretrained_path + \".pdparams\"\n            assert os.path.exists(\n                pretrained_path), f'{pretrained_path} is not exists!'\n            base_encoder_dict = paddle.load(pretrained_path)\n\n            for k in list(base_encoder_dict.keys()):\n                # retain only encoder_q up to before the embedding layer\n                if k.startswith('base_encoder.0.'):\n                    # remove prefix\n                    base_encoder_dict[k[len(\n                        \"base_encoder.0.\"):]] = base_encoder_dict[k]\n                    # delete renamed\n                    del base_encoder_dict[k]\n\n            for name, param in base_encoder.state_dict().items():\n                if name in base_encoder_dict and param.dtype != base_encoder_dict[\n                        name].dtype:\n                    base_encoder_dict[name] = base_encoder_dict[name].cast(\n                        param.dtype)\n\n            base_encoder.set_state_dict(base_encoder_dict)\n            logger.info(f'Load pretrained weight from {pretrained_path}')\n\n            base_classifier = build(self.model_configs.model.base_classifier)\n\n            self.model = nn.Sequential(base_encoder, base_classifier)\n        return self.model\n\n    def forward(self, inputs):\n        return self.model(inputs)\n\n    def training_step(self, batch):\n        inputs, labels = batch\n\n        if self.train_batch_size is None:\n            self.train_batch_size = inputs.shape[\n                0] * paddle.distributed.get_world_size()\n\n        inputs.stop_gradient = True\n        labels.stop_gradient = True\n\n        logits = self(inputs)\n        loss = self.loss_fn(logits, labels)\n\n        return loss\n\n    def training_step_end(self, log_dict):\n        ips = self.train_batch_size / log_dict['train_cost']\n\n        total_step = 
log_dict['total_epoch'] * log_dict['total_batch']\n        cur_step = log_dict['epoch'] * log_dict['total_batch'] + log_dict[\n            'batch'] + 1\n        remained_step = total_step - cur_step\n        eta_sec = remained_step * log_dict['train_cost']\n        eta_msg = \"eta: {:s}\".format(\n            str(datetime.timedelta(seconds=int(eta_sec))))\n\n        logger.info(\n            \"[train] epoch: %d, step: [%d/%d], learning rate: %.7f, loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec, %s\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'],\n               log_dict['lr'], log_dict['loss'], log_dict['train_cost'], ips,\n               eta_msg))\n\n    def validation_step(self, batch):\n        inputs, labels = batch\n\n        batch_size = inputs.shape[0]\n\n        inputs.stop_gradient = True\n        labels.stop_gradient = True\n\n        logits = self(inputs)\n        loss = self.eval_loss_fn(logits, labels)\n\n        if paddle.distributed.get_world_size() > 1:\n            label_list = []\n            paddle.distributed.all_gather(label_list, labels)\n            labels = paddle.concat(label_list, 0)\n\n            pred_list = []\n            paddle.distributed.all_gather(pred_list, logits)\n            logits = paddle.concat(pred_list, 0)\n\n        if self.eval_batch_size is None:\n            self.eval_batch_size = logits.shape[0]\n\n        acc = self.eval_metric_fn(logits, labels)\n        self.acc_list.append(acc)\n        return loss\n\n    def validation_step_end(self, log_dict):\n        ips = self.eval_batch_size / log_dict['eval_cost']\n        speed = self.configs['Engine']['logging_freq'] / log_dict['eval_cost']\n        logger.info(\n            \"[eval] epoch: %d, step: [%d/%d], loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec\"\n            % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'],\n               log_dict['loss'], log_dict['eval_cost'], ips))\n\n    def 
input_spec(self):\n        return [\n            InputSpec(\n                shape=[None, 3, 224, 224], name=\"images\", dtype='float32')\n        ]\n\n    def training_epoch_end(self, log_dict):\n        logger.info(\"[Training] epoch: %d, total time: %.5f sec\" %\n                    (log_dict['epoch'], log_dict['train_cost']))\n\n    def validation_epoch_end(self, log_dict):\n        msg = ''\n        if len(self.acc_list) > 0:\n            ret = defaultdict(list)\n\n            for item in self.acc_list:\n                for key, val in item.items():\n                    ret[key].append(val)\n\n            for k, v in ret.items():\n                ret[k] = np.mean(v)\n\n            if 'metric' in ret and ret['metric'] > self.best_metric:\n                self.best_metric = ret['metric']\n\n            if 'metric' in ret:\n                ret['best_metric'] = self.best_metric\n\n            msg = ', '\n            msg += \", \".join([f'{k} = {v:.6f}' for k, v in ret.items()])\n            self.acc_list.clear()\n\n        logger.info(\"[Eval] epoch: %d, total time: %.5f sec%s\" %\n                    (log_dict['epoch'], log_dict['eval_cost'], msg))\n"
  },
  {
    "path": "ppfleetx/models/vision_model/resnet/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom paddle.vision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152\n\n__all__ = [\n    'resnet18',\n    'resnet34',\n    'resnet50',\n    'resnet101',\n    'resnet152',\n]\n"
  },
  {
    "path": "ppfleetx/models/vision_model/vit/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .vit import *\n"
  },
  {
    "path": "ppfleetx/models/vision_model/vit/vit.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections.abc import Callable\n\nimport os\nimport numpy as np\n\nimport paddle\nimport paddle.nn as nn\nimport paddle.nn.functional as F\nfrom paddle.incubate.nn import FusedMultiHeadAttention, FusedFeedForward\n\nfrom ppfleetx.utils.log import logger\nfrom ..layers.droppath import DropPath\nfrom ..layers.identity import Identity\nfrom ..layers.attention import ViTAttention\nfrom ..layers.embedding import ViTPatchEmbed\nfrom ..layers.mlp import ViTMLP\nfrom ..layers.initializer import (xavier_uniform_, xavier_uniform_2d_,\n                                  mlp_bias_normal_, zeros_, minus_tens_,\n                                  pos_normal_, ones_)\n\n__all__ = [\n    'ViT_tiny_patch16_224',\n    'ViT_base_patch16_224',\n    'ViT_base_patch16_384',\n    'ViT_base_patch32_224',\n    'ViT_base_patch32_384',\n    'ViT_large_patch16_224',\n    'ViT_large_patch16_384',\n    'ViT_large_patch32_224',\n    'ViT_large_patch32_384',\n    'ViT_huge_patch14_224',\n    'ViT_huge_patch14_384',\n    'ViT_g_patch14_224',\n    'ViT_G_patch14_224',\n    'ViT_6B_patch14_224',\n    'ViT',\n]\n\n\nclass FusedBlock(nn.Layer):\n    def __init__(self,\n                 dim,\n                 num_heads,\n                 mlp_ratio=4.,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 drop=0.,\n                 
attn_drop=0.,\n                 drop_path=0.,\n                 act_layer=nn.GELU,\n                 norm_layer='nn.LayerNorm',\n                 epsilon=1e-5):\n        super().__init__()\n\n        assert qk_scale is None, \"Fused attention doesn't support qk_scale.\"\n        if isinstance(drop_path, (float, int)):\n            assert drop_path == 0.0, \"Fused attention doesn't support drop_path.\"\n        elif isinstance(drop_path, (tuple, list)):\n            assert drop_path == [0.0] * len(\n                drop_path), \"Fused attention doesn't support drop_path.\"\n        assert norm_layer == \"nn.LayerNorm\", \"Fused attention only support nn.LayerNorm\"\n        assert ((act_layer == nn.GELU) or (act_layer == nn.ReLU)) or \\\n                (isinstance(act_layer, str) and act_layer.lower() == \"gelu\" or act_layer.lower() == \"relu\"), \\\n                \"Fused attention only support GELU and ReLU activation.\"\n\n        self.attn = FusedMultiHeadAttention(\n            dim,\n            num_heads=num_heads,\n            qkv_bias_attr=qkv_bias,\n            dropout_rate=drop,\n            attn_dropout_rate=attn_drop,\n            normalize_before=True,\n            epsilon=epsilon)\n\n        mlp_hidden_dim = int(dim * mlp_ratio)\n        if (act_layer == nn.GELU) or act_layer.lower() == \"gelu\":\n            act_func = \"gelu\"\n        else:\n            act_func = \"relu\"\n        self.mlp = FusedFeedForward(\n            d_model=dim,\n            dim_feedforward=mlp_hidden_dim,\n            dropout_rate=drop,\n            activation=act_func,\n            act_dropout_rate=drop,\n            normalize_before=True)\n\n        xavier_uniform_2d_(self.attn.qkv_weight)\n        xavier_uniform_2d_(self.attn.linear_weight)\n        xavier_uniform_2d_(self.mlp._linear1_weight)\n        xavier_uniform_2d_(self.mlp._linear2_weight)\n\n        zeros_(self.attn.qkv_bias)\n        zeros_(self.attn.linear_bias)\n        
mlp_bias_normal_(self.mlp._linear1_bias)\n        mlp_bias_normal_(self.mlp._linear2_bias)\n\n    def forward(self, x):\n        return self.mlp(self.attn(x))\n\n\nclass Block(nn.Layer):\n    def __init__(self,\n                 dim,\n                 num_heads,\n                 mlp_ratio=4.,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 drop=0.,\n                 attn_drop=0.,\n                 drop_path=0.,\n                 act_layer=nn.GELU,\n                 norm_layer='nn.LayerNorm',\n                 epsilon=1e-5):\n        super().__init__()\n        if isinstance(norm_layer, str):\n            self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)\n        elif isinstance(norm_layer, Callable):\n            self.norm1 = norm_layer(dim)\n        else:\n            raise TypeError(\n                \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n        self.attn = ViTAttention(\n            dim,\n            num_heads=num_heads,\n            qkv_bias=qkv_bias,\n            qk_scale=qk_scale,\n            attn_drop=attn_drop,\n            proj_drop=drop)\n        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here\n        self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity()\n        if isinstance(norm_layer, str):\n            self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)\n        elif isinstance(norm_layer, Callable):\n            self.norm2 = norm_layer(dim)\n        else:\n            raise TypeError(\n                \"The norm_layer must be str or paddle.nn.layer.Layer class\")\n        mlp_hidden_dim = int(dim * mlp_ratio)\n        self.mlp = ViTMLP(\n            in_features=dim,\n            hidden_features=mlp_hidden_dim,\n            act_layer=act_layer,\n            drop=drop)\n\n    def forward(self, x):\n        x = x + self.drop_path(self.attn(self.norm1(x)))\n        x = x + self.drop_path(self.mlp(self.norm2(x)))\n        return x\n\n\nclass ViT(nn.Layer):\n    \"\"\" Vision Transformer with support for patch input\n    \"\"\"\n\n    def __init__(self,\n                 img_size=224,\n                 patch_size=16,\n                 in_chans=3,\n                 class_num=1000,\n                 embed_dim=768,\n                 depth=12,\n                 num_heads=12,\n                 mlp_ratio=4,\n                 qkv_bias=False,\n                 qk_scale=None,\n                 drop_rate=0.,\n                 attn_drop_rate=0.,\n                 drop_path_rate=0.,\n                 norm_layer='nn.LayerNorm',\n                 epsilon=1e-5,\n                 representation_size=None,\n                 use_fused_attn=False,\n                 **kwargs):\n        super().__init__()\n        self.class_num = class_num\n        self.representation_size = representation_size\n        self.num_heads = num_heads\n        self.num_features = self.embed_dim = embed_dim\n\n        self.patch_embed = ViTPatchEmbed(\n            img_size=img_size,\n            patch_size=patch_size,\n            in_chans=in_chans,\n            embed_dim=embed_dim)\n        num_patches = self.patch_embed.num_patches\n\n        self.pos_embed = self.create_parameter(\n            shape=(1, num_patches + 1, embed_dim), 
default_initializer=zeros_)\n        self.cls_token = self.create_parameter(\n            shape=(1, 1, embed_dim), default_initializer=zeros_)\n        self.pos_drop = nn.Dropout(p=drop_rate)\n\n        dpr = np.linspace(0, drop_path_rate, depth)\n\n        self.use_fused_attn = use_fused_attn\n        block_fn = FusedBlock if self.use_fused_attn else Block\n        if self.use_fused_attn:\n            logger.info(\n                \"ViT use fused attention. Fused attention model checkpoint will be\" \\\n                \" saved in normal attention format for inference checkpoint export,\" \\\n                \" and its optimizer checkpoint keeps the same.\")\n        self.blocks = nn.LayerList([\n            block_fn(\n                dim=embed_dim,\n                num_heads=num_heads,\n                mlp_ratio=mlp_ratio,\n                qkv_bias=qkv_bias,\n                qk_scale=qk_scale,\n                drop=drop_rate,\n                attn_drop=attn_drop_rate,\n                drop_path=dpr[i],\n                norm_layer=norm_layer,\n                epsilon=epsilon) for i in range(depth)\n        ])\n\n        self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)\n\n        # Classifier head\n        if self.representation_size is not None:\n            self.head0 = nn.Linear(embed_dim, representation_size)\n            self.tanh = nn.Tanh()\n            self.head = nn.Linear(representation_size,\n                                  class_num) if class_num > 0 else Identity()\n            xavier_uniform_(self.head0.weight)\n            zeros_(self.head0.bias)\n            xavier_uniform_(self.head.weight)\n            minus_tens_(self.head.bias)\n        else:\n            self.head = nn.Linear(embed_dim,\n                                  class_num) if class_num > 0 else Identity()\n            zeros_(self.head.weight)\n            zeros_(self.head.bias)\n\n        pos_normal_(self.pos_embed)\n        zeros_(self.cls_token)\n        
self.apply(self._init_weights)\n\n        pretrained_configs = kwargs.pop('pretrained', None)\n        if pretrained_configs is not None:\n            self.load_pretrained(**pretrained_configs)\n\n    def _init_weights(self, m):\n        if isinstance(m, nn.LayerNorm):\n            zeros_(m.bias)\n            ones_(m.weight)\n\n    def forward_features(self, x):\n        # B = x.shape[0]\n        B = paddle.shape(x)[0]\n        x = self.patch_embed(x)\n        cls_tokens = self.cls_token.expand((B, -1, -1))\n        x = paddle.concat((cls_tokens, x), axis=1)\n        x = x + self.pos_embed\n        x = self.pos_drop(x)\n        for blk in self.blocks:\n            x = blk(x)\n        x = self.norm(x)\n        return x[:, 0]\n\n    def forward(self, x):\n        x = self.forward_features(x)\n        if self.representation_size is not None:\n            x = self.tanh(self.head0(x))\n        x = self.head(x)\n        return x\n\n    # Saved the fused attention checkpoint in origin attention checkpoint format\n    replaced_dict = {\n        # FusedMultiHeadAttention\n        'attn.pre_ln_scale': 'norm1.weight',\n        'attn.pre_ln_bias': 'norm1.bias',\n        'attn.qkv_weight': 'attn.qkv.weight',\n        'attn.qkv_bias': 'attn.qkv.bias',\n        'attn.linear_weight': 'attn.proj.weight',\n        'attn.linear_bias': 'attn.proj.bias',\n        # FusedFeedForward\n        'mlp._ln1_scale': 'norm2.weight',\n        'mlp._ln1_bias': 'norm2.bias',\n        'mlp._linear1_weight': 'mlp.fc1.weight',\n        'mlp._linear1_bias': 'mlp.fc1.bias',\n        'mlp._linear2_weight': 'mlp.fc2.weight',\n        'mlp._linear2_bias': 'mlp.fc2.bias',\n    }\n\n    @paddle.no_grad()\n    def state_dict(self,\n                   destination=None,\n                   include_sublayers=True,\n                   structured_name_prefix=\"\",\n                   use_hook=True):\n        state_dict = super().state_dict(destination, include_sublayers,\n                                        
structured_name_prefix, use_hook)\n        if self.use_fused_attn:\n            new_dict = []\n            poped_keys = []\n            for key, value in state_dict.items():\n                new_key = \"\"\n                for k, v in self.replaced_dict.items():\n                    if k in key:\n                        new_key = key.replace(k, v)\n                        break\n                if new_key != \"\":\n                    value_name = value.name\n                    if 'attn.qkv.weight' in new_key:\n                        value = value.reshape([-1, value.shape[-1]]).transpose(\n                            [1, 0])\n                    if 'attn.qkv.bias' in new_key:\n                        value = value.reshape([-1])\n                    # value is a Tensor after transformation,\n                    # it will be transformed to ParamBase for auto_infer\n                    param = paddle.create_parameter(\n                        shape=value.shape, dtype=value.dtype)\n                    param.set_value(value)\n                    param.name = value_name\n                    new_dict.append({new_key: param})\n                    poped_keys.append(key)\n\n            for i in range(len(new_dict)):\n                state_dict.update(new_dict[i])\n                state_dict.pop(poped_keys[i])\n        return state_dict\n\n    @paddle.no_grad()\n    def set_state_dict(self, state_dict, use_structured_name=True):\n        reversed_replaced_dict = {}\n        for k, v in self.replaced_dict.items():\n            reversed_replaced_dict.update({v: k})\n\n        if self.use_fused_attn:\n            new_dict = []\n            poped_keys = []\n            for key, value in state_dict.items():\n                new_key = \"\"\n                for k, v in reversed_replaced_dict.items():\n                    if k in key:\n                        new_key = key.replace(k, v)\n                        break\n                if new_key != \"\":\n                    if 
'attn.qkv_weight' in new_key:\n                        value = value.transpose([1, 0])\n                        value = value.reshape(\n                            [3, self.num_heads, -1, value.shape[-1]])\n                    if 'attn.qkv_bias' in new_key:\n                        value = value.reshape([3, self.num_heads, -1])\n                    new_dict.append({new_key: value})\n                    poped_keys.append(key)\n\n            for i in range(len(new_dict)):\n                state_dict.update(new_dict[i])\n                state_dict.pop(poped_keys[i])\n        super().set_state_dict(state_dict)\n\n    def load_pretrained(self, prefix_path, finetune=False):\n        if not os.path.exists(prefix_path + '.pdparams'):\n            raise ValueError(\"Model pretrain path {} does not \"\n                             \"exists.\".format(prefix_path))\n\n        state_dict = self.state_dict()\n        param_state_dict = paddle.load(prefix_path + \".pdparams\")\n\n        # for FP16 saving pretrained weight\n        for key, value in param_state_dict.items():\n            param_state_dict[key] = param_state_dict[key].astype(\n                paddle.float32)\n\n        if not finetune:\n            self.set_state_dict(param_state_dict)\n            return\n\n        for k in ['head0.weight', 'head0.bias', 'head.weight', 'head.bias']:\n            if k in param_state_dict:\n                print(f\"Removing key {k} from pretrained checkpoint\")\n                del param_state_dict[k]\n\n        # interpolate position embedding\n        pos_embed_checkpoint = param_state_dict['pos_embed']\n        embedding_size = pos_embed_checkpoint.shape[-1]\n        num_patches = self.patch_embed.num_patches\n        num_extra_tokens = self.pos_embed.shape[-2] - num_patches\n        # height (== width) for the checkpoint position embedding\n        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)**\n                        0.5)\n        # height (== width) for 
the new position embedding\n        new_size = int(num_patches**0.5)\n        # class_token and dist_token are kept unchanged\n        extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]\n        # only the position tokens are interpolated\n        pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]\n        pos_tokens = paddle.transpose(\n            pos_tokens.reshape([-1, orig_size, orig_size, embedding_size]),\n            perm=[0, 3, 1, 2])\n        dtype = pos_tokens.dtype\n        pos_tokens = paddle.nn.functional.interpolate(\n            pos_tokens.astype(paddle.float32),\n            size=(new_size, new_size),\n            mode='bicubic',\n            align_corners=False).astype(dtype)\n        pos_tokens = paddle.transpose(\n            pos_tokens, perm=[0, 2, 3, 1]).flatten(1, 2)\n        new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1)\n        param_state_dict['pos_embed'] = new_pos_embed\n\n        self.set_state_dict(param_state_dict)\n        return\n\n\ndef ViT_tiny_patch16_224(**kwargs):\n    model = ViT(patch_size=16,\n                embed_dim=192,\n                depth=12,\n                num_heads=3,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=192,\n                **kwargs)\n    return model\n\n\ndef ViT_base_patch16_224(**kwargs):\n    model = ViT(patch_size=16,\n                embed_dim=768,\n                depth=12,\n                num_heads=12,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=768,\n                **kwargs)\n    return model\n\n\ndef ViT_base_patch16_384(**kwargs):\n    model = ViT(img_size=384,\n                patch_size=16,\n                embed_dim=768,\n                depth=12,\n                num_heads=12,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                
representation_size=None,\n                **kwargs)\n    return model\n\n\ndef ViT_base_patch32_224(**kwargs):\n    model = ViT(patch_size=32,\n                embed_dim=768,\n                depth=12,\n                num_heads=12,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=768,\n                **kwargs)\n    return model\n\n\ndef ViT_base_patch32_384(**kwargs):\n    model = ViT(img_size=384,\n                patch_size=32,\n                embed_dim=768,\n                depth=12,\n                num_heads=12,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=None,\n                **kwargs)\n    return model\n\n\ndef ViT_large_patch16_224(**kwargs):\n    model = ViT(patch_size=16,\n                embed_dim=1024,\n                depth=24,\n                num_heads=16,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=1024,\n                **kwargs)\n    return model\n\n\ndef ViT_large_patch16_384(**kwargs):\n    model = ViT(img_size=384,\n                patch_size=16,\n                embed_dim=1024,\n                depth=24,\n                num_heads=16,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=None,\n                **kwargs)\n    return model\n\n\ndef ViT_large_patch32_224(**kwargs):\n    model = ViT(patch_size=32,\n                embed_dim=1024,\n                depth=24,\n                num_heads=16,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=1024,\n                **kwargs)\n    return model\n\n\ndef ViT_large_patch32_384(**kwargs):\n    model = ViT(img_size=384,\n                patch_size=32,\n                embed_dim=1024,\n   
             depth=24,\n                num_heads=16,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=None,\n                **kwargs)\n    return model\n\n\ndef ViT_huge_patch14_224(**kwargs):\n    model = ViT(patch_size=14,\n                embed_dim=1280,\n                depth=32,\n                num_heads=16,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=1280,\n                **kwargs)\n    return model\n\n\ndef ViT_huge_patch14_384(**kwargs):\n    model = ViT(img_size=384,\n                patch_size=14,\n                embed_dim=1280,\n                depth=32,\n                num_heads=16,\n                mlp_ratio=4,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=None,\n                **kwargs)\n    return model\n\n\ndef ViT_g_patch14_224(**kwargs):\n    model = ViT(img_size=224,\n                patch_size=14,\n                embed_dim=1408,\n                depth=40,\n                num_heads=16,\n                mlp_ratio=4.364,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=1408,\n                **kwargs)\n    return model\n\n\ndef ViT_G_patch14_224(**kwargs):\n    model = ViT(img_size=224,\n                patch_size=14,\n                embed_dim=1664,\n                depth=48,\n                num_heads=16,\n                mlp_ratio=4.9231,\n                qkv_bias=True,\n                epsilon=1e-6,\n                representation_size=1664,\n                **kwargs)\n    return model\n\n\ndef ViT_6B_patch14_224(**kwargs):\n    model = ViT(img_size=224,\n                patch_size=14,\n                embed_dim=2320,\n                depth=80,\n                num_heads=16,\n                mlp_ratio=4.955,\n                qkv_bias=True,\n                
epsilon=1e-6,\n                representation_size=2320,\n                **kwargs)\n    return model\n"
  },
  {
    "path": "ppfleetx/ops/setup_cuda.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom paddle.utils.cpp_extension import CUDAExtension, setup\n\nsetup(\n    name='ppfleetx_ops',\n    ext_modules=CUDAExtension(sources=['topp_sampling.cu']))\n"
  },
  {
    "path": "ppfleetx/ops/test_topp_sampling.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport numpy as np\nfrom ppfleetx.ops import topp_sampling\n\npaddle.seed(2022)\n\nx = paddle.randn([1, 51200], dtype=\"float16\")\nx = paddle.nn.functional.softmax(x)\ntop_ps = paddle.to_tensor(np.random.uniform(0, 1, [1]).astype(np.float16))\nout = topp_sampling(x, top_ps)\nprint(out)\n"
  },
  {
    "path": "ppfleetx/ops/topp_sampling.cu",
    "content": "// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <curand_kernel.h>\n#include <cuda_fp16.h>\n#include \"cub/cub.cuh\"\n#include \"paddle/extension.h\"\n\n#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x \" must be a GPU Tensor.\")\n\n#define FINAL_MASK 0xFFFFFFFF\n\n#define FIXED_BLOCK_DIM_BASE(dim, ...) \\\n  case (dim): {                        \\\n    constexpr auto kBlockDim = (dim);  \\\n    __VA_ARGS__;                       \\\n  } break\n\n\n#define FIXED_BLOCK_DIM(...)                 
\\\n  FIXED_BLOCK_DIM_BASE(1024, ##__VA_ARGS__); \\\n  FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__);  \\\n  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__);  \\\n  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__);  \\\n  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);   \\\n  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)\n\ntemplate <paddle::DataType D>\nclass PDTraits;\n\ntemplate <>\nclass PDTraits<paddle::DataType::FLOAT32> {\npublic:\n  typedef float DataType;\n  typedef float data_t;\n};\n\ntemplate <>\nclass PDTraits<paddle::DataType::FLOAT16> {\npublic:\n  typedef half DataType;\n  typedef paddle::float16 data_t;\n};\n\nstruct SegmentOffsetIter {\n    explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}\n\n    __host__ __device__ __forceinline__ int operator()(int idx) const {\n        return idx * num_cols_;\n    }\n\n    int num_cols_;\n};\n\ntemplate <typename T>\nstruct Pair {\n  __device__ __forceinline__ Pair() {}\n  __device__ __forceinline__ Pair(T value, int id) : v(value), id(id) {}\n\n  __device__ __forceinline__ void set(T value, int id) {\n    v = value;\n    id = id;\n  }\n\n  __device__ __forceinline__ void operator=(const Pair<T>& in) {\n    v = in.v;\n    id = in.id;\n  }\n\n  __device__ __forceinline__ bool operator<(const T value) const {\n    return ((float)v < (float)value);\n  }\n\n  __device__ __forceinline__ bool operator>(const T value) const {\n    return ((float)v > (float)value);\n  }\n  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {\n    return ((float)v < (float)in.v) || (((float)v == (float)in.v) && (id > in.id));\n  }\n\n  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {\n    return ((float)v > (float)in.v) || (((float)v == (float)in.v) && (id < in.id));\n  }\n\n  T v;\n  int id;\n};\n\ninline int div_up(int a, int n)\n{\n    return (a + n - 1) / n;\n}\n\n__global__ void setup_kernel(curandState_t *state, const uint64_t seed, const int bs) {\n  int idx = blockIdx.x * blockDim.x + threadIdx.x;\n  for (int 
i = idx; i < bs; i += gridDim.x * blockDim.x) {\n    curand_init(seed, 0, i, &state[i]);\n  }\n}\n\ntemplate <typename T>\n__device__ __forceinline__ void AddTo(Pair<T> topk[],\n                                      const Pair<T>& p,\n                                      int beam_size) {\n  for (int k = beam_size - 2; k >= 0; k--) {\n    if (topk[k] < p) {\n    topk[k + 1] = topk[k];\n    } else {\n    topk[k + 1] = p;\n    return;\n    }\n  }\n  topk[0] = p;\n}\n\ntemplate <typename T, int BlockSize>\n__device__ __forceinline__ void GetTopK(Pair<T> topk[],\n                                        const T* src,\n                                        int idx,\n                                        int dim,\n                                        int beam_size) {\n  while (idx < dim) {\n    if (topk[beam_size - 1] < src[idx]) {\n    Pair<T> tmp(src[idx], idx);\n    AddTo<T>(topk, tmp, beam_size);\n    }\n    idx += BlockSize;\n  }\n}\n\ntemplate <typename T, int BlockSize>\n__device__ __forceinline__ void GetTopK(Pair<T> topk[],\n                                        const T* src,\n                                        int idx,\n                                        int dim,\n                                        const Pair<T>& max,\n                                        int beam_size) {\n  while (idx < dim) {\n    if (topk[beam_size - 1] < src[idx]) {\n        Pair<T> tmp(src[idx], idx);\n        if (tmp < max) {\n            AddTo<T>(topk, tmp, beam_size);\n        }\n    }\n    idx += BlockSize;\n  }\n}\n\ntemplate <typename T, int MaxLength, int BlockSize>\n__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[],\n                                              int* beam,\n                                              int beam_size,\n                                              const T* src,\n                                              bool* firstStep,\n                                              bool* is_empty,\n                           
                   Pair<T>* max,\n                                              int dim,\n                                              const int tid) {\n  if (*beam > 0) {\n    int length = (*beam) < beam_size ? *beam : beam_size;\n    if (*firstStep) {\n      *firstStep = false;\n      GetTopK<T, BlockSize>(topk, src, tid, dim, length);\n    } else {\n      for (int k = 0; k < MaxLength; k++) {\n        if (k < MaxLength - (*beam)) {\n          topk[k] = topk[k + *beam];\n        } else {\n            topk[k].set(std::numeric_limits<T>::min(), -1);\n        }\n      }\n      if (!(*is_empty)) {\n        GetTopK<T, BlockSize>(\n            topk + MaxLength - *beam, src, tid, dim, *max, length);\n      }\n    }\n\n    *max = topk[MaxLength - 1];\n    if ((*max).id == -1) *is_empty = true;\n    *beam = 0;\n  }\n}\n\ntemplate <typename T>\n__forceinline__ __device__ Pair<T> WarpReduce(Pair<T> input) {\n#pragma unroll\n    for (int offset = 16; offset > 0; offset >>= 1) {\n        T tmp_val = __shfl_down_sync(FINAL_MASK, input.v, static_cast<unsigned>(offset), 32);\n        int tmp_id = __shfl_down_sync(FINAL_MASK, input.id, static_cast<unsigned>(offset), 32);\n        if ((float)input.v < (float)tmp_val) {\n            input.v = tmp_val;\n            input.id = tmp_id;\n        }\n    }\n    return input;\n}\n\ntemplate <typename T, int MaxLength, int BlockSize>\n__device__ __forceinline__ void BlockReduce(Pair<T> shared_max[],\n                                            Pair<T> topk[],\n                                            Pair<T> beam_max[],\n                                            int* beam,\n                                            int* k,\n                                            int *count,\n                                            const int tid,\n                                            const int wid,\n                                            const int lane) {\n  while (true) {\n    __syncthreads();\n    Pair<T> input_now = 
topk[0];\n    input_now = WarpReduce(input_now);\n\n    if (lane == 0) {\n      shared_max[wid] = input_now;\n    }\n    __syncthreads();\n    input_now = (tid < BlockSize / 32)\n                    ? shared_max[lane]\n                    : Pair<T>(std::numeric_limits<T>::min(), -1);\n    if (wid == 0) {\n      input_now = WarpReduce(input_now);\n      if (lane == 0) shared_max[0] = input_now;\n    }\n    __syncthreads();\n    if (tid == 0) {\n      beam_max[*count] = shared_max[0]; \n      (*count)++;\n    }\n    int tid_max = shared_max[0].id % BlockSize;\n    if (tid == tid_max) {\n      (*beam)++;\n    }\n    if (--(*k) == 0) break;\n    __syncthreads();\n\n    if (tid == tid_max) {\n        if (*beam < MaxLength) {\n            topk[0] = topk[*beam];\n        }\n    }\n\n    if (MaxLength < 5) {\n      if (*beam >= MaxLength) break;\n    } else {\n      unsigned mask = 0u;\n      mask = __ballot_sync(FINAL_MASK, true);\n      if (tid_max / 32 == wid) {\n        if (__shfl_down_sync(FINAL_MASK, *beam, tid_max % 32, 32) ==\n            MaxLength)\n          break;\n      }\n    }\n  }\n}\n\ntemplate <typename T, int MaxLength, int TopPBeamTopK, int BlockSize>\n__global__ void KeMatrixTopPBeamTopK(const T* src,\n                                     T *top_ps,\n                                     int64_t *out_id, // topk id\n                                     T *out_val, // topk val\n                                     int vocab_size,\n                                     curandState_t *state,\n                                     int *count_iter,\n                                     int *count_iter_begin) {\n    const int tid = threadIdx.x;\n    const int wid = tid / 32;\n    const int lane = tid % 32;\n    const int bid = blockIdx.x;\n\n    int top_num = TopPBeamTopK;\n    float top_p_num = (float)top_ps[bid];\n\n    __shared__ Pair<T> shared_max[BlockSize / 32];\n    __shared__ Pair<T> beam_max[TopPBeamTopK];\n\n    Pair<T> topk[MaxLength];\n    int beam = 
MaxLength;\n    Pair<T> max;\n    bool is_empty = false;\n    bool firststep = true;\n    __shared__ int count;\n\n    if (tid == 0) {\n        count = 0;\n    }\n\n    for (int j = 0; j < MaxLength; j++) {\n        topk[j].set(std::numeric_limits<T>::min(), -1);\n    }\n\n    while (top_num) {\n        ThreadGetTopK<T, MaxLength, BlockSize>(topk,\n                                               &beam,\n                                               TopPBeamTopK,\n                                               src + bid * vocab_size,\n                                               &firststep,\n                                               &is_empty,\n                                               &max,\n                                               vocab_size,\n                                               tid);\n        BlockReduce<T, MaxLength, BlockSize>(shared_max,\n                                             topk,\n                                             beam_max,\n                                             &beam,\n                                             &top_num,\n                                             &count,\n                                             tid,\n                                             wid,\n                                             lane);\n    }\n    if (tid == 0) {\n        count_iter_begin[bid] = count_iter[bid];\n        float rand_top_p = curand_uniform(state + bid) * top_p_num;\n        top_ps[bid] = (T)rand_top_p;\n        float sum_prob = 0.0f;\n#pragma unroll\n        for(int i = 0; i < TopPBeamTopK; i++) {\n            sum_prob += (float)(beam_max[i].v);\n            if(sum_prob >= rand_top_p) {\n                count_iter_begin[bid] += 1;\n                out_id[bid] = (int64_t)beam_max[i].id;\n                out_val[bid] = beam_max[i].v;\n                break;\n            }\n        }\n    }\n}\n\n__global__ void SetCountIter(int *count_iter, int num) {\n    int tid = threadIdx.x;\n    int bid = 
blockIdx.x;\n    int idx = bid * blockDim.x + tid;\n    for (int i = idx; i < num; i += gridDim.x * blockDim.x) {\n        count_iter[i] = i;\n    }\n}\n\ntemplate <typename T>\n__global__ void FillIndex(T* indices, T num_rows, T num_cols) {\n  int col_id = threadIdx.x;\n  int row_id = blockIdx.x;\n\n  for (T j = row_id; j < num_rows; j += gridDim.x) {\n    for (T i = col_id; i < num_cols; i += blockDim.x) {\n      indices[j * num_cols + i] = i;\n    }\n  }\n}\n\nstruct BlockPrefixCallbackOp {\n    // Running prefix\n    float running_total;\n    // Constructor\n    __device__ BlockPrefixCallbackOp(float running_total): running_total(running_total) {}\n    // Callback operator to be entered by the first warp of threads in the block.\n    // Thread-0 is responsible for returning a value for seeding the block-wide scan.\n    __device__ float operator()(float block_aggregate)\n    {\n        float old_prefix = running_total;\n        running_total += block_aggregate;\n        return old_prefix;\n    }\n};\n\ntemplate <typename T, int BLOCK_SIZE>\n__global__ void topp_sampling(T *sorted_probs,\n                              int64_t *sorted_id,\n                              T *out_val,\n                              int64_t *out_id,\n                              const T *top_ps,\n                              int p_num,\n                              int vocab_size,\n                              int *count_iter,\n                              int *count_iter_begin) {\n    __shared__ int stop_shared;\n    __shared__ float rand_p;\n    const int tid = threadIdx.x;\n    const int bid = blockIdx.x;\n    constexpr int WARP_SIZE = 32;\n    constexpr int NUM_WARPS = BLOCK_SIZE / WARP_SIZE;\n    const int lane_id = tid % WARP_SIZE;\n    const int warp_id = tid / WARP_SIZE;\n    const float p_t = (float)top_ps[bid];\n    if (tid == 0) {\n        stop_shared = 0;\n        rand_p = p_t;\n    }\n    if (count_iter_begin[bid] == count_iter[bid + 1]) {\n        // topk\n        
return;\n    }\n\n    typedef cub::BlockScan<float, BLOCK_SIZE>  BlockScan;\n    __shared__ typename BlockScan::TempStorage temp_storage;\n    __shared__ uint32_t selected_shared[NUM_WARPS];\n\n    // Initialize running total\n    BlockPrefixCallbackOp prefix_op(0);\n\n    if (lane_id == 0) {\n        selected_shared[warp_id] = 0;\n    }\n    __syncthreads();\n\n    int offset = bid * vocab_size;\n    int end = ((vocab_size + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;\n    int i_activate = 0;\n    float thread_offset = 0;\n    for (int i = tid; i < end; i += BLOCK_SIZE) {\n        float thread_count = (i < vocab_size) ? (float)sorted_probs[offset + i] : 0.f;\n        BlockScan(temp_storage).InclusiveSum(thread_count, thread_offset, prefix_op);\n    \n        uint32_t activate_mask = __ballot_sync(FINAL_MASK, rand_p <= thread_offset);\n        \n        i_activate = i;\n        if (activate_mask != 0) {\n            if (lane_id == 0) {\n                atomicAdd(&stop_shared, 1);\n                selected_shared[warp_id] = activate_mask;\n            }\n        }\n        __syncthreads();\n        if(stop_shared > 0) {\n            break;\n        }\n    }\n\n    bool skip = (selected_shared[warp_id] > 0) ? 
false : true;\n    for (int i=0; i < warp_id; i++) {\n        if(selected_shared[i] != 0) {\n            skip = true;\n        }\n    }\n    if (!skip) {\n        int active_lane_id = WARP_SIZE - __popc(selected_shared[warp_id]); // first not 0\n        if (lane_id == active_lane_id) {\n            // printf(\"active_lane_id: %d, i_activate: %d.\\n\", active_lane_id, i_activate);\n            // for (int i=0; i < active_lane_id; i++) {\n            //   printf(\"p %d, value: %f\\n\", i, (float)(sorted_probs[offset + i]));\n            // }\n            out_id[bid] = sorted_id[offset + i_activate];\n            out_val[bid] = sorted_probs[offset + i_activate];\n        }\n    }\n}\n\nint GetBlockSize(int vocab_size) {\n    if (vocab_size > 512) {\n        return 1024;\n    } else if (vocab_size > 256) {\n        return 512;\n    } else if (vocab_size > 128) {\n        return 256;\n    } else if (vocab_size > 64) {\n        return 128;\n    } else {\n        return 64;\n    }\n}\n\ntemplate <typename T>\n__global__ void print_kernel(T *input, int size) {\n  printf(\"[\");\n  for (int i=0; i < size; i++) {\n    if (i != size-1) {\n      printf(\"%f, \", (float)input[i]);\n    } else {\n      printf(\"%f]\\n\", (float)input[i]);\n    }\n  }\n}\n\ntemplate <paddle::DataType D>\nstd::vector<paddle::Tensor> top_p_sampling_kernel(const paddle::Tensor& x, const paddle::Tensor& top_ps, int random_seed) {\n    typedef PDTraits<D> traits_;\n    typedef typename traits_::DataType DataType_;\n    typedef typename traits_::data_t data_t;\n    std::vector<int64_t> shape = x.shape();\n    auto cu_stream = x.stream();\n\n    int bs = shape[0];\n    int p_num = top_ps.numel();\n    PD_CHECK(bs == p_num, \"PD_CHECK returns \", false, \", expected bs == p_num.\");\n    int vocab_size = shape[1];\n    auto topp_ids = paddle::full({bs, 1}, 1, paddle::DataType::INT64, x.place());\n    auto topp_probs = paddle::full({bs, 1}, 1, x.dtype(), x.place());\n    auto inds_input = 
paddle::full({bs, vocab_size}, 1, paddle::DataType::INT64, x.place());\n    auto sorted_out = paddle::full({bs, vocab_size}, 1, x.dtype(), x.place());\n    auto sorted_id = paddle::full({bs, vocab_size}, 1, paddle::DataType::INT64, x.place());\n    \n\n    int BlockSize = GetBlockSize(vocab_size);\n    switch (BlockSize) {\n        FIXED_BLOCK_DIM(FillIndex<int64_t><<<bs, kBlockDim, 0, cu_stream>>>(inds_input.data<int64_t>(), bs, vocab_size));\n        default:\n            PD_THROW(\"the input data shape has error in the FillIndex kernel.\");\n    }\n\n    \n    static int count = 0;\n    static curandState_t* dev_curand_states;\n    if (count == 0) {\n#if CUDA_VERSION >= 11020\n      cudaMallocAsync(&dev_curand_states, bs * sizeof(curandState_t), cu_stream);\n#else\n      cudaMalloc(&dev_curand_states, bs * sizeof(curandState_t));\n#endif\n    }\n    srand((unsigned int)(time(NULL)));\n    setup_kernel<<<1, 256, 0, cu_stream>>>(dev_curand_states, rand() % random_seed, bs);\n    PD_CHECK(bs == p_num, \"PD_CHECK returns \", false, \", expected bs == p_num.\");\n\n    auto count_iter = paddle::empty({bs + 1}, paddle::DataType::INT32, x.place());\n    auto count_iter_begin = paddle::empty({bs}, paddle::DataType::INT32, x.place());\n    SetCountIter<<<1, 256, 0, cu_stream>>>(count_iter.data<int>(), bs + 1);\n\n    constexpr int TopKMaxLength = 1;\n    constexpr int TopPBeamTopK = 1;\n    switch (BlockSize) {\n        FIXED_BLOCK_DIM(\n            KeMatrixTopPBeamTopK<DataType_, TopKMaxLength, TopPBeamTopK, kBlockDim><<<bs, kBlockDim, 0, cu_stream>>>(\n                reinterpret_cast<DataType_*>(const_cast<data_t*>(x.data<data_t>())),\n                reinterpret_cast<DataType_*>(const_cast<data_t*>(top_ps.data<data_t>())),\n                topp_ids.data<int64_t>(),\n                reinterpret_cast<DataType_*>(topp_probs.data<data_t>()),\n                vocab_size,\n                dev_curand_states,\n                count_iter.data<int>(),\n                
count_iter_begin.data<int>()));\n        default:\n            PD_THROW(\"the input data shape has error in the topp_beam_topk kernel.\");\n    }\n//     if (count % random_seed == random_seed - 1) {\n// #if CUDA_VERSION >= 11020\n//       cudaFreeAsync(dev_curand_states, cu_stream);\n// #else\n//       cudaFree(dev_curand_states);\n// #endif\n//     }\n    count++;\n\n    size_t temp_storage_bytes = 0;\n\n    cub::TransformInputIterator<int, SegmentOffsetIter, int*>\n        segment_offsets_t_begin(count_iter_begin.data<int>(),\n                                SegmentOffsetIter(vocab_size));\n\n    cub::TransformInputIterator<int, SegmentOffsetIter, int*>\n        segment_offsets_t_end(count_iter.data<int>(),\n                              SegmentOffsetIter(vocab_size));\n    \n    DataType_ *x_ptr = reinterpret_cast<DataType_*>(const_cast<data_t*>(x.data<data_t>()));\n    DataType_ *sorted_out_ptr = reinterpret_cast<DataType_*>(const_cast<data_t*>(sorted_out.data<data_t>()));\n    int64_t *in_id_ptr = inds_input.data<int64_t>();\n    int64_t *out_id_ptr = sorted_id.data<int64_t>();\n\n    cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr,\n                                                       temp_storage_bytes,\n                                                       x_ptr,\n                                                       sorted_out_ptr,\n                                                       in_id_ptr,\n                                                       out_id_ptr,\n                                                       vocab_size * bs,\n                                                       bs,\n                                                       segment_offsets_t_begin,\n                                                       segment_offsets_t_end + 1,\n                                                       0,\n                                                       sizeof(data_t) * 8,\n                                                     
  cu_stream);\n\n    temp_storage_bytes = div_up(temp_storage_bytes, 256) * 256;\n    int64_t temp_size = temp_storage_bytes;\n    auto temp_storage = paddle::empty({temp_size}, paddle::DataType::UINT8, x.place());\n\n    cub::DeviceSegmentedRadixSort::SortPairsDescending(\n        temp_storage.data<uint8_t>(),\n        temp_storage_bytes,\n        x_ptr,\n        sorted_out_ptr,\n        in_id_ptr,\n        out_id_ptr,\n        vocab_size * bs,\n        bs,\n        segment_offsets_t_begin,\n        segment_offsets_t_end + 1,\n        0,\n        sizeof(data_t) * 8,\n        cu_stream);\n\n    switch (BlockSize) {\n      FIXED_BLOCK_DIM(\n          topp_sampling<DataType_, kBlockDim><<<bs, kBlockDim, 0, cu_stream>>>(\n              sorted_out_ptr,\n              out_id_ptr,\n              reinterpret_cast<DataType_*>(topp_probs.data<data_t>()),\n              topp_ids.data<int64_t>(),\n              reinterpret_cast<DataType_*>(const_cast<data_t*>(top_ps.data<data_t>())),\n              p_num,\n              vocab_size,\n              count_iter.data<int>(),\n              count_iter_begin.data<int>()));\n      default:\n          PD_THROW(\"the input data shape has error in the topp_sampling kernel.\");\n    }\n    return {topp_probs, topp_ids};\n}\n\n\nstd::vector<paddle::Tensor> TopPSampling(const paddle::Tensor& x, const paddle::Tensor& top_ps, int random_seed) {\n    switch (x.type()) {\n        case paddle::DataType::FLOAT16: {\n            return top_p_sampling_kernel<paddle::DataType::FLOAT16>(\n                x,\n                top_ps,\n                random_seed\n            );\n        }\n        case paddle::DataType::FLOAT32: {\n            return top_p_sampling_kernel<paddle::DataType::FLOAT32>(\n                x,\n                top_ps,\n                random_seed\n            );\n        }\n        default: {\n            PD_THROW(\n                \"NOT supported data type. \"\n                \"Only float16 and float32 are supported. 
\");\n            break;\n        }\n    }\n}\n\nstd::vector<std::vector<int64_t>> TopPSamplingInferShape(const std::vector<int64_t>& x_shape,\n                                                         const std::vector<int64_t>& top_ps_shape) {\n    std::vector<int64_t> out_probs_shape = {x_shape[0], 1};                                                          \n    std::vector<int64_t> out_ids_shape = {x_shape[0], 1};\n    return {out_probs_shape, out_ids_shape};\n}\n\nstd::vector<paddle::DataType> TopPSamplingInferDtype(const paddle::DataType& x_dtype,\n                                                     const paddle::DataType& top_ps_dtype) {\n    return {x_dtype, paddle::DataType::INT64};\n}\n\nPD_BUILD_OP(topp_sampling)\n    .Inputs({\"x\", \"top_ps\"})\n    .Outputs({\"topp_probs\", \"topp_ids\"})\n    .Attrs({\"random_seed: int\"})\n    .SetKernelFn(PD_KERNEL(TopPSampling))\n    .SetInferShapeFn(PD_INFER_SHAPE(TopPSamplingInferShape))\n    .SetInferDtypeFn(PD_INFER_DTYPE(TopPSamplingInferDtype));"
  },
  {
    "path": "ppfleetx/optims/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom collections import defaultdict\nimport sys\nimport copy\n\nimport paddle\nfrom paddle.optimizer.lr import LRScheduler\n\nfrom .lr_scheduler import *\nfrom .optimizer import *\nfrom .grad_clip import *\n\nfrom ppfleetx.utils.log import logger\n\n\ndef build_lr_scheduler(lr_config):\n    if 'name' in lr_config:\n        lr_name = lr_config.pop('name')\n        lr = eval(lr_name)(**lr_config)\n        if isinstance(lr, LRScheduler):\n            return lr\n        else:\n            return lr()\n    else:\n        lr = lr_config.learning_rate\n\n    logger.debug(\"build lr ({}) success..\".format(lr))\n    return lr\n\n\ndef build_grad_clip(grad_clip_config):\n    if grad_clip_config is not None:\n        grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm')\n        clip_norm = grad_clip_config.get('clip_norm', 1.0)\n        grad_clip = eval(grad_clip_name)(\n            **grad_clip_config) if clip_norm != 0. 
else None\n        return grad_clip\n    else:\n        return None\n\n\ndef build_optimizer(config, model, lr_scheduler=None):\n    config = copy.deepcopy(config)\n    if lr_scheduler is not None:\n        config.pop('lr')\n\n    multi_precision = config.get('multi_precision', False)\n    if multi_precision:\n        paddle.nn.clip._clip_by_global_norm_using_mp_type(True)\n\n    grad_clip_config = config.pop('grad_clip', None)\n    grad_clip = build_grad_clip(grad_clip_config)\n\n    optim_name = config.pop('name')\n    optim = eval(optim_name)(learning_rate=lr_scheduler,\n                             parameters=model.parameters(),\n                             grad_clip=grad_clip,\n                             **config)\n\n    logger.debug(\"build optimizer ({}) success..\".format(optim))\n    return optim\n"
  },
  {
    "path": "ppfleetx/optims/grad_clip.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nfrom paddle.nn.clip import ClipGradByGlobalNorm\n\nfrom paddle.nn.clip import ClipGradBase, _squared_l2_norm\nfrom paddle.fluid.dygraph import base as imperative_base\nfrom paddle.fluid import core, layers\nfrom paddle.distributed import collective\nimport paddle.distributed.fleet as fleet\n\nfrom ppfleetx.distributed.apis import env\n\n\nclass ClipGradForMOEByGlobalNorm(ClipGradBase):\n    def __init__(self, clip_norm):\n        super(ClipGradForMOEByGlobalNorm, self).__init__()\n        self.clip_norm = float(clip_norm)\n\n        self.moe_group = None\n        self.world_size = paddle.distributed.get_world_size()\n        if self.world_size > 1:\n            hcg = env.get_hcg()\n            self.moe_group = hcg.get_expert_parallel_group()\n\n    def __str__(self):\n        return \"Gradient Clip By GlobalNorm, global_norm=%f\" % (self.clip_norm)\n\n    @staticmethod\n    def get_l2_norm_pow(params_grads, sum_dtype=None):\n        sum_square_list = []\n        sum_square_list_fp16 = []\n        sum_square_list_fp32 = []\n        for p, g in params_grads:\n            if g is None:\n                continue\n            if getattr(p, 'need_clip', True) is False:\n                continue\n            merge_grad = g\n            if g.type == core.VarDesc.VarType.SELECTED_ROWS:\n                merge_grad = 
layers.merge_selected_rows(g)\n                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)\n            sum_square = _squared_l2_norm(merge_grad)\n            if sum_square.dtype == core.VarDesc.VarType.FP16:\n                sum_square_list_fp16.append(sum_square)\n            elif sum_square.dtype == core.VarDesc.VarType.FP32:\n                sum_square_list_fp32.append(sum_square)\n            else:\n                sum_square_list.append(sum_square)\n\n        # all parameters have been filterd out\n        if len(sum_square_list) + len(sum_square_list_fp16) + len(\n                sum_square_list_fp32) == 0:\n            return None, None\n        assert sum_dtype in [\"float64\", \"float32\", None], \\\n            \"sum's type must be float64/ float32 / None\"\n        if sum_dtype != \"float64\":\n            sum_dtype = 'float64' if len(sum_square_list) > 0 else \"float32\"\n\n        global_norm_var = []\n        if len(sum_square_list_fp16) > 0:\n            global_norm_var_fp16 = layers.concat(sum_square_list_fp16)\n            global_norm_var_fp16 = layers.reduce_sum(global_norm_var_fp16)\n            global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))\n        if len(sum_square_list_fp32) > 0:\n            global_norm_var_fp32 = layers.concat(sum_square_list_fp32)\n            global_norm_var_fp32 = layers.reduce_sum(global_norm_var_fp32)\n            if sum_dtype == 'float32':\n                global_norm_var.append(global_norm_var_fp32)\n            else:\n                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))\n        if len(sum_square_list) > 0:\n            global_norm_var_fp64 = layers.concat(sum_square_list)\n            global_norm_var_fp64 = layers.reduce_sum(global_norm_var_fp64)\n            global_norm_var.append(global_norm_var_fp64)\n        global_norm_var = layers.concat(global_norm_var)\n        global_norm_var = layers.reduce_sum(global_norm_var)\n        return global_norm_var, 
sum_dtype\n\n    @imperative_base.no_grad\n    def _dygraph_clip(self, params_grads):\n        normal_params_grads = []\n        moe_params_grads = []\n\n        # separate moe params from normal params\n        if self.moe_group is not None and self.moe_group.nranks > 1:\n            for p, g in params_grads:\n                if \"expert\" in p.name or \"gate\" in p.name:\n                    moe_params_grads.append((p, g))\n                else:\n                    normal_params_grads.append((p, g))\n        else:\n            normal_params_grads = params_grads\n\n        # why to return sum_dtype?\n        # we will call `get_l2_norm_pow` twice and the precisions may be different.\n        # For convenience and simplification, we use sum_dtype directly instead of global_norm_var_normal.dtype\n        global_norm_var_normal, sum_dtype \\\n            = self.get_l2_norm_pow(normal_params_grads)\n        global_norm_var_moe = None\n        if len(moe_params_grads) > 0:\n            global_norm_var_moe, _ \\\n                = self.get_l2_norm_pow(moe_params_grads, sum_dtype)\n            if global_norm_var_moe is not None:\n                collective.all_reduce(\n                    global_norm_var_moe,\n                    op=collective.ReduceOp.SUM,\n                    group=self.moe_group)\n\n        if global_norm_var_normal is None and global_norm_var_moe is None:\n            return params_grads\n        elif global_norm_var_normal is None:\n            global_norm_var = global_norm_var_moe\n        elif global_norm_var_moe is None:\n            global_norm_var = global_norm_var_normal\n        else:\n            if global_norm_var_normal.dtype != global_norm_var_moe.dtype:\n                # compared with normal norm, moe norm is the later one,\n                # so its precision is no lower than normal norm\n                global_norm_var_normal = \\\n                    global_norm_var_normal.astype(global_norm_var_moe.dtype)\n            
global_norm_var = global_norm_var_normal + global_norm_var_moe\n\n        global_norm_var = layers.sqrt(global_norm_var)\n        max_global_norm = layers.fill_constant(\n            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)\n        clip_var = layers.elementwise_div(\n            x=max_global_norm,\n            y=layers.elementwise_max(\n                x=global_norm_var, y=max_global_norm))\n        clip_var_fp16 = paddle.cast(clip_var, paddle.float16)\n\n        for p, g in params_grads:\n            if g is None or getattr(p, 'need_clip', True) is False:\n                continue\n\n            if p.dtype == paddle.float16:\n                g.scale_(clip_var_fp16)\n            else:\n                g.scale_(clip_var)\n\n            p._reset_grad_inplace_version(True)\n\n        return params_grads\n"
  },
  {
    "path": "ppfleetx/optims/lr_scheduler.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport math\nimport numpy\nimport warnings\nfrom paddle import Tensor\nfrom paddle.optimizer import lr\nfrom paddle.optimizer.lr import LRScheduler\n\n__all__ = [\n    'CosineAnnealingWithWarmupDecay',\n    'LinearDecayWithWarmup',\n    'ViTLRScheduler',\n    'MultiStepDecay',\n    'CosineDecay',\n]\n\n\nclass CosineAnnealingWithWarmupDecay(LRScheduler):\n    def __init__(self,\n                 max_lr,\n                 min_lr,\n                 warmup_rate,\n                 decay_steps,\n                 last_epoch=0,\n                 verbose=False,\n                 **kwargs):\n\n        self.decay_steps = decay_steps\n        self.warmup_step = warmup_rate * decay_steps\n        self.max_lr = max_lr\n        self.min_lr = min_lr\n        super(CosineAnnealingWithWarmupDecay, self).__init__(\n            max_lr, last_epoch, verbose)\n\n    def get_lr(self):\n        if self.warmup_step > 0 and self.last_epoch <= self.warmup_step:\n            return float(self.max_lr) * (self.last_epoch) / self.warmup_step\n\n        if self.last_epoch > self.decay_steps:\n            return self.min_lr\n\n        num_step_ = self.last_epoch - self.warmup_step\n        decay_steps_ = self.decay_steps - self.warmup_step\n        decay_ratio = float(num_step_) / float(decay_steps_)\n        coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)\n   
     return self.min_lr + coeff * (self.max_lr - self.min_lr)\n\n    def step(self, epoch=None):\n        if epoch is None:\n            self.last_epoch += 0\n            self.last_lr = self.get_lr()\n        else:\n            self.last_epoch += epoch\n            if hasattr(self, \"_get_closed_form_lr\"):\n                self.last_lr = self._get_closed_form_lr()\n            else:\n                self.last_lr = self.get_lr()\n\n        if self.verbose:\n            print('Epoch {}: {} set learning rate to {}.'.format(\n                self.last_epoch, self.__class__.__name__, self.last_lr))\n\n\nclass LinearDecayWithWarmup(LRScheduler):\n    def __init__(self,\n                 learning_rate,\n                 step_each_epoch,\n                 epochs,\n                 warmup=0,\n                 verbose=False,\n                 last_epoch=-1,\n                 **kwargs):\n        if kwargs.get('total_steps', -1) > 0:\n            self.T_max = total_steps\n        else:\n            self.T_max = epochs * step_each_epoch\n\n        self.warmup_steps = warmup if isinstance(\n            warmup, int) else int(math.floor(warmup * self.T_max))\n        super(LinearDecayWithWarmup, self).__init__(learning_rate, last_epoch,\n                                                    verbose)\n\n    def get_lr(self):\n        if self.last_epoch < self.warmup_steps:\n            return self.base_lr * (float(self.last_epoch) /\n                                   float(max(1, self.warmup_steps)))\n        return self.base_lr * max(0.0, 1.0 - self.last_epoch / self.T_max)\n\n\nclass ViTLRScheduler(LRScheduler):\n    def __init__(self,\n                 learning_rate,\n                 step_each_epoch,\n                 epochs,\n                 decay_type='cosine',\n                 linear_end=1e-5,\n                 warmup_steps=0,\n                 verbose=False,\n                 last_epoch=-1,\n                 **kwargs):\n\n        self.linear_end = linear_end\n        
self.T_max = epochs * step_each_epoch\n        self.warmup_steps = warmup_steps\n\n        if self.warmup_steps >= self.T_max:\n            self.warmup_steps = self.T_max - 1\n\n        self.decay_type = decay_type\n        self.last_epoch = last_epoch\n        super(ViTLRScheduler, self).__init__(learning_rate, last_epoch,\n                                             verbose)\n\n    def get_lr(self):\n\n        progress = (self.last_epoch - self.warmup_steps\n                    ) / float(self.T_max - self.warmup_steps)\n        progress = min(1.0, max(0.0, progress))\n\n        if self.decay_type == 'linear':\n            lr = self.linear_end + (self.base_lr - self.linear_end) * (\n                1.0 - progress)\n        elif self.decay_type == 'cosine':\n            lr = 0.5 * self.base_lr * (1.0 + math.cos(math.pi * progress))\n        if self.warmup_steps:\n            lr = lr * min(1.0, self.last_epoch / self.warmup_steps)\n\n        return lr\n\n\nclass MultiStepDecay(lr.MultiStepDecay):\n    def __init__(self,\n                 learning_rate,\n                 step_each_epoch,\n                 epochs,\n                 milestones,\n                 gamma=0.1,\n                 last_epoch=-1,\n                 verbose=False,\n                 **kwargs):\n        super(MultiStepDecay, self).__init__(\n            learning_rate=learning_rate,\n            milestones=milestones,\n            gamma=gamma,\n            last_epoch=last_epoch,\n            verbose=verbose)\n\n\nclass CosineDecay(lr.LRScheduler):\n    def __init__(self,\n                 learning_rate,\n                 step_each_epoch,\n                 epochs,\n                 update_unit='epoch',\n                 warmups=0,\n                 verbose=False,\n                 last_epoch=-1,\n                 **kwargs):\n\n        self.T_max = epochs if update_unit == 'epoch' else step_each_epoch * epochs\n        self.warmups = warmups if update_unit == 'epoch' else step_each_epoch * 
warmups\n\n        assert self.warmups < self.T_max\n\n        self.last_epoch = last_epoch\n        super(CosineDecay, self).__init__(learning_rate, last_epoch, verbose)\n\n    def get_lr(self):\n\n        progress = (\n            self.last_epoch - self.warmups) / float(self.T_max - self.warmups)\n        progress = min(1.0, max(0.0, progress))\n\n        if self.warmups:\n            lr = lr * min(1.0, self.last_epoch / self.warmups)\n        else:\n            lr = 0.5 * self.base_lr * (1.0 + math.cos(math.pi * progress))\n\n        return lr\n"
  },
  {
    "path": "ppfleetx/optims/optimizer.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nimport paddle\nimport paddle.distributed.fleet as fleet\n\nfrom ppfleetx.utils.tensor_fusion_helper import fused_parameters\nfrom paddle.optimizer import Adam, AdamW, Momentum\nfrom ppfleetx.distributed.apis import env\n\n__all__ = [\n    'Adam',\n    'AdamW',\n    'Momentum',\n    'FusedAdamW',\n]\n\n\nclass FusedAdamW(paddle.optimizer.AdamW):\n    def __init__(self, learning_rate, parameters, grad_clip, **config):\n        tensor_fusion = config.pop(\"tensor_fusion\", False)\n\n        if paddle.distributed.get_world_size() > 1:\n            hcg = env.get_hcg()\n            sharding_size = hcg.get_sharding_parallel_world_size()\n\n        if tensor_fusion:\n            self.decay_fused_tensors, self.all_fused_tensors = fused_parameters(\n                parameters, sharding_size > 1)\n            decay_params = [p.name for p in self.decay_fused_tensors]\n        else:\n            decay_params = [\n                p.name for p in parameters\n                if not any(nd in p.name for nd in [\"bias\", \"norm\", \"b_0\"])\n            ]\n\n        apply_decay_param_fun = lambda x: x in decay_params\n\n        super().__init__(\n            learning_rate=learning_rate,\n            parameters=self.all_fused_tensors if tensor_fusion else parameters,\n            grad_clip=grad_clip,\n            
apply_decay_param_fun=apply_decay_param_fun,\n            **config)\n"
  },
  {
    "path": "ppfleetx/tools/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/tools/multiprocess_tool.py",
    "content": "#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport multiprocessing\nfrom multiprocessing import Process\nimport math\nimport time\nimport os\nimport argparse\nimport warnings\n\"\"\"\nMulti-process batch processing tool\n\nThis tool provides a multi-process batch processing method. \nFor example, multi-process batch download data, multi-process preprocessing data, etc.\n\nThe tool relies on executable shell commands or scripts. Its essence is to use Python's \nmulti-process library to create multiple processes, and call executable commands or \nscripts through the os.system API.\n\nExecutable commands or scripts are passed in via a txt text file, organized by line. 
\nFor example, the following example is download, unzip and delete example.\n\nbatch_cmd.txt\n\nwget http://xxxx.com/0.tar && tar -xf 0.tar && rm 0.tar\nwget http://xxxx.com/1.tar && tar -xf 1.tar && rm 1.tar\n...\nwget http://xxxx.com/99.tar && tar -xf 99.tar && rm 99.tar\n\nHow to run:\n\npython multiprocess_tool.py --num_proc 10 --shell_cmd_list_filename batch_cmd.txt\n\n\"\"\"\n\n\ndef process_fn(cmd_list):\n    for cmd in cmd_list:\n        try:\n            ret = os.system(cmd)\n            if ret != 0:\n                raise Exception(f'execute command: {cmd} failed.')\n        except Exception as e:\n            print(e)\n\n\ndef read_command(shell_cmd_list_filename):\n    shell_cmd_list = []\n    with open(shell_cmd_list_filename, 'r') as f:\n        for cmd in f:\n            cmd = cmd.strip()\n            shell_cmd_list.append(cmd)\n    return shell_cmd_list\n\n\ndef parallel_process(cmd_list, nproc=20):\n    if nproc > multiprocessing.cpu_count():\n        warnings.warn(\n            'The set number of processes exceeds the number of cpu cores, please confirm whether it is reasonable.'\n        )\n    num_cmd = len(cmd_list)\n    num_cmd_part = (num_cmd + nproc - 1) // nproc\n    workers = []\n    for i in range(min(nproc, num_cmd)):\n        start = i * num_cmd_part\n        end = min(start + num_cmd_part, num_cmd)\n        p = Process(target=process_fn, args=(cmd_list[start:end], ))\n        workers.append(p)\n        p.start()\n\n    for p in workers:\n        p.join()\n\n\ndef main(args):\n    start = time.time()\n    shell_cmd_list = read_command(args.shell_cmd_list_filename)\n    parallel_process(shell_cmd_list, args.num_proc)\n    end = time.time()\n    print(\"Cost time: {:.2f}\".format(end - start))\n\n\nif __name__ == \"__main__\":\n    parse = argparse.ArgumentParser(\n        description='multi-process batch processing tool')\n    parse.add_argument('--num_proc', type=int, default=20)\n    parse.add_argument(\n        
'--shell_cmd_list_filename',\n        type=str,\n        help='a txt file contains shell command list to be execute.')\n    args = parse.parse_args()\n    main(args)\n"
  },
  {
    "path": "ppfleetx/utils/__init__.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "ppfleetx/utils/check.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\nimport paddle\nfrom paddle import is_compiled_with_cuda\nfrom .log import logger\nfrom .device import get_device_and_mapping\n\ndef check_version():\n    \"\"\"\n    Log error and exit when the installed version of paddlepaddle is\n    not satisfied.\n    \"\"\"\n    err = \"PaddlePaddle version 1.8.0 or higher is required, \" \\\n          \"or a suitable develop version is satisfied as well. \\n\" \\\n          \"Please make sure the version is good with your code.\"\n    try:\n        pass\n        # paddle.utils.require_version('0.0.0')\n    except Exception:\n        logger.error(err)\n        sys.exit(1)\n\n\ndef check_device(device):\n    \"\"\"\n    Log error and exit when using paddlepaddle cpu version.\n    \"\"\"\n    err = \"You are using paddlepaddle %s version! Please try to \\n\" \\\n          \"1. install paddlepaddle-%s to run model on %s \\nor 2. 
set the config option 'Global.device' to %s.\"\n\n    d, supported_device_map = get_device_and_mapping()\n\n    assert device in supported_device_map, \\\n        f\"the device({device}) to check is not supported by now.Now the paddle only supports: {supported_device_map.keys()}\"\n    err = err % (d, device, device, d)\n    \n    try:\n        assert supported_device_map[device]\n    except AssertionError:\n        logger.error(err)\n        sys.exit(1)\n"
  },
  {
    "path": "ppfleetx/utils/compression_helper.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nimport paddleslim\n\n\ndef get_pruned_params(model):\n    params = []\n    for sublayer in model.sublayers():\n        for param in sublayer.parameters(include_sublayers=False):\n            if isinstance(sublayer,\n                          paddle.nn.layer.common.Linear) or isinstance(\n                              sublayer, paddle.distributed.fleet.layers.mpu.\n                              mp_layers.ColumnParallelLinear) or isinstance(\n                                  sublayer, paddle.distributed.fleet.layers.\n                                  mpu.mp_layers.RowParallelLinear):\n                if len(param.shape) != 2: continue\n\n                # NOTE(minghaoBD):\n                # 1. param.shape[1] == 3 * param.shape[0]： prune fused-qkv's weight and its next weight: out-linear's weight\n                # 2. 
param.shape[1] == 4 * param.shape[0]： prune ffn1's weight and its next weight: ffn2's weight\n                # If your model has a different architecture, like your qkv's weights are not fused or ffn1_weight.shape[1] != 4*ffn1_weight.shape[0], you may need to customize this function to suit your model.\n                if param.shape[1] == 3 * param.shape[0] or param.shape[\n                        1] == 4 * param.shape[0]:\n                    params.append(param.name)\n\n    return params\n\n\ndef prune_model(model, configs, inputs_desc=[]):\n    prune_criterion = configs.criterion\n    ratio = configs.ratio\n    shapes, dtypes = [], []\n    for input_desc in inputs_desc:\n        dtypes.append(input_desc.dtype)\n        new_shape = [10 if item == -1 else item for item in input_desc.shape]\n        shapes.append(new_shape)\n    #TODO(minghaoBD): support ViT and other model architectures in the future\n    num_attention_heads = model.gpt.decoder.layers[0].self_attn.num_heads\n\n    if prune_criterion == 'l1_norm':\n        pruner = paddleslim.L1NormFilterPruner(\n            model,\n            shapes,\n            skip_leaves=False,\n            prune_type='fc',\n            input_dtype=dtypes[0],\n            num_head=num_attention_heads)\n    elif prune_criterion == 'l2_norm':\n        pruner = paddleslim.L2NormFilterPruner(\n            model,\n            shapes,\n            skip_leaves=False,\n            prune_type='fc',\n            input_dtype=dtypes[0],\n            num_head=num_attention_heads)\n    params = get_pruned_params(model)\n    ratios = {}\n    for param in params:\n        ratios[param] = ratio\n    #NOTE(minghaoBD): hidden size in Layernorm must be 768/1024/2048/4096 for best inference performace, and when axis=0, the hidden size in layernorm will be changed accordingly. 
So axis=1 is required.\n    plan = pruner.prune_vars(ratios, [1])\n\n\ndef quant_model(model, configs):\n    quanter = paddleslim.dygraph.quant.QAT(configs)\n    return quanter.quantize(model), quanter\n"
  },
  {
    "path": "ppfleetx/utils/config.py",
    "content": "# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport copy\nimport argparse\nimport yaml\nimport codecs\nimport sys\nimport logging\nfrom .log import logger, advertise\n\nfrom . import check\nimport paddle\nimport paddle.distributed as dist\nimport paddle.distributed.auto_parallel as auto\nfrom paddle.fluid.reader import use_pinned_memory\n\n__all__ = ['get_config', 'print_config']\n\n\ndef process_dist_config(configs):\n    \"\"\"\n    process distributed strategy for hybrid parallel\n    \"\"\"\n    nranks = dist.get_world_size()\n\n    config = configs['Distributed']\n\n    config.setdefault(\"hcg\", \"HybridCommunicateGroup\")\n    mp_degree = config.setdefault(\"mp_degree\", 1)\n    pp_degree = config.setdefault(\"pp_degree\", 1)\n    pp_recompute_interval = config.setdefault(\"pp_recompute_interval\", 1)\n\n    # sharding default\n    sharding_config = config['sharding']\n    sharding_degree = sharding_config.setdefault(\"sharding_degree\", 1)\n    sharding_stage = sharding_config.setdefault('sharding_stage', 2)\n    sharding_offload = sharding_config.setdefault('sharding_offload', False)\n    reduce_overlap = sharding_config.setdefault('reduce_overlap', False)\n    broadcast_overlap = sharding_config.setdefault('broadcast_overlap', False)\n\n    other_degree = mp_degree * pp_degree * sharding_degree\n\n    assert nranks % other_degree == 0, \"unreasonable config 
of dist_strategy.\"\n    dp_degree = config.setdefault(\"dp_degree\", nranks // other_degree)\n    assert nranks % dp_degree == 0, \"unreasonable config of dist_strategy.\"\n    assert nranks == dp_degree * other_degree, \\\n        \"Mismatched config using {} cards with dp_degree[{}],\" \\\n            \"mp_degree[{}], pp_degree[{}] and sharding_degree[{}]\".format(nranks, \\\n                dp_degree, mp_degree, pp_degree, sharding_degree)\n\n    if sharding_config['sharding_degree'] > 1 and reduce_overlap:\n        if sharding_config['sharding_stage'] == 3 or sharding_config[\n                'sharding_offload']:\n            sharding_config['reduce_overlap'] = False\n            logger.warning(\n                \"reduce overlap only valid for sharding stage 2 without offload\"\n            )\n\n    if sharding_config['sharding_degree'] > 1 and broadcast_overlap:\n        if sharding_config['sharding_stage'] == 3 or sharding_config[\n                'sharding_offload']:\n            sharding_config['broadcast_overlap'] = False\n            logger.warning(\n                \"broadcast overlap only valid for sharding stage 2 without offload\"\n            )\n\n    if broadcast_overlap and configs['Engine']['logging_freq'] == 1:\n        logger.warning(\n            \"Set logging_freq to 1 will disable broadcast_overlap. 
\"\n            \"If you want to overlap the broadcast, please increase the logging_freq.\"\n        )\n        sharding_config['broadcast_overlap'] = False\n\n    if sharding_config['sharding_degree'] > 1:\n        if getattr(sharding_config, 'broadcast_overlap', False):\n            logger.warning(\n                \"Enable broadcast overlap for sharding will not use pin memory for dataloader\"\n            )\n            use_pinned_memory(False)\n\n    if 'fuse_sequence_parallel_allreduce' not in config:\n        config['fuse_sequence_parallel_allreduce'] = False\n\n    if 'use_main_grad' in config and config['use_main_grad'] is True:\n        logger.warning(\n            \"If use_main_grad is True, fuse_sequence_parallel_allreduce will be forced to False\"\n        )\n        config['fuse_sequence_parallel_allreduce'] = False\n\n\ndef process_global_configs(config):\n    \"\"\"\n    process global configs for hybrid parallel\n    \"\"\"\n    dp_degree = config['Distributed']['dp_degree']\n    pp_degree = config['Distributed']['pp_degree']\n    sharding_degree = config['Distributed']['sharding']['sharding_degree']\n\n    config['Global']['enable_partial_send_recv'] = True\n    if 'sequence_parallel' in config['Model'] and pp_degree > 1:\n        if config['Model']['sequence_parallel']:\n            config['Global']['enable_partial_send_recv'] = False\n            logger.warning(\n                \"if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, \" \\\n                \"config.Global.enable_partial_send_recv will be set False.\"\n            )\n\n    global_cfg = config['Global']\n\n    # Set environment variable\n    flags = global_cfg.get(\"flags\", {})\n    paddle.set_flags(flags)\n    for k, v in flags.items():\n        logger.info(\"Environment variable {} is set {}.\".format(k, v))\n\n    if global_cfg['global_batch_size'] is None and global_cfg[\n            'local_batch_size'] is None:\n        raise ValueError(\n           
 \"global_batch_size or local_batch_size should be set.\")\n    elif global_cfg['global_batch_size'] is not None and global_cfg[\n            'local_batch_size'] is not None:\n        assert global_cfg['global_batch_size'] // global_cfg['local_batch_size'] == (dp_degree * sharding_degree), \"global_batch_size[{}] should be divided by local_batch_size[{}] \"\\\n            \"when dp_degree is [{}] and sharding_degree is [{}]\".format(global_cfg['global_batch_size'],\n            global_cfg['local_batch_size'], dp_degree, sharding_degree)\n    elif global_cfg['global_batch_size'] is not None and global_cfg[\n            'local_batch_size'] is None:\n        assert global_cfg['global_batch_size'] % (dp_degree * sharding_degree) == 0, \\\n            \"global_batch_size[{}] should be divided by dp_degree[{}] times sharding_degree[{}]\"\\\n            .format(global_cfg['global_batch_size'], dp_degree, sharding_degree)\n        global_cfg['local_batch_size'] = global_cfg['global_batch_size'] // (\n            dp_degree * sharding_degree)\n    else:\n        global_cfg['global_batch_size'] = global_cfg[\n            'local_batch_size'] * dp_degree * sharding_degree\n    assert global_cfg['local_batch_size'] % global_cfg['micro_batch_size'] == 0\n\n\ndef process_engine_config(config):\n    \"\"\"\n    process engine\n    \"\"\"\n    # save_load\n    config.Engine['save_load'] = config.Engine.get('save_load', {})\n    save_load_cfg = config.Engine.save_load\n    save_steps = save_load_cfg.get('save_steps', None)\n    save_epoch = save_load_cfg.get('save_epoch', None)\n    if save_steps is None or save_steps == -1:\n        save_load_cfg[\n            'save_steps'] = sys.maxsize if sys.version > '3' else sys.maxint\n\n    if save_epoch is None or save_epoch == -1:\n        save_load_cfg['save_epoch'] = 1\n\n    save_load_cfg['output_dir'] = save_load_cfg.get('output_dir', './output')\n    save_load_cfg['ckpt_dir'] = save_load_cfg.get('ckpt_dir', None)\n\n    # 
mix_precision\n    config.Engine['mix_precision'] = config.Engine.get('mix_precision', {})\n    amp_cfg = config.Engine.mix_precision\n\n    amp_cfg['enable'] = amp_cfg.get('enable', False)\n    amp_cfg['scale_loss'] = amp_cfg.get('scale_loss', 32768)\n    amp_cfg['custom_black_list'] = amp_cfg.get('custom_black_list', None)\n    amp_cfg['custom_white_list'] = amp_cfg.get('custom_white_list', None)\n\n    # engine\n    config.Engine['max_steps'] = config.Engine.get('max_steps', 500000)\n    config.Engine['eval_freq'] = config.Engine.get('eval_freq', -1)\n    config.Engine['eval_iters'] = config.Engine.get('eval_iters', 0)\n    config.Engine['logging_freq'] = config.Engine.get('logging_freq', 1)\n    config.Engine['num_train_epochs'] = config.Engine.get('num_train_epochs',\n                                                          1)\n    config.Engine['test_iters'] = config.Engine['eval_iters'] * 10 \\\n            if config.Engine.get('test_iters', None) is None else config.Engine['test_iters']\n    config.Engine[\n        'accumulate_steps'] = config.Global.local_batch_size // config.Global.micro_batch_size\n\n\nclass AttrDict(dict):\n    def __getattr__(self, key):\n        return self[key]\n\n    def __setattr__(self, key, value):\n        if key in self.__dict__:\n            self.__dict__[key] = value\n        else:\n            self[key] = value\n\n    def __copy__(self):\n        cls = self.__class__\n        result = cls.__new__(cls)\n        result.__dict__.update(self.__dict__)\n        return result\n\n    def __deepcopy__(self, memo):\n        cls = self.__class__\n        result = cls.__new__(cls)\n        memo[id(self)] = result\n        for k, v in self.__dict__.items():\n            setattr(result, k, copy.deepcopy(v, memo))\n        for k, v in self.items():\n            setattr(result, k, copy.deepcopy(v, memo))\n        return result\n\n    def setdefault(self, k, default=None):\n        if k not in self or self[k] is None:\n            self[k] 
= default\n            return default\n        else:\n            return self[k]\n\n\ndef create_attr_dict(yaml_config):\n    from ast import literal_eval\n    for key, value in yaml_config.items():\n        if type(value) is dict:\n            yaml_config[key] = value = AttrDict(value)\n        if isinstance(value, str):\n            try:\n                value = literal_eval(value)\n            except BaseException:\n                pass\n        if isinstance(value, AttrDict):\n            create_attr_dict(yaml_config[key])\n        else:\n            yaml_config[key] = value\n\n\ndef parse_config(cfg_file):\n    \"\"\"Load a config file into AttrDict\"\"\"\n\n    def _update_dic(dic, base_dic):\n        '''Update config from dic based base_dic\n        '''\n        base_dic = base_dic.copy()\n        dic = dic.copy()\n\n        if dic.get('_inherited_', True) == False:\n            dic.pop('_inherited_')\n            return dic\n\n        for key, val in dic.items():\n            if isinstance(val, dict) and key in base_dic:\n                base_dic[key] = _update_dic(val, base_dic[key])\n            else:\n                base_dic[key] = val\n        dic = base_dic\n        return dic\n\n    def _parse_from_yaml(path):\n        '''Parse a yaml file and build config'''\n\n        with codecs.open(path, 'r', 'utf-8') as file:\n            dic = yaml.load(file, Loader=yaml.FullLoader)\n\n        if '_base_' in dic:\n            cfg_dir = os.path.dirname(path)\n            base_path = dic.pop('_base_')\n            base_path = os.path.join(cfg_dir, base_path)\n            base_dic = _parse_from_yaml(base_path)\n            dic = _update_dic(dic, base_dic)\n        return dic\n\n    yaml_dict = _parse_from_yaml(cfg_file)\n    yaml_config = AttrDict(yaml_dict)\n\n    create_attr_dict(yaml_config)\n    return yaml_config\n\n\ndef print_dict(d, delimiter=0):\n    \"\"\"\n    Recursively visualize a dict and\n    indenting acrrording by the relationship of keys.\n    
\"\"\"\n    placeholder = \"-\" * 60\n    for k, v in sorted(d.items()):\n        if isinstance(v, dict):\n            logger.info(\"{}{} : \".format(delimiter * \" \", k))\n            print_dict(v, delimiter + 4)\n        elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):\n            logger.info(\"{}{} : \".format(delimiter * \" \", k))\n            for value in v:\n                print_dict(value, delimiter + 4)\n        else:\n            logger.info(\"{}{} : {}\".format(delimiter * \" \", k, v))\n        if k.isupper():\n            logger.info(placeholder)\n\n\ndef print_config(config):\n    \"\"\"\n    visualize configs\n    Arguments:\n        config: configs\n    \"\"\"\n    advertise()\n    print_dict(config)\n\n\ndef check_config(config):\n    \"\"\"\n    Check config\n    \"\"\"\n    # global_batch_size = config.get(\"\")\n\n    global_config = config.get('Global')\n    check.check_version()\n    device = global_config.get('device', 'gpu')\n    device = device.lower()\n    if device in ['gpu', 'xpu', 'rocm', 'npu', \"cpu\", 'mlu']:\n        check.check_device(device)\n    else:\n        raise ValueError(\n            f\"device({device}) is not in ['gpu', 'xpu', 'rocm', 'npu', 'cpu', 'mlu'],\\n\"\n            \"Please ensure the config option Global.device is one of these devices\"\n        )\n\n\ndef override(dl, ks, v):\n    \"\"\"\n    Recursively replace dict of list\n    Args:\n        dl(dict or list): dict or list to be replaced\n        ks(list): list of keys\n        v(str): value to be replaced\n    \"\"\"\n\n    def str2num(v):\n        try:\n            return eval(v)\n        except Exception:\n            return v\n\n    assert isinstance(dl, (list, dict)), (\"{} should be a list or a dict\")\n    assert len(ks) > 0, ('lenght of keys should larger than 0')\n    if isinstance(dl, list):\n        k = str2num(ks[0])\n        if len(ks) == 1:\n            assert k < len(dl), ('index({}) out of range({})'.format(k, dl))\n     
       dl[k] = str2num(v)\n        else:\n            override(dl[k], ks[1:], v)\n    else:\n        if len(ks) == 1:\n            # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))\n            if not ks[0] in dl:\n                print('A new field ({}) detected!'.format(ks[0], dl))\n            dl[ks[0]] = str2num(v)\n        else:\n            if ks[0] not in dl.keys():\n                dl[ks[0]] = {}\n                print(\"A new Series field ({}) detected!\".format(ks[0], dl))\n            override(dl[ks[0]], ks[1:], v)\n\n\ndef override_config(config, options=None):\n    \"\"\"\n    Recursively override the config\n    Args:\n        config(dict): dict to be replaced\n        options(list): list of pairs(key0.key1.idx.key2=value)\n            such as: [\n                'topk=2',\n                'VALID.transforms.1.ResizeImage.resize_short=300'\n            ]\n    Returns:\n        config(dict): replaced config\n    \"\"\"\n    if options is not None:\n        for opt in options:\n            assert isinstance(opt, str), (\n                \"option({}) should be a str\".format(opt))\n            assert \"=\" in opt, (\n                \"option({}) should contain a =\"\n                \"to distinguish between key and value\".format(opt))\n            pair = opt.split('=')\n            assert len(pair) == 2, (\"there can be only a = in the option\")\n            key, value = pair\n            keys = key.split('.')\n            override(config, keys, value)\n    return config\n\n\ndef get_config(fname, overrides=None, show=False):\n    \"\"\"\n    Read config from file\n    \"\"\"\n    assert os.path.exists(fname), (\n        'config file({}) is not exist'.format(fname))\n    config = parse_config(fname)\n    override_config(config, overrides)\n\n    process_dist_config(config)\n    process_global_configs(config)\n    process_engine_config(config)\n    create_attr_dict(AttrDict(config))\n\n    if show:\n        print_config(config)\n    
check_config(config)\n    return config\n\n\ndef process_auto_dist_configs(config):\n    \"\"\"\n    process distributed strategy for auto parallel\n    \"\"\"\n    configs = config['Distributed']\n    nranks = dist.get_world_size()\n\n    mp_degree = configs.setdefault(\"mp_degree\", 1)\n    pp_degree = configs.setdefault(\"pp_degree\", 1)\n    sharding_config = configs['sharding']\n    sharding_degree = sharding_config.setdefault(\"sharding_degree\", 1)\n\n    other_degree = mp_degree * pp_degree\n    assert nranks % other_degree == 0, \"Requires nranks should be divided by mp_degree*pp_degree.\"\n\n    dp_degree = configs.setdefault(\"dp_degree\", nranks // other_degree)\n    assert nranks % dp_degree == 0, \"unreasonable config of dist_strategy.\"\n    assert nranks == dp_degree * other_degree, \\\n        \"Mismatched config using {} cards with dp_degree[{}],\" \\\n            \"mp_degree[{}], pp_degree[{}] and sharding_degree[{}]\".format(nranks, \\\n                dp_degree, mp_degree, pp_degree, sharding_degree)\n\n\ndef process_auto_global_configs(config):\n    \"\"\"\n    process global configs for auto parallel\n    \"\"\"\n    dp_degree = config['Distributed']['dp_degree']\n    pp_degree = config['Distributed']['pp_degree']\n    # sharding_degree = config['Distributed']['sharding_degree']\n\n    config['Global']['enable_partial_send_recv'] = True\n    if config.get('Model', None) is not None and 'sequence_parallel' in config[\n            'Model'] and pp_degree > 1:\n        if config['Model']['sequence_parallel']:\n            config['Global']['enable_partial_send_recv'] = False\n            logger.warning(\n                \"if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, \" \\\n                \"config.Global.enable_partial_send_recv will be set False.\"\n            )\n\n    global_cfg = config['Global']\n    if global_cfg['global_batch_size'] is None and global_cfg[\n            'local_batch_size'] is None:\n        
raise ValueError(\n            \"global_batch_size or local_batch_size should be set.\")\n    elif global_cfg['global_batch_size'] is not None and global_cfg[\n            'local_batch_size'] is not None:\n        assert global_cfg['global_batch_size'] // global_cfg['local_batch_size'] == dp_degree, \\\n            \"global_batch_size[{}] should be divided by local_batch_size[{}] when dp_degree is [{}]\"\\\n                .format(global_cfg['global_batch_size'], global_cfg['local_batch_size'], dp_degree)\n    elif global_cfg['global_batch_size'] is not None and global_cfg[\n            'local_batch_size'] is None:\n        assert global_cfg['global_batch_size'] % dp_degree == 0, \\\n            \"global_batch_size[{}] should be divided by dp_degree[{}]\".format(global_cfg['global_batch_size'], dp_degree)\n        global_cfg['local_batch_size'] = global_cfg[\n            'global_batch_size'] // dp_degree\n    else:\n        global_cfg['global_batch_size'] = global_cfg[\n            'local_batch_size'] * dp_degree\n    assert global_cfg['local_batch_size'] % global_cfg['micro_batch_size'] == 0\n\n\ndef process_auto_engine_configs(config):\n    \"\"\"\n    process engine configs for auto parallel\n    \"\"\"\n    if config.Engine.get(\"verbose\", None) is None:\n        config.Engine[\"verbose\"] = 2\n    if config.Engine.get(\"logging_freq\", None) is None:\n        config.Engine[\"logging_freq\"] = 10\n    config.Engine['save_load'] = config.Engine.get('save_load', {})\n    save_load_cfg = config.Engine.save_load\n    save_steps = save_load_cfg.get('save_steps', None)\n    save_epoch = save_load_cfg.get('save_epoch', None)\n    if save_steps is None or save_steps == -1:\n        save_load_cfg[\n            'save_steps'] = sys.maxsize if sys.version > '3' else sys.maxint\n    if save_epoch is None or save_epoch == -1:\n        save_load_cfg['save_epoch'] = 1\n    save_load_cfg['output_dir'] = save_load_cfg.get('output_dir', './output')\n    save_load_cfg['ckpt_dir'] 
= save_load_cfg.get('ckpt_dir', None)\n\n    config.Engine['max_steps'] = config.Engine.get('max_steps', 500000)\n    config.Engine['eval_freq'] = config.Engine.get('eval_freq', -1)\n    config.Engine['eval_iters'] = config.Engine.get('eval_iters', 0)\n    config.Engine['logging_freq'] = config.Engine.get('logging_freq', 1)\n    config.Engine['num_train_epochs'] = config.Engine.get('num_train_epochs',\n                                                          1)\n\n    config.Engine['test_iters'] = config.Engine['eval_iters'] * 10 \\\n            if config.Engine.get('test_iters', None) is None else config.Engine['test_iters']\n\n    config.Engine[\n        'accumulate_steps'] = config.Global.local_batch_size // config.Global.micro_batch_size\n\n\ndef process_auto_strategy(config):\n    \"\"\"\n    process auto strategy for auto parallel\n    \"\"\"\n    strategy = auto.Strategy()\n    strategy.auto_mode = \"semi\"\n    strategy.seed = config['Global']['seed']\n\n    # amp config\n    amp_cfg = config.Engine.get('mix_precision', {})\n    amp = strategy.amp\n    amp.enable = amp_cfg.get('enable', False)\n    amp.dtype = amp_cfg.get('dtype', \"float16\")\n    amp.level = amp_cfg.get('level', \"o2\")\n    amp.init_loss_scaling = amp_cfg.get('scale_loss', 32768)\n    amp.custom_black_list = amp_cfg.get('custom_black_list', [])\n    amp.custom_white_list = amp_cfg.get('custom_white_list', [])\n    amp.use_fp16_guard = amp_cfg.get('use_fp16_guard', False)\n    amp.use_bf16_guard = amp_cfg.get('use_bf16_guard', False)\n\n    # recompute config\n    if config.get('Model', None) is not None:\n        if not config.Model.get('no_recompute_layers', None):\n            config.Model['no_recompute_layers'] = []\n        else:\n            assert isinstance(config.Model['no_recompute_layers'],\n                              list), \"no_recompute_layers should be a list\"\n            for i in config.Model['no_recompute_layers']:\n                assert isinstance(\n               
     i, int\n                ), \"all values in no_recompute_layers should be an integer\"\n            assert min(config.Model['no_recompute_layers']) >= 0, \\\n                \"the min value in no_recompute_layers should >= 0\"\n            assert max(config.Model['no_recompute_layers']) < config.Model['num_layers'], \\\n                \"the max value in no_recompute_layers should < num_layers\"\n            config.Model['no_recompute_layers'] = sorted(\n                list(set(config.Model['no_recompute_layers'])))\n        recompute = strategy.recompute\n        recompute.enable = config.Model.get('use_recompute', False)\n        recompute.no_recompute_segments = config.Model.pop(\n            'no_recompute_layers', [])\n        recompute.enable_tuning = config.get(\n            'Tuning', False) and config.Tuning.get('tuning_recompute', False)\n\n    # sharding config\n    sharding_cfg = config.Distributed.get('sharding', {})\n    sharding = strategy.sharding\n    sharding.enable = sharding_cfg.get('sharding_degree', 1) > 1\n    sharding.degree = sharding_cfg.get('sharding_degree', 1)\n    sharding.stage = sharding_cfg.get('sharding_stage', 1)\n\n    # gradient merge config\n    gradient_merge = strategy.gradient_merge\n    gradient_merge.enable = config.Engine.get('accumulate_steps') > 1\n    gradient_merge.k_steps = config.Engine.get('accumulate_steps', 1)\n\n    # quantization config\n    qat_cfg = config.get('Quantization', {})\n    qat = strategy.qat\n    qat.enable = qat_cfg.get('enable', False)\n    qat.channel_wise_abs_max = qat_cfg.get('channel_wise_abs_max', True)\n    qat.weight_bits = qat_cfg.get('weight_bits', 8)\n    qat.activation_bits = qat_cfg.get('activation_bits', 8)\n    qat.onnx_format = qat_cfg.get('onnx_format', True)\n\n    # tuning config\n    tuning_cfg = config.get('Tuning', {})\n    tuning = strategy.tuning\n    tuning.enable = tuning_cfg.get('enable', False)\n    tuning.profile_start_step = tuning_cfg.get('profile_start_step', 
1)\n    tuning.profile_end_step = tuning_cfg.get('profile_end_step', 1)\n    tuning.run_after_tuning = tuning_cfg.get('run_after_tuning', True)\n    tuning.debug = tuning_cfg.get('debug', True)\n\n    engine_cfg = config['Engine']\n    engine_cfg['strategy'] = strategy\n\n\ndef process_auto_ckpt_dir(config):\n    configs = config[\"Engine\"][\"save_load\"]\n    ckpt_dir = configs.get(\"ckpt_dir\", None)\n    if ckpt_dir is None:\n        return\n\n    assert os.path.isdir(ckpt_dir) == False, \"Wrong setting of ckpt_dir!ckpt_dir can't be a folder,\"\\
        \"but {} is a folder. Your `ckpt_dir` should be `dirname/prefix` like `output/auto`\"\\
            \" if your model path is `output/auto_dist0.pdparams`\".format(ckpt_dir)\n\n    assert os.path.exists(ckpt_dir) == False, \"Wrong setting of ckpt_dir,\"\\
        \"if you want to load weight,you should set ckpt_dir like this!\"\\
        \"for example:\\ngpt_auto_model_save\\n\\t--auto_dist0.pdparams\\n\\t--auto_dist0.pdparams\\n\"\\
        \"\\t--auto_dist0.pdattr\\nyou should set ckpt_dir=\\\"gpt_auto_model_save/auto\\\"\"\n\n    parent_path = os.path.split(ckpt_dir)[0]\n\n    if os.path.exists(parent_path) == False:\n        logging.warning(\"{} path is not existed!we will set ckpt_dir None.\".\n                        format(parent_path))\n        configs[\"ckpt_dir\"] = None\n\n\ndef get_auto_config(fname, overrides=None, show=False):\n    \"\"\"\n    Read config from file for auto parallel\n    \"\"\"\n    assert os.path.exists(fname), (\n        'config file({}) is not exist'.format(fname))\n    config = parse_config(fname)\n    override_config(config, overrides)\n\n    process_auto_dist_configs(config)\n    process_auto_global_configs(config)\n    process_auto_engine_configs(config)\n    process_auto_strategy(config)\n    process_auto_ckpt_dir(config)\n\n    if show:\n        print_config(config)\n    check_config(config)\n    return config\n\n\ndef parse_args():\n    parser = 
argparse.ArgumentParser(\"train script\")\n    parser.add_argument(\n        '-c',\n        '--config',\n        type=str,\n        default='configs/config.yaml',\n        help='config file path')\n    parser.add_argument(\n        '-o',\n        '--override',\n        action='append',\n        default=[],\n        help='config options to be overridden')\n    args = parser.parse_args()\n    return args\n"
  },
  {
    "path": "ppfleetx/utils/device.py",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nfrom .log import logger\n\n\ndef get_device_and_mapping():\n    \"\"\"\n        Return device type and name-bool mapping implifying which type is supported.\n    \"\"\"\n    suppoted_device_map = {\n        \"gpu\": paddle.is_compiled_with_cuda(),\n        \"xpu\": paddle.is_compiled_with_xpu(),\n        \"rocm\": paddle.is_compiled_with_rocm(),\n        \"npu\": paddle.is_compiled_with_custom_device(\"npu\"),\n        \"mlu\": 'mlu' in paddle.device.get_all_custom_device_type(),\n        \"cpu\": True\n    }\n    for d, v in suppoted_device_map.items():\n        if v:\n            return d, suppoted_device_map\n\n\ndef get_device():\n    \"\"\"\n        Return the device with which the paddle is compiled, including 'gpu'(for rocm and gpu), 'npu', 'xpu', 'cpu'.\n    \"\"\"\n    d, _ = get_device_and_mapping()\n    return d\n\n\ndef synchronize():\n    \"\"\"\n    Synchronize device, return True if succeeded, otherwise return False\n    \"\"\"\n    device = paddle.get_device().split(\":\")[0]\n    if device in [\"gpu\", \"rocm\"]:\n        paddle.device.cuda.synchronize()\n        return True\n    elif device == \"xpu\":\n        paddle.device.xpu.synchronize()\n        return True\n    elif device in paddle.device.get_all_custom_device_type():\n        paddle.device.synchronize()\n        return True\n    else:\n   
     logger.warning(\n            \"The synchronization is only supported on cuda and xpu now.\")\n    return False\n"
  },
  {
    "path": "ppfleetx/utils/download.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport time\nimport requests\nimport shutil\nfrom ppfleetx.utils.log import logger\n\nfrom tqdm import tqdm\nimport paddle\n\nDOWNLOAD_RETRY_LIMIT = 3\n\n\ndef is_url(path):\n    \"\"\"\n    Whether path is URL.\n    Args:\n        path (string): URL string or not.\n    \"\"\"\n    return path.startswith('http://') or path.startswith('https://')\n\n\ndef _map_path(url, root_dir):\n    # parse path after download under root_dir\n    fname = os.path.split(url)[-1]\n    fpath = fname\n    return os.path.join(root_dir, fpath)\n\n\ndef cached_path(url_or_path, cache_dir=None):\n    if cache_dir is None:\n        cache_dir = '~/.cache/ppfleetx/'\n\n    cache_dir = os.path.expanduser(cache_dir)\n\n    if not os.path.exists(cache_dir):\n        os.makedirs(cache_dir, exist_ok=True)\n\n    if is_url(url_or_path):\n        path = _map_path(url_or_path, cache_dir)\n        url = url_or_path\n    else:\n        path = url_or_path\n        url = None\n\n    if os.path.exists(path):\n        logger.info(\n            f\"Found {os.path.split(path)[-1]} in cache_dir: {cache_dir}.\")\n        return path\n\n    download(url, path)\n    return path\n\n\ndef _download(url, fullname):\n    \"\"\"\n    Download from url, save to path.\n    url (str): download url\n    path (str): download to given path\n    \"\"\"\n    retry_cnt = 0\n\n    
while not os.path.exists(fullname):\n        if retry_cnt < DOWNLOAD_RETRY_LIMIT:\n            retry_cnt += 1\n        else:\n            raise RuntimeError(\"Download from {} failed. \"\n                               \"Retry limit reached\".format(url))\n\n        logger.info(\"Downloading {}\".format(url))\n\n        try:\n            req = requests.get(url, stream=True)\n        except Exception as e:  # requests.exceptions.ConnectionError\n            logger.info(\"Downloading {} failed {} times with exception {}\".\n                        format(url, retry_cnt + 1, str(e)))\n            time.sleep(1)\n            continue\n\n        if req.status_code != 200:\n            raise RuntimeError(\"Downloading from {} failed with code \"\n                               \"{}!\".format(url, req.status_code))\n\n        # For protecting download interupted, download to\n        # tmp_fullname firstly, move tmp_fullname to fullname\n        # after download finished\n        tmp_fullname = fullname + \"_tmp\"\n        total_size = req.headers.get('content-length')\n        with open(tmp_fullname, 'wb') as f:\n            if total_size:\n                with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:\n                    for chunk in req.iter_content(chunk_size=1024):\n                        f.write(chunk)\n                        pbar.update(1)\n            else:\n                for chunk in req.iter_content(chunk_size=1024):\n                    if chunk:\n                        f.write(chunk)\n        shutil.move(tmp_fullname, fullname)\n\n    return fullname\n\n\ndef download(url, path):\n    local_rank = 0\n    world_size = 1\n    if paddle.fluid.core.is_compiled_with_dist(\n    ) and paddle.distributed.get_world_size() > 1:\n        local_rank = paddle.distributed.ParallelEnv().dev_id\n        world_size = paddle.distributed.get_world_size()\n    if world_size > 1 and local_rank != 0:\n        while not os.path.exists(path):\n            
time.sleep(1)\n    else:\n        _download(url, path)\n"
  },
  {
    "path": "ppfleetx/utils/export.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport paddle\nimport logging\n\nfrom .log import logger\n\n__all__ = ['export_inference_model']\n\n\ndef _prune_input_spec(input_spec, program, targets):\n    # try to prune static program to figure out pruned input spec\n    # so we perform following operations in static mode\n    device = paddle.get_device()\n    paddle.enable_static()\n    paddle.set_device(device)\n    pruned_input_spec = []\n    program = program.clone()\n    program = program._prune(targets=targets)\n    global_block = program.global_block()\n    for spec in input_spec:\n        try:\n            v = global_block.var(spec.name)\n            pruned_input_spec.append(spec)\n        except Exception:\n            pass\n    paddle.disable_static(place=device)\n    return pruned_input_spec\n\n\ndef export_inference_model(\n        model,\n        input_spec,\n        save_dir='./output',\n        save_name='model',\n        export_quant_model=False,\n        quanter=None, ):\n    if not os.path.exists(save_dir):\n        os.makedirs(save_dir)\n\n    static_model = paddle.jit.to_static(model, input_spec)\n    pruned_input_spec = _prune_input_spec(input_spec,\n                                          static_model.forward.main_program,\n                                          static_model.forward.outputs)\n\n    if export_quant_model:\n        
quanter.save_quantized_model(\n            model,\n            os.path.join(save_dir, save_name),\n            input_spec=pruned_input_spec)\n        logger.info(\"export quantized inference model saved in {}\".format(\n            save_dir))\n        return\n\n    paddle.jit.save(\n        static_model,\n        os.path.join(save_dir, save_name),\n        input_spec=pruned_input_spec)\n    logger.info(\"export inference model saved in {}\".format(save_dir))\n"
  },
  {
    "path": "ppfleetx/utils/file.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport csv\nimport zipfile\nimport tarfile\nfrom typing import Iterable, Callable\n\nimport paddle\nfrom ppfleetx.distributed.apis import env\n\n\n@env.work_at_local_rank0\ndef unzip(zip_path, mode=\"r\", out_dir=None, delete=False):\n    with zipfile.ZipFile(zip_path, mode) as zip_ref:\n        zip_ref.extractall(out_dir)\n\n    if delete:\n        os.remove(zip_path)\n\n\n@env.work_at_local_rank0\ndef untar(tar_path, mode=\"r:gz\", out_dir=None, delete=False):\n    try:\n        with tarfile.open(tar_path, 'r:gz') as f:\n            f.extractall(out_dir)\n    finally:\n        if delete:\n            os.remove(tar_path)\n\n\ndef parse_csv(path,\n              skip_lines=0,\n              delimiter=' ',\n              quotechar='|',\n              quoting=csv.QUOTE_NONE,\n              map_funcs=None,\n              filter_funcs=None):\n\n    with open(path, newline='') as csvfile:\n        data = []\n        spamreader = csv.reader(\n            csvfile, delimiter=delimiter, quotechar=quotechar, quoting=quoting)\n        for idx, row in enumerate(spamreader):\n            if idx < skip_lines:\n                continue\n            filter_flag = True\n            if filter_funcs is not None:\n                if isinstance(filter_funcs, Iterable):\n                    for func in filter_funcs:\n                        
filter_flag = func(row)\n                        if filter_flag is False:\n                            break\n                else:\n                    assert isinstance(filter_funcs, Callable)\n                    filter_flag = filter_funcs(row)\n            if filter_flag is False:\n                continue\n\n            if map_funcs is not None:\n                if isinstance(map_funcs, Iterable):\n                    for func in map_funcs:\n                        row = func(row)\n                else:\n                    assert isinstance(map_funcs, Callable)\n                    row = map_funcs(row)\n            data.append(row)\n        return data\n"
  },
  {
    "path": "ppfleetx/utils/log.py",
    "content": "# Copyright (c) 2022  PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport contextlib\nimport copy\nimport functools\nimport logging\nimport os\nimport sys\nimport time\nimport datetime\nimport threading\nfrom typing import List\n\nimport colorlog\nfrom colorama import Fore\n\nimport paddle\n\nloggers = {}\n\nlog_config = {\n    'DEBUG': {\n        'level': 10,\n        'color': 'purple'\n    },\n    'INFO': {\n        'level': 20,\n        'color': 'green'\n    },\n    'TRAIN': {\n        'level': 21,\n        'color': 'cyan'\n    },\n    'EVAL': {\n        'level': 22,\n        'color': 'blue'\n    },\n    'WARNING': {\n        'level': 30,\n        'color': 'yellow'\n    },\n    'ERROR': {\n        'level': 40,\n        'color': 'red'\n    },\n    'CRITICAL': {\n        'level': 50,\n        'color': 'bold_red'\n    }\n}\n\n\nclass Logger(object):\n    '''\n    Deafult logger in PaddleFleetX\n\n    Args:\n        name(str) : Logger name, default is 'PaddleFleetX'\n    '''\n\n    def __init__(self, name: str=None):\n        name = 'PaddleFleetX' if not name else name\n        self.logger = logging.getLogger(name)\n\n        for key, conf in log_config.items():\n            logging.addLevelName(conf['level'], key)\n            self.__dict__[key] = functools.partial(self.__call__,\n                                                   conf['level'])\n            self.__dict__[key.lower()] = 
functools.partial(self.__call__,\n                                                           conf['level'])\n\n        self.format = colorlog.ColoredFormatter(\n            '%(log_color)s[%(asctime)-15s] [%(levelname)s]%(reset)s - %(message)s',\n            log_colors={\n                key: conf['color']\n                for key, conf in log_config.items()\n            })\n\n        self.handler = logging.StreamHandler()\n        self.handler.setFormatter(self.format)\n\n        self.logger.addHandler(self.handler)\n        self.logLevel = 'DEBUG'\n        self.logger.setLevel(logging.DEBUG)\n        self.logger.propagate = False\n        self._is_enable = True\n\n    def disable(self):\n        self._is_enable = False\n\n    def enable(self):\n        self._is_enable = True\n\n    @property\n    def is_enable(self) -> bool:\n        return self._is_enable\n\n    def __call__(self, log_level: str, msg: str):\n        if not self.is_enable:\n            return\n\n        self.logger.log(log_level, msg)\n\n    @contextlib.contextmanager\n    def use_terminator(self, terminator: str):\n        old_terminator = self.handler.terminator\n        self.handler.terminator = terminator\n        yield\n        self.handler.terminator = old_terminator\n\n    @contextlib.contextmanager\n    def processing(self, msg: str, interval: float=0.1):\n        '''\n        Continuously print a progress bar with rotating special effects.\n\n        Args:\n            msg(str): Message to be printed.\n            interval(float): Rotation interval. 
Default to 0.1.\n        '''\n        end = False\n\n        def _printer():\n            index = 0\n            flags = ['\\\\', '|', '/', '-']\n            while not end:\n                flag = flags[index % len(flags)]\n                with self.use_terminator('\\r'):\n                    self.info('{}: {}'.format(msg, flag))\n                time.sleep(interval)\n                index += 1\n\n        t = threading.Thread(target=_printer)\n        t.start()\n        yield\n        end = True\n\n\nlogger = Logger()\n\n\ndef advertise():\n    \"\"\"\n    Show the advertising message like the following:\n    ===========================================================\n    ==        PaddleFleetX is powered by PaddlePaddle !        ==\n    ===========================================================\n    ==                                                       ==\n    ==   For more info please go to the following website.   ==\n    ==                                                       ==\n    ==       https://github.com/PaddlePaddle/PaddleFleetX    ==\n    ===========================================================\n    \"\"\"\n    copyright = \"PaddleFleetX is powered by PaddlePaddle !\"\n    ad = \"For more info please go to the following website.\"\n    website = \"https://github.com/PaddlePaddle/PaddleFleetX\"\n    AD_LEN = 6 + len(max([copyright, ad, website], key=len))\n\n    logger.info(\"\\n{0}\\n{1}\\n{2}\\n{3}\\n{4}\\n{5}\\n{6}\\n{7}\\n\".format(\n        \"=\" * (AD_LEN + 4),\n        \"=={}==\".format(copyright.center(AD_LEN)),\n        \"=\" * (AD_LEN + 4),\n        \"=={}==\".format(' ' * AD_LEN),\n        \"=={}==\".format(ad.center(AD_LEN)),\n        \"=={}==\".format(' ' * AD_LEN),\n        \"=={}==\".format(website.center(AD_LEN)),\n        \"=\" * (AD_LEN + 4), ))\n\nfrom .device import synchronize\ndef get_timestamp():\n    if synchronize():\n        return time.time()\n    else:\n        logger.warning(f\"Device synchronizing failed, which may 
result in incorrect time\")\n    return time.time()\n\ndef convert_timestamp_to_data(timeStamp):\n    return str(datetime.timedelta(seconds=int(timeStamp)))\n"
  },
  {
    "path": "ppfleetx/utils/tensor_fusion_helper.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nfrom paddle.framework import core\nimport numpy as np\nfrom collections import OrderedDict\n\nfrom paddle.distributed.fleet.meta_parallel.sharding.group_sharded_storage import ParamStorage, GradStorage\nfrom paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import Type\n\nalignment = {\"gpu\": 256, }\nalign = {\n    Type.fp16.value: 2,\n    Type.fp32.value: 4,\n}\n\n\ndef assign_group_by_size(parameters, group_size=256 * 1024 * 1024):\n    is_sparse_gradient = [False] * len(parameters)\n\n    group_indices = core.eager_assign_group_by_size(\n        parameters, is_sparse_gradient, [group_size, group_size])\n\n    var_groups = OrderedDict()\n    for group_idx, indices in enumerate(group_indices):\n        for index in indices:\n            var_groups.setdefault(group_idx, []).append(parameters[index])\n    return var_groups\n\n\ndef flatten_dense_tensors(parameters):\n    _buffer_size = 0\n    _param2align = {}\n    dtype = parameters[0].dtype\n\n    for param in parameters:\n        assert param.trainable, \"param must be trainable...\"\n        size = np.prod(param.shape) * align[dtype]\n        remaining = size % alignment[\"gpu\"]\n        ali = 0 if remaining == 0 else alignment[\"gpu\"] - remaining\n        align_ = ali // align[dtype]\n        _buffer_size += np.prod(param.shape) + align_\n     
   _param2align[param.name] = align_\n\n    param_storage = ParamStorage(size=_buffer_size, dtype=dtype, device=\"gpu\")\n\n    param_storage.add_rank_params(parameters, _param2align)\n\n    # process gradient\n    grad_storage = GradStorage(\n        size=_buffer_size,\n        dtype=dtype,\n        device=\"gpu\",\n        destination=\"0\",\n        parm2align=_param2align)\n\n    for param in parameters:\n        grad_storage.add_grad(param, _param2align[param.name])\n\n    # param_storage --> grad_storage\n    param_storage.buffer._copy_gradient_from(grad_storage.buffer)\n    param_storage.buffer.stop_gradient = False\n    return param_storage, grad_storage\n\n\ndef obtain_storage(parameters):\n    if len(parameters) < 1:\n        return []\n\n    var_groups = assign_group_by_size(parameters)\n    storage = []\n    for group_idx, parameters in var_groups.items():\n        param_storage, grad_storage = flatten_dense_tensors(parameters)\n        storage.append(param_storage.buffer)\n    return storage\n\n\ndef fused_parameters(parameters, use_sharding=False):\n    decay_params = []\n    other_params = []\n\n    for param in parameters:\n        if not any(nd in param.name for nd in [\"bias\", \"norm\", \"b_0\"]):\n            decay_params.append(param)\n        else:\n            other_params.append(param)\n\n    decay_fused = decay_params if use_sharding else obtain_storage(\n        decay_params)\n    other_fused = other_params if use_sharding else obtain_storage(\n        other_params)\n    all_fused = decay_fused + other_fused\n\n    return decay_fused, all_fused\n\n\ndef all_reduce_parameters(params, group):\n    if group.nranks < 2:\n        return\n\n    div_factor = 1.0 / group.nranks\n    with paddle.framework.no_grad():\n        for p in params:\n            grad = p.grad.scale_(div_factor)\n            paddle.distributed.all_reduce(grad, group=group)\n"
  },
  {
    "path": "ppfleetx/utils/version.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport paddle\nfrom ppfleetx.utils.log import logger\n\ndef version_check():\n    version = paddle.version.full_version\n    logger.info('run with paddle {}, commit id {}'.format(paddle.__version__, paddle.__git_commit__[:8]))\n    if version != '0.0.0':\n        paddle.utils.require_version(min_version='2.4.0')\n\n"
  },
  {
    "path": "projects/ernie/auto_export_ernie_345M_mp1.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_auto\nrm -rf $log_dir\n\n# 345M mp1 export\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0\" \\\n    ./tools/auto_export.py \\\n    -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \\\n    -o Distributed.mp_degree=1 \\\n"
  },
  {
    "path": "projects/ernie/auto_export_ernie_345M_mp2.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_auto\nrm -rf $log_dir\n\n# 345M mp2 export\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1\" \\\n    ./tools/auto_export.py \\\n    -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \\\n    -o Distributed.mp_degree=2 \\\n"
  },
  {
    "path": "projects/ernie/auto_export_ernie_345M_mp2_npu.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_auto\nrm -rf $log_dir\n\n# 345M mp2 export\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1\" \\\n    ./tools/auto_export.py \\\n    -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \\\n    -o Distributed.mp_degree=2 \\\n    -o Global.device=npu \n"
  },
  {
    "path": "projects/ernie/auto_export_ernie_345M_mp2_xpu.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_auto\nrm -rf $log_dir\n\nFILENAME=./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_base.yaml\nsed -i \"s/device: gpu/device: xpu/g\" $FILENAME\n\nexport BKCL_PCIE_RING=1\n# 345M mp2 export\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1\" \\\n    ./tools/auto_export.py \\\n    -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \\\n    -o Distributed.mp_degree=2 \\\n"
  },
  {
    "path": "projects/ernie/docs/README.md",
    "content": "# ERNIE: Enhanced Representation through kNowledge IntEgration\n\n\n## 1. 模型简介\n\nERNIE是百度开创性提出的基于知识增强的持续学习语义理解框架，它将大数据预训练与多源丰富知识相结合，通过持续学习技术，不断吸收海量文本数据中词汇、结构、语义等方面的知识，实现模型效果不断进化。\n\nERNIE在情感分析、文本匹配、自然语言推理、词法分析、阅读理解、智能问答等16个公开数据集上全面显著超越世界领先技术，在国际权威的通用语言理解评估基准GLUE上，得分首次突破90分，获得全球第一。\n相关创新成果也被国际顶级学术会议AAAI、IJCAI收录。\n同时，ERNIE在工业界得到了大规模应用，如搜索引擎、新闻推荐、广告系统、语音交互、智能客服等。\n\nERNIE 通过建模海量数据中的词、实体及实体关系，学习真实世界的语义知识。相较于 BERT 学习原始语言信号，ERNIE 直接对先验语义知识单元进行建模，增强了模型语义表示能力。\n\n这里我们举个例子：\n```\nLearnt by BERT ：哈 [mask] 滨是 [mask] 龙江的省会，[mask] 际冰 [mask] 文化名城。\nLearnt by ERNIE：[mask] [mask] [mask] 是黑龙江的省会，国际 [mask] [mask] 文化名城。\n```\n在 BERT 模型中，我们通过『哈』与『滨』的局部共现，即可判断出『尔』字，模型没有学习与『哈尔滨』相关的任何知识。而 ERNIE 通过学习词与实体的表达，使模型能够建模出『哈尔滨』与『黑龙江』的关系，学到『哈尔滨』是 『黑龙江』的省会以及『哈尔滨』是个冰雪城市。\n\n\n### 1.1 目录结构\n\n```text\n.\n├── docs\n│   └── inference.md\n│   └── README.md\n├── auto_export_ernie_345M_mp1.sh           # 345M ernie-base模型，自动切分单卡导出\n├── auto_export_ernie_345M_mp2.sh           # 345M ernie-base模型，自动切分多卡导出\n├── auto_export_ernie_345M_mp2_xpu.sh       # 345M ernie-base模型，自动切分多卡导出（XPU）\n├── export_ernie_345M_single_card.sh        # 345M ernie-base模型，单卡导出\n├── finetune_ernie_345M_single_card.sh      # 345M ernie-base模型，单卡finetune训练\n├── inference.py                            # ernie推理代码\n├── pretrain_ernie_base_175B_mp8_pp16.sh    # 175B ernie-base模型，3D混合并行\n├── pretrain_ernie_base_3D.sh               # ci测试\n├── pretrain_ernie_base_6.7B_sharding16.sh  # 6.7B ernie-base模型，sharding16\n├── pretrain_ernie_base.sh                  # 345M ernie-base模型，单卡\n├── pretrain_ernie_large.sh                 # ernie-large模型，单卡\n├── run_inference.sh                        # ernie 推理运行脚本 \n├── run_inference_mp2.sh                    # ernie 多卡推理运行脚本 \n└── run_inference_mp2_xpu.sh                # ernie 多卡推理运行脚本（XPU)\n\n```\n\n\n\n### 1.2 依赖环境\n\n- paddlenlp\n- pybind11\n\n安装命令 `pip install pybind11 paddlenlp`\n\n\n## 2.中文预训练\n\nERNIE预训练采用的是MLM（Mask Language Model）的训练方式，采用WWM（Whole Word 
Mask）方式，对于完整语义单元的Token，会同时进行Mask。整体的训练损失loss是mlm_loss + sop_loss。\n\n\n### 2.1 小规模语料预训练: 14GB - CLUECorpusSmall\n\n<details>\n<summary><b>CLUECorpusSmall 数据准备</b></summary>\n\n#### 数据准备\n数据下载部分请参考[data_tools](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/ernie/preprocess/docs/CLUECorpusSmall.md)目录，根据文档中`CLUECorpusSmall 数据集处理教程`，下载数据。下载好后:\n\n解压文件\n```shell\nunzip comment2019zh_corpus.zip -d  clue_corpus_small_14g/comment2019zh_corpus\nunzip news2016zh_corpus.zip    -d  clue_corpus_small_14g/news2016zh_corpus\nunzip webText2019zh_corpus.zip -d  clue_corpus_small_14g/webText2019zh_corpus\nunzip wiki2019zh_corpus.zip    -d  clue_corpus_small_14g/wiki2019zh_corpus\n```\n将txt文件转换为jsonl格式\n```\npython ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py  --input_path ./clue_corpus_small_14g --output_path clue_corpus_small_14g.jsonl\n```\n现在我们得到了jsonl格式的数据集，下面是针对训练任务的数据集应用，此处以ernie为例。\n```\npython -u  ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \\\n    --model_name ernie-1.0-base-zh \\\n    --tokenizer_name ErnieTokenizer \\\n    --input_path clue_corpus_small_14g.jsonl \\\n    --split_sentences\\\n    --chinese \\\n    --cn_whole_word_segment \\\n    --cn_seg_func jieba \\\n    --output_prefix clue_corpus_small_14g_20220104 \\\n    --workers 48 \\\n    --log_interval 10000\n```\n数据共有文档`15702702`条左右，由于分词比较耗时，大概一小时左右可以完成。在当前目录下产出训练所需数据。\n```\nclue_corpus_small_14g_20220104_ids.npy\nclue_corpus_small_14g_20220104_idx.npz\n```\n\n</details>\n\n\n<details>\n<summary><b>CLUECorpusSmall 开始训练</b></summary>\n\n#### 开始训练\n\n\n将制作好的数据`clue_corpus_small_14g_20220104_ids.npy,clue_corpus_small_14g_20220104_idx.npz`移动到input_dir中，即可开始训练。\n\n\n除了单卡训练，飞桨还支持数据并行、混合并行、自动并行、重计算等多种分布式策略，减少显存占用、加速训练，达到大模型可训练且训得快的效果。在模型训练前，需要根据模型规模选择合适的并行策略。下面分别从单卡训练、混合并行训练和自动并行训练三个方面来介绍ERNIE模型训练的配置文件和启动方式。\n\n\n- 单卡训练\n\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\n# 345M\npython tools/train.py -c 
ppfleetx/configs/nlp/ernie/pretrain_ernie_base_345M_single_card.yaml \n```\n\n- 混合并行\n\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\n# 175B run_pretrain\nlog_dir=log_175B\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml\n\n```\n\n## 3.下游任务微调\n基于训练中产出的checkpoint，用户可以快速对当前模型效果进行评估。PaddleFleetX已经适配了主流下游任务 —— 序列分类，用户可以根据自己的需求，评估自己所需的数据集。\n\n#### 运行实例\n\n- 单卡训练\n\n```\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\npython tools/train.py -c ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml\n```\n\n\n- 数据并行\n\n```\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nlog_dir=log_dp8\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml \\\n    -o Model.use_recompute=True\n```\n</details>\n\n## 3. 推理部署\n\n[推理部署](inference.md)\n"
  },
  {
    "path": "projects/ernie/docs/inference.md",
    "content": "# 推理部署\n\n模型训练完成后，可使用飞桨高性能推理引擎Paddle Inference通过如下方式进行推理部署。\n\n## 1. 模型导出\n\n以`ERNIE(345M)`模型为例\n\n\n导出单卡`ERNIE(345M)`模型：\n```bash\nsh projects/ernie/auto_export_ernie_345M_mp1.sh\n```\n\n导出多卡`ERNIE(345M)`模型：\n```bash\nsh projects/ernie/auto_export_ernie_345M_mp2.sh\n```\n\n导出多卡`ERNIE(345M)`模型(XPU)：\n```bash\nsh projects/ernie/auto_export_ernie_345M_mp2_xpu.sh\n```\n\n## 2. 推理部署\n\n模型导出后，可通过`tasks/ernie/inference.py`脚本进行推理部署。\n\n`ERNIE(345M)` 推理\n```bash\nbash projects/ernie/run_inference.sh\n```\n\n`ERNIE(345M)` 多卡推理\n```bash\nbash projects/ernie/run_inference_mp2.sh\n```\n\n`ERNIE(345M)` 多卡推理(XPU)\n```bash\nbash projects/ernie/run_inference_mp2_xpu.sh\n```\n\n## 3. Benchmark\n\n测试中\n"
  },
  {
    "path": "projects/ernie/export_ernie_345M_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/export.py -c ./ppfleetx/configs/nlp/ernie/inference_ernie_345M_single_card.yaml \n"
  },
  {
    "path": "projects/ernie/finetune_ernie_345M_single_card.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0\npython tools/train.py -c ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml\n"
  },
  {
    "path": "projects/ernie/finetune_ernie_345M_single_card_npu.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\npython tools/train.py -c ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml \\\n        -o Global.device=npu \\\n        -o Model.hidden_size=256\n"
  },
  {
    "path": "projects/ernie/inference.py",
    "content": "# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))\n\nimport numpy as np\nimport paddle.distributed.fleet as fleet\nfrom ppfleetx.data.tokenizers import GPTTokenizer\nfrom ppfleetx.core.engine import InferenceEngine\nimport argparse\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\"ernie inference\")\n    parser.add_argument(\n        '-m', '--model_dir', type=str, default='./output', help='model dir')\n    parser.add_argument(\n        '-mp', '--mp_degree', type=int, default=1, help='mp degree')\n    parser.add_argument(\n        '-d', '--device', type=str, default='', help='device type')\n    args = parser.parse_args()\n    return args\n\n\ndef main(args):\n    fleet.init(is_collective=True)\n    infer_engine = InferenceEngine(\n        args.model_dir, args.mp_degree, device=args.device)\n    tokenizer = GPTTokenizer.from_pretrained(\"gpt2\")\n    text = 'Hi ERNIE. 
Tell me who Jack Ma is.'\n    inputs = tokenizer(text, padding=True, return_attention_mask=True)\n\n    whole_data = [\n        np.array(inputs['token_type_ids']).reshape(1, -1),\n        np.array(inputs['input_ids']).reshape(1, -1)\n    ]\n    outs = infer_engine.predict(whole_data)\n    print(outs)\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n    main(args)\n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_base.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=1\npython tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_345M_single_card.yaml \n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_base_175B_mp8_pp16.sh",
    "content": "#! /bin/bash\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\n# 175B run_pretrain\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml\n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_base_3D.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \\\n    -o Data.Train.dataset.input_dir=./dataset/ernie \\\n    -o Data.Eval.dataset.input_dir=./dataset/ernie \\\n    -o Engine.max_steps=10\n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_base_3D_npu.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_hybrid\nrm -rf $log_dir\nexport PADDLE_P2P_SYNC_SEND=1\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \\\n    -o Data.Train.dataset.input_dir=./dataset/ernie \\\n    -o Data.Eval.dataset.input_dir=./dataset/ernie \\\n    -o Engine.max_steps=10 \\\n    -o Global.device=npu\n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_base_6.7B_sharding16.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\n# 6.7B+sharding16 run_pretrain\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/ernie/pretrain_ernie_base_6.7B_sharding16.yaml\n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_large.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=1\npython tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_large_mp2_mlu.sh",
    "content": "#! /bin/bash\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport MLU_VISIBLE_DEVICES=0,1\nexport PADDLE_XCCL_BACKEND=mlu\nexport FLAGS_selected_mlus=0,1\nLOG_DIR=log_ernie\nLOG_GFILE=log_ernie_large_hybrid\n\nmkdir -p ${LOG_DIR}\n\npython -m paddle.distributed.launch \\\n       --log_dir ${LOG_DIR} \\\n       --device 0,1 tools/train.py \\\n       -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \\\n       -o Global.device=mlu \\\n       -o Distributed.mp_degree=2 \\\n       -o Distributed.dp_degree=1 \\\n       -o Distributed.pp_degree=1 \\\n       -o Model.use_recompute=Fasle > ${LOG_DIR}/${LOG_GFILE} 2>&1 &\n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_large_mp2_npu.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -m paddle.distributed.launch \\\n        --device 0,1 tools/train.py \\\n        -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \\\n        -o Global.device=npu \\\n        -o Distributed.mp_degree=2 \\\n        -o Distributed.dp_degree=1 \\\n        -o Distributed.pp_degree=1 \\\n        -o Model.use_recompute=Fasle\n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_large_mp2_pp2_npu.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport PADDLE_P2P_SYNC_SEND=1\n\npython -m paddle.distributed.launch \\\n        --device 0,1,2,3 tools/train.py \\\n        -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \\\n        -o Global.device=npu \\\n        -o Distributed.mp_degree=2 \\\n        -o Distributed.dp_degree=1 \\\n        -o Distributed.pp_degree=2 \\\n        -o Model.use_recompute=True\n"
  },
  {
    "path": "projects/ernie/pretrain_ernie_large_npu.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \\\n        -o Global.device=npu\n"
  },
  {
    "path": "projects/ernie/run_inference.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nunset CUDA_VISIBLE_DEVICES\n\npython -u -m paddle.distributed.launch \\\n    --gpus \"0\" \\\n    --log_dir \"log\" \\\n    projects/ernie/inference.py --model_dir \"./output\" --mp_degree 1\n"
  },
  {
    "path": "projects/ernie/run_inference_mp2.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nunset CUDA_VISIBLE_DEVICES\n\npython -u -m paddle.distributed.launch \\\n    --gpus \"0,1\" \\\n    --log_dir \"log\" \\\n    projects/ernie/inference.py --model_dir \"./output\" --mp_degree 2\n"
  },
  {
    "path": "projects/ernie/run_inference_mp2_npu.sh",
    "content": "# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython -u -m paddle.distributed.launch \\\n    --devices \"0,1\" \\\n    --log_dir \"log\" \\\n    projects/ernie/inference.py --model_dir \"./output\" --mp_degree 2 --device npu\n"
  },
  {
    "path": "projects/ernie/run_inference_mp2_xpu.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport BKCL_PCIE_RING=1\npython -u -m paddle.distributed.launch \\\n    --devices \"0,1\" \\\n    --log_dir \"log\" \\\n    projects/ernie/inference.py --model_dir \"./output\" --mp_degree 2\n"
  },
  {
    "path": "projects/gpt/auto_export_gpt_175B_mp8.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_mp8\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/auto_export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_175B_mp8.yaml"
  },
  {
    "path": "projects/gpt/auto_export_gpt_345M_mp2.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_mp2\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1\" \\\n    ./tools/auto_export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_mp2.yaml \\\n"
  },
  {
    "path": "projects/gpt/auto_export_gpt_345M_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_345m_mp1\nrm -rf $log_dir\n\nDIRECTORY=./pretrained\nif [ ! -d \"$DIRECTORY\" ]; then\n  echo \"start download ckpt\"\n  wget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_FP16.tar.gz\n  tar -zxvf GPT_345M_FP16.tar.gz\nfi\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"1\" \\\n    ./tools/auto_export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_single_card.yaml \\\n    -o Engine.save_load.ckpt_dir=./pretrained/auto\n"
  },
  {
    "path": "projects/gpt/auto_export_gpt_6.7B_mp1.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_mp1\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0\" \\\n    ./tools/auto_export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_6.7B_mp1.yaml"
  },
  {
    "path": "projects/gpt/auto_export_gpt_fp16_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython ./tools/auto_export.py -c ./ppfleetx/configs/nlp/gpt/auto/export_gpt_fp16_single_card.yaml \\\n    -o Engine.save_load.output_dir=\"./serial_model\" \\\n    -o Engine.save_load.ckpt_dir=\"./output/rank_0/model\" \\\n"
  },
  {
    "path": "projects/gpt/auto_gpt_1.3B_dp8.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_auto\nrm -rf $log_dir\n\n# 1.3B+dp8 run_pretrain\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/auto.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml\n"
  },
  {
    "path": "projects/gpt/auto_gpt_1.3B_dp8_tuning.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_auto\nrm -rf $log_dir\n\n# 1.3B+dp8 recompute tuning\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/auto.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8_tuning.yaml\n"
  },
  {
    "path": "projects/gpt/auto_gpt_1.3B_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport FLAGS_USE_STANDALONE_EXECUTOR=False\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_single_card.yaml \n"
  },
  {
    "path": "projects/gpt/auto_gpt_345M_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport FLAGS_USE_STANDALONE_EXECUTOR=False\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_345M_single_card.yaml \n"
  },
  {
    "path": "projects/gpt/auto_gpt_6.7B_sharding16.sh",
    "content": "#! /bin/bash\n# Runs the \"6.7B\" parameter model\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_auto\nrm -rf $log_dir\n\n# 6.7B+sharding16 run_pretrain\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/auto.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml\n"
  },
  {
    "path": "projects/gpt/auto_qat_export_gpt_345M_mp2.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_auto\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1\" \\\n    ./tools/auto_export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/qat_generation_gpt_345M_mp2.yaml \\\n    -o Engine.save_load.output_dir=\"./mp2_qat_model\" \\\n"
  },
  {
    "path": "projects/gpt/benchmark.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport time\nimport argparse\nimport numpy as np\n\nimport paddle\nimport paddle.distributed.fleet as fleet\nfrom ppfleetx.core.engine.inference_engine import InferenceEngine\nimport ppfleetx_ops\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--seq_len\",\n        default=128,\n        type=int,\n        required=False,\n        help=\"seq length of inputs\")\n    parser.add_argument(\n        \"--iter\", default=100, type=int, help=\"run iterations for timing\")\n    parser.add_argument(\"--mp_degree\", default=1, type=int, help=\"\")\n    parser.add_argument(\n        \"--model_dir\", default=\"output\", type=str, help=\"model directory\")\n\n    args = parser.parse_args()\n    return args\n\n\ndef predict(engine, data, args):\n\n    with engine._static_guard:\n        for d, name in zip(data, engine.input_names()):\n            handle = engine.predictor.get_input_handle(name)\n            handle.copy_from_cpu(d)\n\n        for _ in range(10):\n            engine.predictor.run()\n        engine.predictor.get_output_handle(engine.output_names()[\n            0]).copy_to_cpu()\n\n        start = time.perf_counter()\n        for _ in range(args.iter):\n            engine.predictor.run()\n        end = time.perf_counter()\n        print(\n            f\"batch 
{args.iter} run time: {1000 * (end - start) / args.iter}ms\")\n\n        return {name: engine.predictor.get_output_handle(name).copy_to_cpu() \\\n                for name in engine.output_names()}\n\n\ndef main():\n\n    args = parse_args()\n\n    fleet.init(is_collective=True)\n    infer_engine = InferenceEngine(args.model_dir, args.mp_degree)\n    ids = [100] * args.seq_len\n\n    # run test\n    for batch in [1, 2, 4, 8, 16]:\n\n        whole_data = [ids] * batch\n        whole_data = np.array(whole_data, dtype=\"int64\").reshape(1, batch, -1)\n\n        _ = predict(infer_engine, whole_data, args)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "projects/gpt/docs/README.md",
    "content": "# GPT\n\n## 模型介绍\nGPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)/[3](https://arxiv.org/pdf/2005.14165.pdf) 是以[Transformer](https://arxiv.org/abs/1706.03762) 解码器为网络基本组件，使用自回归的方式在大规模无标注文本语料上进行预训练得到的语言生成模型。\n\n本项目是语言模型 GPT 的 PaddlePaddle 大模型实现。目前，PaddleFleetX 提供了 [GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型文件；分别基于 [LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl) 和 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) 数据集，采用 ACC(accuracy) 和 PPL(perplexity) 指标后的评估结果如下：\n\n| **模型文件** | **ACC** | **PPL** |\n|---------|-----------|---------------|\n| GPT-345M | 44.17% |  18.01  |\n\n下面是本例的简要目录结构及说明：\n\n```text\n.\n├── auto_export_gpt_345M_mp2.sh            # 自动并行345M模型两卡张量并行导出入口\n├── auto_gpt_345M_single_card.sh           # 自动并行345M模型单卡预训练入口\n├── auto_gpt_1.3B_single_card.sh           # 自动并行1.3B模型单卡预训练入口\n├── auto_gpt_1.3B_dp8.sh                   # 自动并行1.3B模型数据并行预训练入口\n├── auto_gpt_6.7B_sharding16.sh            # 自动并行6.7B模型分组切片并行预训练入口\n├── evaluate_gpt_345M_single_card.sh       # 单卡345M模型评估入口\n├── export_gpt_345M_single_card.sh         # 单卡345M模型动转静导出入口\n├── finetune_gpt_345M_single_card.sh       # 单卡345M模型finetune训练入口\n├── inference_gpt_345M_single_card.sh      # 单卡345M模型推理入口\n├── pretrain_gpt_345M_single_card.sh       # 单卡345M模型预训练入口\n├── pretrain_gpt_1.3B_single_card.sh       # 单卡1.3B模型预训练入口\n├── pretrain_gpt_1.3B_dp8.sh               # 8卡1.3B模型数据并行预训练入口\n├── pretrain_gpt_6.7B_sharding16.sh        # 16卡6.7B模型分组切片并行预训练入口\n├── pretrain_gpt_175B_mp8_pp16.sh          # 128卡175B模型混合并行预训练入口\n├── qat_gpt_345M_single_card.sh            # 单卡345M模型量化训练入口\n├── qat_gpt_345M_mp8.sh                    # 8卡345M模型模型并行量化训练入口\n├── qat_gpt_6.7B_sharding16.sh             # 16卡6.7B模型分组切片并行量化训练入口\n├── eval_qat_gpt_345M_single_card.sh       # 单卡345M量化模型验证入口\n├── export_qat_gpt_345M_single_card.sh     # 
单卡345M量化模型导出入口\n```\n\n## 快速开始\n\n### 环境依赖\n\n请确保已根据根目录 requirements.txt 安装所需依赖，或者通过以下命令快速安装\n\n```shell\npython -m pip install -r https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetX/develop/requirements.txt -i https://mirror.baidu.com/pypi/simple\n```\n\n### 数据准备\n\n数据获取和制作详见[GPT 模型预训练数据准备流程](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/gpt)\n\n为了方便用户运行测试本模型，此处提供处理好的300M的训练样本，在单卡训练或混合并行训练前都需要通过以下命令获取数据。\n\n**数据下载命令**\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\n# 下载样例数据\nmkdir data && cd data\nwget -O gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy\nwget -O gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz\n\ncd .. # 回到 PaddleFleetX 根目录下\n```\n\n### 模型训练\n\n除了单卡训练，飞桨还支持数据并行、混合并行、自动并行、重计算等多种分布式策略，减少显存占用、加速训练，达到大模型可训练且训得快的效果。在模型训练前，需要根据模型规模选择合适的并行策略。下面分别从单卡训练、混合并行训练和自动并行训练三个方面来介绍GPT模型训练的配置文件和启动方式。\n\n\n- [单卡训练](./single_card.md)\n\n- [混合并行训练](./hybrid_parallel.md)\n\n- [自动并行训练](./auto_parallel.md)\n\n### 文本生成体验\n\n- [单卡预训练模型文本生成](./single_card.md#GPT-Zero-shot-文本生成)\n\n- [混合并行预训练模型文本生成](./hybrid_parallel.md#GPT-Zero-shot-文本生成)\n\n\n### 模型压缩\n\n- [量化训练](./quantization_aware_training.md)\n\n### 推理部署\n\n- [推理部署](inference.md)\n### GLUE 下游任务微调\n\n- [单卡微调](./single_finetune.md)\n\n\n## 参数释义\n\n\n### 全局信息\n全局参数指定训练的batch size，以及设备、随机种子等信息。\n```yaml\n  Global:\n    device: gpu\n    seed: 1024\n\n    global_batch_size: \n    local_batch_size: 1\n    micro_batch_size: 1\n```\n\n其中参数对应的释义如下：\n| **参数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| device | 设备信息 |\n| seed | 随机数种子 |\n| global_batch_size | 全局的batch size大小，即一次参数更新等效的batch size |\n| local_batch_size  | 每个进程训练的batch size大小                  |\n| micro_batch_size  | 每次前向计算的batch size大小                  |\n\n\n### 
Engine训练控制\n\nEngine训练设置完成模型训练/验证/推理等过程中的参数设置，是fleetX的EagerEngine的必要参数，所有使用该Engine都必须指定该配置。 其中包含的参数有：\n\n```yaml\n  Engine:\n    max_steps: 500000\n    num_train_epochs: 1\n    accumulate_steps: \n    logging_freq: 1\n    eval_freq: 500\n    eval_iters: 10\n    test_iters:\n    mix_precision:\n      enable: True\n      dtype: \"float16\"\n      level: \"O2\"\n      scale_loss: 32768.0\n      custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n      custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    save_load:\n      save_steps: 1000\n      save_epoch: 1\n      output_dir: ./output\n      ckpt_dir:\n```\n其中参数对应的释义如下：\n\n| **参数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| max_steps         | 最大训练步数                               |\n| num_train_epochs  | 训练的epoch数量                           |\n| accumulate_steps  | 梯度累加次数                           |\n| logging_freq      | 训练日志打印的频率                            |\n| eval_freq         | 模型评估间隔                               |\n| eval_iters        | 模型评估时训练评估测试集的轮数                      |\n| test_iters        | 模型测试或推理时的轮数                      |\n| enable            | 是否使用混合精度策略进行训练                     |\n| dtype             | 混合精度训练数据类型使用float16还是bfloat16，默认为float16类型 |\n| level             | 混合精度训练模式，默认``O2``模式                 |\n| scale_loss        | 使用fp16混合精度策略下，loss的放缩比例                  |\n| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的，它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 |\n| custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的，并且对性能至关重要。如果设置了白名单，该名单中的算子会使用float16/bfloat16计算 |\n| save_steps        | 保存模型间隔step数                         |\n| save_epoch        | 保存模型间隔epoch数                        |\n| output_dir        | 指定输出文件                              |\n| ckpt_dir          | checkpoint的加载目录                      |\n\n### 
模型网络\n\n网络部分完成了网络的组网操作，GPT在[PaddleFleetX/ppfleetx/models/language_model/gpt/dygraph/single_model.py]((https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/ppfleetx/models/language_model/gpt/dygraph/single_model.py))下。 \n可以使用配置文件配置模型的规模，如：\n\n```yaml\n  Model:\n    module: \"GPTModule\"\n    name: \"GPT\"\n    vocab_size: 50304\n    hidden_size: 1024\n    num_layers: 24\n    num_attention_heads: 16\n    ffn_hidden_size:\n    hidden_dropout_prob: 0.1\n    attention_probs_dropout_prob: 0.1\n    max_position_embeddings: 1024\n    type_vocab_size: 16\n    initializer_range: 0.02\n    use_recompute: True\n    recompute_granularity:\n    no_recompute_layers:\n    fused_linear: True\n    fuse_attn_qkv: True\n    sequence_parallel: False\n```\n\n其中参数对应的释义如下：\n| **参数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| module | 指定GPT模型的执行模块 ｜\n| vocab_size                   | 训练词表大小                 |\n| hidden_size                  | 隐藏层大小                  |\n| num_layers                   | transformer层数          |\n| num_attention_heads          | attention head的数量      |\n| max_seq_len                  | 输入文本序列的长度              |\n| ffn_hidden_size              | ffn层大小，一般为隐藏层的四倍       |\n| attention_probs_dropout_prob | attention中的dropout的失活率 |\n| max_position_embeddings      | position embedding的长度  |\n| type_vocab_size              | 词表类型                   |\n| initializer_range            | 参数初始化的范围               |\n| use_recompute     | 是否使用recompute训练                      |\n| recompute_granularity | recompute训练的粒度，可选 `full` `full_attn` `core_attn`，full即recompute全部transformer，full_attn表明只recompute所有self attention部分，core_attn表明只recompute `softmax(qkT)v` 部分。注：显存占用方面，`core_attn` > `full_attn` > `full`，若所选策略产生OOM错误，可以适当更改recompute_granularity |\n|no_recompute_layers| list of integer，标识哪些层的transformer不需要进行recompute。所有在该list中的值应该 >= 0 同时应该 < num_layers。向该参数中增加不进行recompute 
的层数可以提升模型训练的整体吞吐，但是会适当的增加显存。若训练中发现有显存富裕，可以适当增加不进行recompute的层数。如果使用该参数后出现OOM错误，可以适当减小不进行recompute的层数。 ｜\n| fused_linear      | 是否使用fused_linear代替传统Linear加速训练。注：该功能需要cuda 11.6及以上编译的paddle支持。       |\n| fuse_attn_qkv     | 是否对attention层中的qkv计算使用fuse策略以加速训练 |\n| sequence_parallel | 是否使用序列并行策略以加速训练。注：只有混合并行的GPT才支持该功能，它与张量模型并行共用通信组，当mp_degree=1时，序列并行策略会被强制关闭。 |\n| virtual_pp_degree | 虚拟流水线并行维度，该参数会减小流水线bubble的占比以提升流水线的吞吐。但是该参数会增加流水线间的通讯，所以该参数的推荐值为2。并且，只有 num_layers可以被 pp_degree * virtual_pp_degree 整除时，才可以使用虚拟流水线并行。 |\n### 数据集\n\n数据集参数分为“Train”、“Eval”和“Test”三部分，分别对应模型预训练、离线评估、推理等三个模块。\n\n每个模型的配置参数都包含以下内容：\n\n```yaml\n  Data:\n    Train:\n      dataset:\n        name: GPTDataset\n        input_dir: ./data/\n        split: [949, 50, 1]\n        max_seq_len: 1024\n      sampler:\n        name: DistributedBatchSampler\n        shuffle: False\n        drop_last: True\n      loader:\n        num_workers: 1\n        return_list: False\n        collate_fn: gpt_collate_fn\n```\n\n其中参数对应的释义如下：\n| **参数名**                      | **参数释义**               |\n|------------------------------|------------------------|\n| dataset.name         | 指定自定义数据集的名称  |\n| input_dir         | 指定输入文件，可以使用目录，指定目录时将包括目录中的所有文件       |\n| split             | 训练集，验证集和测试集的切分比例                     |\n| max_seq_len       | 输入文本序列的长度                            |\n| sampler.name         | 指定自定义采样器的名称  |\n| shuffle         | 是否需要在生成样本下标时打乱顺序     |\n| drop_last             | 是否需要丢弃最后无法凑整一个mini-batch的样本        |\n| num_workers        | 用于加载数据的子进程个数  |\n| return_list         | 每个设备上的数据是否以list形式返回    |\n| collate_fn             | 通过此参数指定如果将样本列表组合为mini-batch数据；支持自定义     |\n\n\n### 优化器\n\n\nGPT训练默认使用AdamW优化器以及cosine学习率衰减，这里通过配置文件配置优化器的参数，如：\n\n```yaml\n  Optimizer:\n    name: AdamW\n    weight_decay: 0.01\n    beta1: 0.9\n    beta2: 0.999\n    epsilon: 1.0e-8\n    lr:\n      name: CosineAnnealingWithWarmupDecay\n      decay_steps: 360000\n      warmup_rate: 0.01\n      max_lr: 5.0e-5\n      min_lr: 1.0e-5\n    
grad_clip:\n      name: \"ClipGradByGlobalNorm\"\n      clip_norm: 1.0\n    tensor_fusion: False\n```\n\n其中参数说明：\n\n| **参数名**      | **参数释义**                  |\n|--------------|---------------------------|\n| name | 指定自定义优化器的名称               |\n| weight_decay | weight的衰减率                |\n| beta1   | 一阶矩估计的指数衰减率               |\n| beta2   | 二阶矩估计的指数衰减率               |\n| epsilon | 指定优化器需要优化的参数              |\n| lr.name | 指定自定义学习率策略的名称               |\n| decay_steps  | 衰减的步长                     |\n| warmup_rate  | warmup 率                  |\n| max_lr       | Adam 的初始最大学习率             |\n| min_lr       | Adam 的初始最小学习率             |\n| grad_clip.name    | 指定自定义梯度裁剪策略的名称 |\n| clip_norm    | 所允许的范数最大值 |\n| tensor_fusion    | 是否使用tensor_fustion功能加速训练 |\n\n另外，[Profiler](./hybrid_profiler.md)中还介绍了在 GPT 中开启 Profiler 并分析调试分析结果的方法及相关的参数解释。\n\n### 模型压缩\nPaddleFleetX 集成了 PaddleSlim 中的常见的压缩方法：量化训练（Qutization Aware Training，QAT）、结构化稀疏（Structured Pruning，SP）和知识蒸馏（Knowledge Distillation，KD）。详细参数介绍见[模型压缩介绍](../../../docs/compression.md)。\n\n\n## 参考文献\n- [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)\n- [Language Models are Few-Shot Learners](https://arxiv.org/pdf/2005.14165.pdf)\n- [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413)\n"
  },
  {
    "path": "projects/gpt/docs/auto_parallel.md",
    "content": "# GPT 自动并行模型训练\n\n分布式并行训练技术使超大模型成为可能，但分布式训练程序的编写门槛较高，并行算法较为复杂，开发者需同时具有较好的工程能力和算法功底。为了降低分布式训练的难度，自动并行成为新的研究热点，受到学术界和工业界的广泛关注。自动并行通常分为半自动并行和全自动并行。半自动并行指的是开发者在单机脚本的基础上额外添加少量标注信息即可表达并行逻辑。而全自动并行则无需开发者添加任何并行逻辑，根据单机脚本自动搜索出较为高效的并行策略，实现分布式训练。\n\n\n## 参数释义\n\n### 全局信息\n全局信息指定训练的 batch size，以及设备、随机种子等信息\n\n```yaml\nGlobal:\n  device: gpu\n  seed: 1024\n\n  global_batch_size: \n  local_batch_size: 1\n  micro_batch_size: 1\n```\n\n其中参数对应的释义如下：\n| **参数名**                      | **参数释义**               |\n|--------------------------------|---------------------------|\n| device | 设备信息 |\n| seed | 随机数种子 |\n| global_batch_size | 全局的batch size大小，即一次参数更新等效的 batch size |\n| local_batch_size  | 每个进程训练的batch size大小                        |\n| micro_batch_size  | 每次前向计算的batch size大小                        |\n\n\n### Engine训练控制\n\nEngine训练设置完成模型训练/验证/推理等过程中的参数设置，是PaddleFleetX AutoEngine的必要参数，所有使用该Engine都必须指定该配置。 其中包含的参数有：\n\n```yaml\n  Engine:\n    max_steps: 500000\n    num_train_epochs: 1\n    eval_freq: 1\n    eval_iters: 10\n    test_iters:\n    mix_precision:\n      enable: True\n      dtype: \"float16\"\n      level: \"o2\"\n      scale_loss: 32768.0\n      custom_black_list: [\"reduce_sum\", \"c_softmax_with_cross_entropy\", \"elementwise_div\"]\n      custom_white_list: [\"lookup_table\", \"lookup_table_v2\"]\n    save_load:\n      output_dir: ./output\n      ckpt_dir:\n```\n\n其中参数对应的释义如下：\n\n| **参数名**         | **参数释义**                              |\n|-------------------|------------------------------------------|\n| max_steps         | 最大训练步数                               |\n| num_train_epochs  | 训练的epoch数量                            |\n| logging_freq      | 训练日志打印的频率                          |\n| eval_freq         | 模型评估间隔，以epoch为粒度                  |\n| eval_iters        | 模型评估时训练评估测试集的轮数                |\n| test_iters        | 模型测试或推理时的轮数                       |\n| enable            | 是否使用混合精度的类型，可选: `True` `False`  |\n| dtype             | 使用混合精度的类型，可选: 
`float16` `bfloat16`|\n| level             | 使用混合精度训练的等级，可选 `o1` `o2` `o3`   |\n| scale_loss        | 使用混合精度float16下，loss的放缩比例         |\n| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持float16/bfloat16计算时会被认为是数值危险的，它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算。 |\n| custom_white_list | 自定义算子白名单。这个名单中的算子在支持float16/bfloat16计算时会被认为是数值安全的，并且对性能至关重要。如果设置了白名单，该名单中的算子会使用float16/bfloat16计算。|\n| output_dir        | 指定输出文件                              |\n| ckpt_dir          | checkpoint的加载目录                      |\n\n\n### 模型网络\n\n网络部分完成了网络的组网操作，GPT在[PaddleFleetX/ppfleetx/models/language_model/gpt/auto/auto_model.py](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/ppfleetx/models/language_model/gpt/auto/auto_model.py)下。 \n可以使用配置文件配置模型的规模，如：\n\n```yaml\n  Model:\n    module: \"GPTModuleAuto\"\n    name: \"GPT\"\n    vocab_size: 50304\n    hidden_size: 1024\n    num_layers: 24\n    num_attention_heads: 16\n    ffn_hidden_size:\n    hidden_dropout_prob: 0.1\n    attention_probs_dropout_prob: 0.1\n    max_position_embeddings: 1024\n    type_vocab_size: 16\n    initializer_range: 0.02\n    use_recompute: True\n    fuse_attn_qkv: True\n```\n\n其中参数对应的释义如下：\n| **参数名**                    | **参数释义**               |\n|------------------------------|------------------------|\n| module | 指定GPT模型的执行模块  |\n| vocab_size                   | 训练词表大小                 |\n| hidden_size                  | 隐藏层大小                  |\n| num_layers                   | transformer层数          |\n| num_attention_heads          | attention head的数量      |\n| max_seq_len                  | 输入文本序列的长度              |\n| ffn_hidden_size              | ffn层大小，一般为隐藏层的四倍       |\n| attention_probs_dropout_prob | attention中的dropout的失活率 |\n| max_position_embeddings      | position embedding的长度  |\n| type_vocab_size              | 词表类型                   |\n| initializer_range            | 参数初始化的范围               |\n| use_recompute                | 是否使用recompute训练，重计算全部transformer  |\n| fuse_attn_qkv             
   | 是否对attention层中qkv计算使用fuse代替传统Linear加速训练 |\n\n\n### 数据集\n\n数据集参数分为“Train”、“Eval”和“Test”三部分，分别对应模型预训练、离线评估、推理等三个模块。\n\n每个模型的配置参数都包含以下内容：\n\n```yaml\n  Data:\n    Train:\n      collate_fn: gpt_collate_fn\n      sample_split: 2\n      dataset:\n        name: GPTDataset\n        input_dir: ./data/\n        split: [949, 50, 1]\n        max_seq_len: 1024\n```\n\n其中参数对应的释义如下：\n| **参数名**         | **参数释义**               |\n|-------------------|------------------------|\n| collate_fn        | 通过此参数指定如何将样本列表组合为mini-batch数据；支持自定义  |\n| sample_split      | 通过此参数dataset返回的sample被组织为(inputs,labels) |\n| dataset.name      | 指定自定义数据集的名称  |\n| input_dir         | 指定输入文件，可以使用目录，指定目录时将包括目录中的所有文件 |\n| split             | 训练集，验证集和测试集的切分比例 |\n| max_seq_len       | 输入文本序列的长度 |\n\n\n### 优化器\n\nGPT训练默认使用AdamW优化器以及cosine学习率衰减，这里通过配置文件配置优化器的参数，如：\n\n```yaml\n  Optimizer:\n    name: AdamW\n    weight_decay: 0.01\n    beta1: 0.9\n    beta2: 0.999\n    epsilon: 1.0e-8\n    lr:\n      name: CosineAnnealingWithWarmupDecay\n      decay_steps: 360000\n      warmup_rate: 0.01\n      max_lr: 5.0e-5\n      min_lr: 1.0e-5\n    grad_clip:\n      name: \"ClipGradByGlobalNorm\"\n      clip_norm: 1.0\n```\n\n其中参数说明：\n\n| **参数名**      | **参数释义**                |\n|----------------|---------------------------|\n| name           | 指定自定义优化器的名称        |\n| weight_decay   | weight的衰减率              |\n| beta1          | 一阶矩估计的指数衰减率        |\n| beta2          | 二阶矩估计的指数衰减率        |\n| epsilon        | 为保持数值稳定而加到分母上的极小常数 |\n| lr.name        | 指定自定义学习率策略的名称     |\n| decay_steps    | 衰减的步长                  |\n| warmup_rate    | warmup 率                  |\n| max_lr         | Adam 的初始最大学习率        |\n| min_lr         | Adam 的初始最小学习率        |\n| grad_clip.name | 指定自定义梯度裁剪策略的名称   |\n| clip_norm      | 所允许的范数最大值           |\n\n\n### 并行维度\n\n当前GPT模型已适配自动并行的**半自动策略**，用户可以通过配置文件选择并行的维度。\n\n```yaml\n  Distributed:\n    dp_degree: 2\n    mp_degree: 2\n    pp_degree: 2\n    sharding:\n      sharding_degree: 1\n      
sharding_stage: 1\n```\n\n其中参数说明：\n\n| **参数名**          | **参数释义**                             |\n|------------------|--------------------------------------|\n| dp_degree        | 数据并行维度                               |\n| mp_degree        | 张量模型并行维度                             |\n| pp_degree        | 流水线并行维度                              |\n| sharding_degree  | 分组切分并行维度                             |\n| sharding_stage   | 切分策略；1表示仅切分优化器状态，2表示再切分梯度，3表示再切分前向参数 |\n\n\n## 运行方式\n本目录按照345M、1.3B和6.7B规模大小，给出32G V100环境下GPT模型半自动并行训练的策略配置如下：\n\n| 模型规模   | 训练策略                     | yaml文件                               |\n|----------|---------------------------- |----------------------------------------|\n| 345M     | 单卡+fp16                    | pretrain_gpt_345M_single_card.yaml     |\n| 1.3B     | dp8+fp16+recompute          | pretrain_gpt_1.3B_dp8.yaml             |\n| 6.7B     | sharding16+fp16+recompute   | pretrain_gpt_6.7B_sharding16.yaml  |\n\n若要在显存容量更小的16G V100环境下进行GPT大模型训练，可将对应yaml文件中的`Model`-`hidden size`值改为原来的1/2即可。\n\n### 策略支持\n\n自动并行包括2种模式：半自动并行与全自动并行。\n半自动并行包括了数据并行、张量模型并行、流水线并行和分组切片并行。此外还支持重计算、混合精度等策略，来减少显存占用、加速训练。**目前，GPT 模型训练可以支持任意维度的策略组合。**\n\n|                 | data parallel | tensor parallel | pipeline parallel | pure fp16 | recompute |\n|-----------------|---------------|-----------------|-------------------|-----------|-----------|\n| sharding stage1 | ✓             | ✓               | ✓                 | ✓         | ✓         |\n| sharding stage2 | ✓             | ✓               | ✓                 | ✓         | ✓         |\n| sharding stage3 | ✓             | ✓               | ✓                 | ✓         | ✓         |\n\n\n### 单卡训练\n\n以单机1.3B模型训练为例，该gpt程序需要单卡32G V100以运行\n\n**启动命令**\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nexport FLAGS_USE_STANDALONE_EXECUTOR=False # 设置执行器环境变量\npython ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_single_card.yaml\n```\n\n### 
单机训练\n\n以单机1.3B模型数据并行训练为例，通过``paddle.distributed.launch``启动多进程训练，该gpt程序需要8卡32G V100以运行。\n\n**启动命令**\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nlog_dir=log_auto\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/auto.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml\n```\n\n若要在显存容量更小的16G V100环境下进行GPT模型单机训练，可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练，命令如下：\n\n**启动命令**\n```shell\nlog_dir=log_auto\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/auto.py \\\n    -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml \\\n    -o Model.hidden_size=1024\n```\n\n每张GPU的运行日志`workerlog.x`可在launch命令中指定的`log_dir`路径下找到；若未指定，日志路径为`log/workerlog.x`。运行日志具体内容如下：\n\n**运行日志**\n\n```\n[INFO 2022-08-19 10:47:00,392 engine.py:461] [train] epoch: 0 step: 0 lr: 5.555556e-09 loss: 10.972320\n[INFO 2022-08-19 10:47:02,858 engine.py:461] [train] epoch: 0 step: 1 lr: 8.333333e-09 loss: 10.950481\n[INFO 2022-08-19 10:47:05,321 engine.py:461] [train] epoch: 0 step: 2 lr: 1.111111e-08 loss: 10.951584\n[INFO 2022-08-19 10:47:07,791 engine.py:461] [train] epoch: 0 step: 3 lr: 1.388889e-08 loss: 10.954518\n[INFO 2022-08-19 10:47:10,256 engine.py:461] [train] epoch: 0 step: 4 lr: 1.666667e-08 loss: 10.959060\n[INFO 2022-08-19 10:47:12,725 engine.py:461] [train] epoch: 0 step: 5 lr: 1.944444e-08 loss: 10.957585\n[INFO 2022-08-19 10:47:15,198 engine.py:461] [train] epoch: 0 step: 6 lr: 2.222222e-08 loss: 10.947868\n[INFO 2022-08-19 10:47:17,680 engine.py:461] [train] epoch: 0 step: 7 lr: 2.500000e-08 loss: 10.939037\n```\n\n### 多机训练\n\n若需要在更多机器上进行大模型训练，则需要在每个参与训练的节点上设置master节点ip/port信息后执行启动命令（master节点ip为训练所用某一台机器的ip即可）。\n\n以2机16卡32G V100上的6.7B模型分组切分并行训练为例，启动命令为：\n\n```shell\nmaster_ip=master节点ip\nmaster_port=可用的空闲端口号\n\nlog_dir=log_sharding16\npython -m paddle.distributed.launch --log_dir $log_dir \\\n    --master=$master_ip:$master_port --nnodes=2 --devices 
\"0,1,2,3,4,5,6,7\" \\\n    ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml\n```\n\n若要在显存容量更小的16G V100环境下进行GPT模型两机训练，也可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练，命令如下：\n\n```shell\nmaster_ip=master节点ip\nmaster_port=可用的空闲端口号\n\nlog_dir=log_sharding16\npython -m paddle.distributed.launch --log_dir $log_dir \\\n    --master=$master_ip:$master_port --nnodes=2 --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml \\\n    -o Model.hidden_size=2048\n```\n"
  },
  {
    "path": "projects/gpt/docs/hybrid_parallel.md",
    "content": "# GPT 混合并行模型训练\n\n当训练超大模型时，就必须借助混合并行策略，混合并行策略分别指数据并行、张量模型并行、流水线并行和分组切片并行。其中数据并行保存完整的模型参数并独立处理一份子数据集，以加速模型训练过程；张量模型并行将网络中的张量（Tensor）切分到不同的设备，从而降低单个设备的显存消耗；流水线并行将模型的不同层放置到不同的计算设备，降低单个计算设备的显存消耗；分组切片并行将参数和模型状态划分到不同卡上，每个GPU只保存部分副本，以减少显存占用。联合四种训练方式，可以实现更大模型、更快训练的效果。具体策略以及相关FleetAPI介绍可以参考以下教程：\n\n- [数据并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/data_parallel/index_cn.html)\n\n- [张量模型并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/model_parallel_cn.html)\n\n- [流水线并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/pipeline_parallel_cn.html)\n\n- [分组切片并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/group_sharded_parallel_cn.html)\n\n\n## 参数释义\n\n### 并行维度\n\n当前GPT模型已适配3D混合并行，并能够训练超大模型，用户可以通过配置文件选择并行的维度。\n\n```yaml\n  Distributed:\n    dp_degree: 2\n    mp_degree: 2\n    pp_degree: 2\n    sharding:\n      sharding_degree: 1\n      sharding_stage: 1\n      sharding_offload: False\n      reduce_overlap: False\n      broadcast_overlap: False\n```\n\n其中参数说明：\n\n| **参数名**          | **参数释义**                             |\n|------------------|--------------------------------------|\n| dp_degree        | 数据并行维度                               |\n| mp_degree        | 张量模型并行维度                             |\n| pp_degree        | 流水线并行维度                              |\n| sharding_degree  | 分组切分并行维度                             |\n| sharding_stage   | 切分策略；1表示仅切分优化器状态，2表示再切分梯度，3表示再切分前向参数 |\n| sharding_offload | CPU offload策略                        |\n|reduce_overlap| 是否在sharding stage 2的模式下进行reduce通讯与反向计算的overlap，该策略暂时不支持sharding_offload|\n|broadcast_overlap| 是否在sharding stage 2的模式下进行broadcast通讯与下一个batch的 前向计算的overlap，该策略暂时不支持sharding_offload。若使用该策略，在evaluation与save之前，必须调用 `paddle.device.cuda.synchronize()` 方法|\n\n## 运行方式\n本目录中按照345M、1.3B、6.7B和175B规模大小，给出32G 
V100环境下GPT模型混合并行训练的策略配置如下：\n\n| 模型规模 | 训练策略                 | yaml文件                   |\n|----------|---------------------------|------------------------------|\n| 345M     | fp16+mp8+qat              | qat_gpt_345M_mp8.yaml    |\n| 1.3B     | fp16+dp8+recompute        | pretrain_gpt_1.3B_dp8.yaml   |\n| 6.7B     | fp16+sharding16+recompute | pretrain_gpt_6.7B_sharding16.yaml  |\n| 175B     | fp16+mp8+pp16+recompute   | pretrain_gpt_175B_mp8_pp16.yaml   |\n\n若要在显存容量更小的16G V100环境下进行GPT大模型训练，可将对应yaml文件中的`Model`-`hidden size`值改为原来的1/2即可。\n\n### 策略支持\n\n飞桨的混合并行技术包括4个维度：数据并行、张量模型并行、流水线并行和分组切片并行，此外还支持重计算、offload、混合精度、序列并行等策略，来减少显存占用、加速训练。\n\n目前，GPT模型训练已支持前3个维度的任意策略组合，但分组切片并行stage2/3仅支持与数据并行策略组合使用；详见下表。\n\n|                 | data parallel | tensor parallel | pipeline parallel | pure fp16 | recompute |\n|-----------------|---------------|-----------------|-------------------|-----------|-----------|\n| sharding stage1 | ✓             | ✓               | ✓                 | ✓         | ✓         |\n| sharding stage2 | ✓             | ㄨ               | ㄨ                 | ✓         | ✓         |\n| sharding stage3 | ✓             | ㄨ               | ㄨ                 | ✓         | ✓         |\n\n### 单机训练\n\n以单机1.3B模型数据并行训练为例，通过``paddle.distributed.launch``启动多进程训练，该gpt程序需要8卡32G V100以运行。\n\n**启动命令**\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nlog_dir=log_dp8\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    tools/train.py \\\n    -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml\n```\n\n若要在显存容量更小的16G V100环境下进行GPT模型单机训练，可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练，命令如下：\n\n**启动命令**\n```shell\nlog_dir=log_dp8\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    tools/train.py \\\n    -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \\\n    -o 
Model.hidden_size=1024\n```\n\n每张GPU的运行日志`workerlog.x`可在launch命令中指定的`log_dir`路径下找到；若未指定，日志路径为`log/workerlog.x`。运行日志具体内容如下：\n\n**运行日志**\n\n```\n[2022-09-21 05:43:58,797] [    INFO] - [train] epoch: 0, batch: 0, loss: 10.992407799, avg_batch_cost: 5.51734 sec, speed: 0.18 step/s, ips_total: 11878 tokens/s, ips: 1485 tokens/s, learning rate: 2.77778e-08\n[2022-09-21 05:43:59,508] [    INFO] - [train] epoch: 0, batch: 1, loss: 11.000075340, avg_batch_cost: 0.71029 sec, speed: 1.41 step/s, ips_total: 92267 tokens/s, ips: 11533 tokens/s, learning rate: 4.16667e-08\n[2022-09-21 05:44:00,242] [    INFO] - [train] epoch: 0, batch: 2, loss: 11.017463684, avg_batch_cost: 0.73301 sec, speed: 1.36 step/s, ips_total: 89406 tokens/s, ips: 11176 tokens/s, learning rate: 5.55556e-08\n[2022-09-21 05:44:00,965] [    INFO] - [train] epoch: 0, batch: 3, loss: 10.983654976, avg_batch_cost: 0.72319 sec, speed: 1.38 step/s, ips_total: 90620 tokens/s, ips: 11328 tokens/s, learning rate: 6.94444e-08\n[2022-09-21 05:44:01,678] [    INFO] - [train] epoch: 0, batch: 4, loss: 11.014451981, avg_batch_cost: 0.71223 sec, speed: 1.40 step/s, ips_total: 92016 tokens/s, ips: 11502 tokens/s, learning rate: 8.33333e-08\n[2022-09-21 05:44:02,385] [    INFO] - [train] epoch: 0, batch: 5, loss: 11.005180359, avg_batch_cost: 0.70707 sec, speed: 1.41 step/s, ips_total: 92687 tokens/s, ips: 11586 tokens/s, learning rate: 9.72222e-08\n[2022-09-21 05:44:03,100] [    INFO] - [train] epoch: 0, batch: 6, loss: 10.989698410, avg_batch_cost: 0.71402 sec, speed: 1.40 step/s, ips_total: 91785 tokens/s, ips: 11473 tokens/s, learning rate: 1.11111e-07\n[2022-09-21 05:44:03,806] [    INFO] - [train] epoch: 0, batch: 7, loss: 10.992337227, avg_batch_cost: 0.70554 sec, speed: 1.42 step/s, ips_total: 92888 tokens/s, ips: 11611 tokens/s, learning rate: 1.25000e-07\n[2022-09-21 05:44:04,516] [    INFO] - [train] epoch: 0, batch: 8, loss: 10.972790718, avg_batch_cost: 0.71011 sec, speed: 1.41 step/s, ips_total: 92290 
tokens/s, ips: 11536 tokens/s, learning rate: 1.38889e-07\n[2022-09-21 05:44:05,228] [    INFO] - [train] epoch: 0, batch: 9, loss: 10.983499527, avg_batch_cost: 0.71128 sec, speed: 1.41 step/s, ips_total: 92138 tokens/s, ips: 11517 tokens/s, learning rate: 1.52778e-07\n```\n\n### 多机训练\n\n若需要在更多机器上进行大模型训练，则需要在每个参与训练的节点上设置master节点ip/port信息后执行启动命令（master节点ip为训练所用某一台机器的ip即可）。\n\n以2机16卡32G V100上的6.7B模型分组切分并行训练为例，启动命令为：\n\n```shell\nmaster_ip=master节点ip\nmaster_port=可用的空闲端口号\n\nlog_dir=log_sharding16\npython -m paddle.distributed.launch --log_dir $log_dir \\\n    --master=$master_ip:$master_port --nnodes=2 --devices \"0,1,2,3,4,5,6,7\" \\\n    tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml\n```\n\n若要在显存容量更小的16G V100环境下进行GPT模型两机训练，也可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练，命令如下：\n\n```shell\nmaster_ip=master节点ip\nmaster_port=可用的空闲端口号\n\nlog_dir=log_sharding16\npython -m paddle.distributed.launch --log_dir $log_dir \\\n    --master=$master_ip:$master_port --nnodes=2 --devices \"0,1,2,3,4,5,6,7\" tools/train.py \\\n    -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml \\\n    -o Model.hidden_size=2048\n```\n\n若要执行16机175B大模型混合并行训练，运行启动命令为：\n\n```shell\nmaster_ip=master节点ip\nmaster_port=可用的空闲端口号\n\nlog_dir=log_mp8_pp16\npython -m paddle.distributed.launch --log_dir $log_dir \\\n    --master=$master_ip:$master_port --nnodes=16 --devices \"0,1,2,3,4,5,6,7\" tools/train.py \\\n    -c ppfleetx/configs/nlp/gpt/pretrain_gpt_175B_mp8_pp16.yaml\n```\n\n当节点较多时，可以考虑使用 `ssh` 脚本或 `mpirun` 进行跨节点命令分发。\n\n### 量化训练\n\n\n若需要对模型进行量化训练，按照以上在配置文件中添加量化参数，可参考`qat_gpt_345M_mp8.yaml`，量化训练时可以适当减少训练轮数和学习率。以单机345M模型模型并行训练为例，通过``paddle.distributed.launch``启动多进程训练，该gpt程序需要8卡32G V100以运行，命令如下：\n\n```shell\nlog_dir=log_mp8\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" tools/train.py \\\n    -c ppfleetx/configs/nlp/gpt/qat_gpt_345M_mp8.yaml \\\n    -o Engine.max_steps=100000 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o 
Optimizer.lr.max_lr=5.0e-6 \\\n    -o Optimizer.lr.min_lr=1.0e-6 \n```\n\n\n# GPT Zero-shot 文本生成\n\n## 参数释义\n\n```yaml\nGeneration:\n  top_k: 50\n  top_p: 0.75\n  temperature: 1.0\n  min_dec_len: 1\n  max_dec_len: 200\n  num_return_sequences: 1\n  decode_strategy: \"sampling\"\n```\n\n其中参数说明：\n\n| **参数名**      | **参数释义**                  |\n|--------------|---------------------------|\n| top_k | 每次为采样挑选保留分数最高的 k 个 token        |\n| top_p   | 如果设置小于 1.0 的小数，则保留加起来为 top_p 或更高的最可能的概率的 token。默认值为 1.0        |\n| temperature   |  调节下一个 token 的概率温度，logits = logits / temperature，默认值为 1.0           |\n| min_dec_len | 最小生成 token 长度              |\n| max_dec_len  | 最大生成 token 长度                     |\n| num_return_sequences  | 每个输入生成的序列个数，默认值为 1                  |\n| decode_strategy       | 解码策略，默认值为 \"sampling\"，目前只支持 \"sampling\"，未来会支持 \"greedy_search\"，\"beam_search\" |\n\n## 文本生成\n\n下载预训练好的模型，快速体验文本生成\n\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n\n# --devices 根据并行策略设置设备\n\npython -m paddle.distributed.launch --devices \"0\" tasks/gpt/generation.py \\\n    -c ppfleetx/configs/nlp/gpt/generation_gpt_345M_dp8.yaml \\\n    -o Engine.save_load.ckpt_dir=./ckpt/PaddleFleetX_GPT_345M_220826/\n\n# 生成的文本，由于 checkpoint 不同，超参不同，随机数不同，您执行可能会生成不一样的内容\n\nPrompt: Hi, GPT2. Tell me who Jack Ma is.\nGeneration: Hi, GPT2. Tell me who Jack Ma is. I don’t want to hear that.”\n\nFor now, the only question the crowd is asking is whether or not Jack Ma will step down from the board of directors of Alibaba.\n\nJack Ma on why he never wanted to run for President in 2016:\n\nThere were two reasons. One is that I wanted to spend more time with my family. I thought it was better to spend more time with my family and spend more time with my children. So it was a very personal reason. 
But the second reason was that I thought it would be difficult to get elected, because there are a lot of political interests in this country. So I thought it was better to spend more time with my family.\n\nOn how Alibaba will evolve into a new player in China’s transportation and logistics sector:\n\nI think that we are going to become a very important player in the logistics industry. So our strategy is to make it easy for people to travel.\n```\n\n### 剖析体验文本生成\n\n#### GPT 文本生成模块初始化\n\n```python\n    module = build_module(cfg)\n    module.model.eval()\n```\n\n#### 预训练模型加载\n\n```python\n    # 获取到预训练 checkpoint 的根目录\n    ckpt_dir = cfg.Engine.save_load.ckpt_dir\n\n    # 构造出具体路径\n    model_path = os.path.join(ckpt_dir, \"model.pdparams\")\n\n    # 加载模型参数\n    model_dict = paddle.load(model_path)\n\n    # FP16 模型参数转成 FP32 模型参数\n    for key, value in model_dict.items():\n        model_dict[key] = model_dict[key].astype(paddle.float32)\n\n    # 设置模型参数为预训练参数\n    module.model.set_state_dict(model_dict)\n```\n\n#### 文本生成与结果展示\n\n```python\n    input_text = \"Historical Records: Tell us about the history of the Great Wall.\"\n    result = module.generate(input_text)\n\n    print(f'Prompt: {input_text}')\n    print(f'Generation: {result[0]}')\n```\n"
  },
  {
    "path": "projects/gpt/docs/hybrid_profiler.md",
    "content": "# Profiler\n\n本文档主要包括在 GPT 中开启 Profiler 并分析调试分析结果的方法，在模型开发中使用 Profiler 分析工具的方法请参考[教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)和[API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/profiler/Profiler_cn.html)。\n\n## 参数配置\n\n使用 Profiler 功能需要在任务配置文件中添加 Profiler 配置信息并确保字段为 `enable: True` 以开启分析器。\n\n完整的可配置参数如下所示，可以根据使用场景调整配置。\n\n```\nProfiler:\n  enable: True\n  scheduler: [1, 5]\n  profiler_log: log_path\n  detailed: True\n  record_shapes: True\n  profile_memory: True\n  summary:\n    overview: True\n    device: True\n    model: True\n    dist: True\n    kernel: True\n    op: True\n    mem: True\n    memcpy: True\n```\n\n其中参数说明：\n\n| **参数名**                      | **参数释义**               |  **默认值** |\n|------------------------------|------------------------|------------------------|\n|  enable |   是否开启 Profiler | False |\n|  scheduler  | 定义分析区间，如 [1, 5] 记录 step 1 到 step 4 的分析数据 | None |\n|  profiler_log  | 日志文件目录 |   profiler_log |\n|  detailed  | 是否显示详细信息 |   False |\n|  record_shapes  |   是否记录 tensor shape 相关信息 | True |\n|  profile_memory |   是否统计 memory 相关信息 | True |\n\n其中，当 detailed=True 时会打印所有 summary 表格数据，当 detailed=False 时用户可以根据以下说明定制需要展示的表格信息。\n\n| **参数名**                      | **参数释义**               |  **默认值** |\n|------------------------------|------------------------|------------------------|\n|  summary.overview | 显示每种类型的 Event 时间消耗 |  True |\n|  summary.device | 显示 CPU 和 GPU 的平均利用率信息 |  False |\n|  summary.model  | 显示模型 dataloader、forward、backward、optimization 时间消耗 |  True |\n|  summary.dist  | 显示计算、通信以及重叠时间 |  False |\n|  summary.kernel  | 显示 GPU 执行的 kernel 信息 |  True |\n|  summary.op  | 显示框架中算子 (op) 的执行信息 |  True |\n|  summary.mem  | 显示内存/显存占用统计信息 |  False |\n|  summary.memcpy  | 显示框架中调用内存操作所花费的时间 | False |\n\n## 运行分析\n\n本节以 gpt混合并行 为例，首先进入目录，\n\n```\ncd PaddleFleetX\n```\n\n\n修改`ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml` 中 
Profiler.enable 为 True, 同时可以根据上节说明调整相关配置，或者使用命令行参数覆盖，例如可以使用以下命令运行程序，\n```\npython -m paddle.distributed.launch \\\n    ./tools/train.py -c \\\n    ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml -o Profiler.enable=True\n\n```\n\n> 在使用 Profiler 工具进行性能分析时，建议减少 train 的步数，获得分析数据即可停止训练。\n\n## 结果分析\n\n在训练结束后会有以下数据：\n\n* 根据配置信息在控制台打印 summary 表格\n* 在配置的 `profiler_log` 目录保存 profiler json 文件\n\n这里保存的 json 文件可以通过如下两种方式查看：\n\n* 在 chrome 浏览器中打开 chrome://tracing/，然后打开 json 文件查看\n* 根据控制台信息安装并启动 `visualdl --logdir log_path` 然后根据提示在浏览器中**性能分析**模块查看\n\n具体的信息含义解释以及分析方法请参考[文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)。\n\n> 在使用 visualdl 时，如果 log 文件数据较大，启动会比较耗时，请耐心等待。\n\n## 附录\n\n控制台打印的 summary 信息示例如下所示。\n\n**Overview Summary**\n```\n---------------------------------------------Overview Summary---------------------------------------------\nTime unit: ms\n-------------------------  -------------------------  -------------------------  -------------------------\nEvent Type                 Calls                      CPU Time                   Ratio (%)\n-------------------------  -------------------------  -------------------------  -------------------------\nProfileStep                4                          18591.04                   100.00\n  CudaRuntime              87527                      8555.11                    46.02\n  Operator                 21912                      1883.11                    10.13\n  UserDefined              13116                      1841.33                    9.90\n  OperatorInner            33668                      1018.39                    5.48\n  Forward                  8                          731.46                     3.93\n  Backward                 4                          671.82                     3.61\n  Optimization             4                          315.91                     1.70\n  Dataloader               4                          1.37           
            0.01\n-------------------------  -------------------------  -------------------------  -------------------------\n                           Calls                      GPU Time                   Ratio (%)\n-------------------------  -------------------------  -------------------------  -------------------------\n  Kernel                   16092                      4924.90                    26.49\n  Memcpy                   4278                       3617.26                    19.46\n  Memset                   780                        2.31                       0.01\n  Communication            192                        2363.13                    12.71\n-------------------------  -------------------------  -------------------------  -------------------------\n```\n\n**Model Summary**\n\n```\n-----------------------------------------------------Model Summary-----------------------------------------------------\nTime unit: ms\n---------------  ------  -----------------------------------------------  ---------------------------------------------  \nName             Calls   CPU Total / Avg / Max / Min / Ratio(%)           GPU Total / Avg / Max / Min / Ratio(%)         \n---------------  ------  -----------------------------------------------  ---------------------------------------------  \nProfileStep      4       18591.04 / 4647.76 / 14114.47 / 757.27 / 100.00  4924.90 / 1231.22 / 2853.61 / 682.04 / 100.00  \n  Dataloader     4       1.37 / 0.34 / 0.85 / 0.16 / 0.01                 0.00 / 0.00 / 0.00 / 0.00 / 0.00               \n  Forward        8       731.46 / 91.43 / 133.28 / 49.03 / 3.93           714.83 / 89.35 / 174.91 / 4.72 / 14.51         \n  Backward       4       671.82 / 167.96 / 168.29 / 167.52 / 3.61         1701.53 / 425.38 / 426.97 / 424.10 / 34.55     \n  Optimization   4       315.91 / 78.98 / 89.07 / 73.78 / 1.70            108.27 / 27.07 / 27.09 / 27.06 / 2.20          \n  Others         -       16870.48 / - / - / - / 90.75         
            2400.27 / - / - / - / 48.74                    \n---------------  ------  -----------------------------------------------  ---------------------------------------------  \n```\n\n**Operator Summary**\n\n```\n----------------------------------------------------------------Operator Summary-----------------------------------------------------------------\nTime unit: ms\n----------------------------------------------------  ------  -----------------------------------------  ----------------------------------------\nName                                                  Calls   CPU Total / Avg / Max / Min / Ratio(%)     GPU Total / Avg / Max / Min / Ratio(%)\n----------------------------------------------------  ------  -----------------------------------------  ----------------------------------------\n-----------------------------------------------------------Thread: All threads merged------------------------------------------------------------\nGradNodePyLayer_RecomputeFunction_backward            96      663.37 / 6.91 / 17.17 / 4.01 / 18.56       1629.87 / 16.98 / 17.41 / 16.69 / 26.98\n  TransformerDecoderLayer                             96      262.68 / 2.74 / 5.91 / 1.90 / 39.60        661.18 / 6.89 / 7.11 / 6.73 / 40.57\n  backward                                            96      318.62 / 3.32 / 10.57 / 1.31 / 48.03       968.69 / 10.09 / 10.31 / 9.91 / 59.43\nmatmul dygraph                                        2312    200.13 / 0.09 / 1.61 / 0.04 / 5.60         1487.76 / 0.64 / 9.81 / 0.22 / 24.63\n  matmul infer_meta                                   964     1.42 / 0.00 / 0.01 / 0.00 / 0.71           0.00 / 0.00 / 0.00 / 0.00 / 0.00\n  matmul compute                                      964     71.38 / 0.07 / 1.59 / 0.03 / 35.67         644.02 / 0.67 / 9.81 / 0.22 / 43.29\n    MEMSET                                            192     - / - / - / - / -                          0.42 / 0.00 / 0.00 / 0.00 / 0.07\n    
volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn      384     - / - / - / - / -                          199.35 / 0.52 / 0.83 / 0.22 / 30.95\n    volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_nn      384     - / - / - / - / -                          263.96 / 0.69 / 0.79 / 0.59 / 40.99\n    volta_h884gemm_64x128_ldg8_nn                     192     - / - / - / - / -                          141.13 / 0.74 / 0.92 / 0.61 / 21.91\n    void cutlass::Kernel<cutlass_70_tensorop_f16_...  4       - / - / - / - / -                          39.15 / 9.79 / 9.81 / 9.78 / 6.08\n  matmul node_creation                                676     2.05 / 0.00 / 0.03 / 0.00 / 1.02           0.00 / 0.00 / 0.00 / 0.00 / 0.00\n...\n```\n\n**Kernel Summary**\n```\n---------------------------------------------------------------Kernel Summary---------------------------------------------------------------\nTime unit: ms\n------------------------------------------------------------------------------------------  ------  ----------------------------------------\nName                                                                                        Calls   GPU Total / Avg / Max / Min / Ratio(%)\n------------------------------------------------------------------------------------------  ------  ----------------------------------------\nncclKernel_AllReduce_RING_LL_Sum_half(ncclWorkElem)                                         96      2360.57 / 24.59 / 2202.54 / 0.46 / 47.93\nvolta_fp16_s884gemm_fp16_256x128_ldg8_f2f_nn                                                384     263.96 / 0.69 / 0.79 / 0.59 / 5.36\nvolta_fp16_s884gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn                                    384     241.74 / 0.63 / 0.84 / 0.22 / 4.91\nvoid paddle::operators::VectorizedRandomGenerator<phi::dtype::float16, unsigned char>       580     209.08 / 0.36 / 0.97 / 0.06 / 4.25\nvolta_h884gemm_64x128_ldg8_nn                                                               288     203.89 / 0.71 / 0.92 / 0.57 / 
4.14\nvolta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn                                                384     199.35 / 0.52 / 0.83 / 0.22 / 4.05\nvolta_h884gemm_256x64_ldg8_tn                                                               288     149.52 / 0.52 / 0.54 / 0.45 / 3.04\nvoid phi::funcs::VectorizedBroadcastKernel<phi::dtype::float16, phi::dtype::float16, ph...  1352    123.12 / 0.09 / 0.40 / 0.05 / 2.50\nvoid paddle::operators::SoftmaxMaskFuseUpperTriangleGPUKernel<phi::dtype::float16, 10>      192     122.37 / 0.64 / 0.66 / 0.60 / 2.48\nvoid cutlass::Kernel<cutlass_70_tensorop_f16_s884gemm_f16_256x128_nt_align8>                100     103.07 / 1.03 / 8.08 / 0.73 / 2.09\nvoid phi::funcs::VectorizedElementwiseKernel<phi::dtype::float16, paddle::operators::Cu...  292     90.80 / 0.31 / 0.83 / 0.06 / 1.84\nvolta_h884gemm_64x128_ldg8_nt                                                               192     79.76 / 0.42 / 0.43 / 0.40 / 1.62\nvoid Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eige...  576     75.36 / 0.13 / 0.20 / 0.07 / 1.53\n...\n```\n"
  },
  {
    "path": "projects/gpt/docs/inference.md",
    "content": "\n# 推理部署\n\n模型训练完成后，可使用飞桨高性能推理引擎Paddle Inference通过如下方式进行推理部署。\n\n## 1. 模型导出\n\n### 1.1 非量化模型导出\n\n以`GPT-3(345M)`模型为例，通过如下方式下载PaddleFleetX发布的训练好的权重。若你已下载或使用训练过程中的权重，可跳过此步。\n\n```bash\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_FP16.tar.gz\ntar -zxvf GPT_345M_FP16.tar.gz\n```\n\n通过如下方式进行推理模型导出\n导出单卡`GPT-3(345M)`模型：\n```bash\nsh projects/gpt/auto_export_gpt_345M_single_card.sh\n```\n\n导出单卡`GPT-3(6.7B)`模型：\n```bash\nsh projects/gpt/auto_export_gpt_6.7B_mp1.sh\n```\n\n导出8卡`GPT-3(175B)`模型：\n```bash\nsh projects/gpt/auto_export_gpt_175B_mp8.sh\n```\n\n### 1.2 量化模型导出\n\n导出单卡`GPT-3(345M)`量化模型：\n\n```shell\n# 为了方便快速体验，这里给出345M量化训练的模型，若已有量化模型，则无需下载\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_wo_analysis.tar\ntar xf GPT_345M_QAT_wo_analysis.tar\n\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/generation_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Engine.save_load.ckpt_dir='./GPT_345M_QAT_wo_analysis/'\n```\n\n导出单卡`GPT-3(6.7B)`量化模型：\n\n```shell\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/generation_qat_gpt_6.7B_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0\n```\n\n## 2. 推理部署\n\n模型导出后，可通过`tasks/gpt/inference.py`脚本进行推理部署。\n\n单卡推理\n```bash\nbash projects/gpt/inference_gpt_single_card.sh\n```\n\n多卡推理(以8卡为例)\n\n```bash\nexport CUDA_VISIBLE_DEVICES=\"0,1,2,3,4,5,6,7\"\nexport MP=8\nbash projects/gpt/inference_gpt_multigpu.sh\n```\n\n\n## 3. 
Benchmark\n- 导出模型\n修改配置文件\nPaddleFleetX/ppfleetx/configs/nlp/gpt/auto/generation_gpt_6.7B_mp1.yaml，将`Generation/early_finish`选项设置为False(关闭提前终止，仅适用于测速场景)\n\n执行导出\n```bash\nsh projects/gpt/auto_export_gpt_6.7B_mp1.sh\n```\n如果打开了topp_sampling,则需要安装自定义算子：\n```bash\ncd ppfleetx/ops && python setup_cuda.py install && cd ../..\n```\n\n- 运行benchmark脚本\n```\nbash projects/gpt/run_benchmark.sh\n```\n\n| 模型          | 输入长度 | 输出长度 | batch size | GPU卡数 | FP16推理时延 | INT8推理时延 |\n| :------------ | :------: | :------: | :--------: | :-----: | :----------: | :----------: |\n| GPT-3(345M)   |    128   |    8     |     1      |    1    |   18.91ms    |   18.30ms    |\n| GPT-3(345M)   |    128   |    8     |     2      |    1    |   20.01ms    |   18.88ms    |\n| GPT-3(345M)   |    128   |    8     |     4      |    1    |   20.83ms    |   20.77ms    |\n| GPT-3(345M)   |    128   |    8     |     8      |    1    |   24.06ms    |   23.90ms    |\n| GPT-3(345M)   |    128   |    8     |    16      |    1    |   29.32ms    |   27.95ms    |\n| GPT-3(6.7B)   |    128   |    8     |     1      |    1    |   84.93ms    |   63.96ms    |\n| GPT-3(6.7B)   |    128   |    8     |     2      |    1    |   91.93ms    |   67.25ms    |\n| GPT-3(6.7B)   |    128   |    8     |     4      |    1    |   105.50ms   |   78.98ms    |\n| GPT-3(6.7B)   |    128   |    8     |     8      |    1    |   138.56ms   |   99.54ms    |\n| GPT-3(6.7B)   |    128   |    8     |    16      |    1    |   204.33ms   |   140.97ms   |\n| GPT-3(175B)   |    128   |    8     |     1      |    8    |   327.26ms   |   230.11ms   |\n| GPT-3(175B)   |    128   |    8     |     2      |    8    |   358.61ms   |   244.23ms   |\n| GPT-3(175B)   |    128   |    8     |     4      |    8    |   428.93ms   |   278.63ms   |\n| GPT-3(175B)   |    128   |    8     |     8      |    8    |   554.28ms   |   344.00ms   |\n| GPT-3(175B)   |    128   |    8     |    16      |    8    |   785.92ms   |   475.19ms   
|\n\n以上性能数据基于PaddlePaddle[每日版本](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-develop) ，依赖CUDA 11.6测试环境。\n"
  },
  {
    "path": "projects/gpt/docs/quantization_aware_training.md",
    "content": "\n# GPT模型量化训练\n\n本项目对语言模型 GPT 进行量化训练。目前，PaddleFleetX 提供了 [GPT-345M量化模型](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_w_analysis.tar) 的预训练模型文件；基于 [LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl)，采用 ACC(accuracy) 指标后的评估结果如下：\n\n| **模型文件** | **数据类型** | **ACC** |\n|---------|-----------|---------------|\n| GPT-345M | FP16 |  44.17%  |\n| GPT-345M | INT8 |  44.94%  |\n\n下面是本例涉及的文件及说明：\n\n```text\n.\n├── qat_gpt_345M_single_card.sh            # 单卡345M模型量化训练入口\n├── qat_gpt_345M_mp8.sh                    # 8卡345M模型模型并行量化训练入口\n├── qat_gpt_6.7B_sharding16.sh             # 16卡6.7B模型分组切片并行量化训练入口\n├── eval_qat_gpt_345M_single_card.sh       # 单卡345M量化模型验证入口\n├── export_qat_gpt_345M_single_card.sh     # 单卡345M量化模型导出入口\n\n```\n\n\n### 环境依赖和数据准备\n环境依赖和数据准备请参考[GPT文档](./README.md)。\n\n另外，模型导出还依赖于`ppfleetx-ops`的安装\n\n```\ncd PaddleFleetX/ # 如果已在此目录下，则忽略\ncd ppfleetx/ops && python setup_cuda.py install && cd ../..\n```\n\n### 预训练模型准备\n量化训练需加载[GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型。\n\n**预训练模型下载命令**\n```shell\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar xf GPT_345M.tar.gz\n```\n\n### 量化训练\n\n- [345M模型单卡训练](../qat_gpt_345M_single_card.sh)\n\n快速启动：\n```shell\nbash ./projects/gpt/qat_gpt_345M_single_card.sh\n```\n\n或如下启动：\n```shell\nexport CUDA_VISIBLE_DEVICES=0\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\npython ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/qat_gpt_345M_single_card.yaml \\\n    -o Engine.max_steps=100000 \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o Optimizer.weight_decay=0.02 \\\n    -o Optimizer.lr.max_lr=5.0e-6 \\\n    -o Optimizer.lr.min_lr=1.0e-6 \\\n    -o Compress.pretrained='./PaddleFleetX_GPT_345M_220826'\n    \n```\n\n- [345M模型模型并行训练](../qat_gpt_345M_mp8.sh)\n\n快速启动：\n```shell\nbash 
./projects/gpt/qat_gpt_345M_mp8.sh\n```\n\n或如下启动：\n```shell\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/qat_gpt_345M_mp8.yaml \\\n    -o Engine.max_steps=100000 \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o Optimizer.weight_decay=0.02 \\\n    -o Optimizer.lr.max_lr=5.0e-6 \\\n    -o Optimizer.lr.min_lr=1.0e-6 \\\n    -o Compress.pretrained='./PaddleFleetX_GPT_345M_220826'\n```\n\nTips：尽管设置的最大训练轮数为100000轮，但实验经验4000轮即可达到最优效果。\n\n\n### 量化训练精度调优\n针对生成式预训练语言模型的模型压缩一直是学界上的难点，潜在的原因目前并不清楚。经我们研究分析发现，生成式预训练语言模型的Transformer层的权重分布差异较大，且由于生成式预训练语言模型的从左到右预测的性质，量化误差会逐步累积，精度损失较大。为了保证量化模型的精度，PaddleSlim提供量化训练敏感度分析工具，可以有效定位模型某层带来的量化损失较大，以规避一些敏感层并提高量化模型精度。\n\nPaddleSlim中的量化训练敏感度分析工具仅支持静态图模型，需要将量化模型导出为静态图模型。导出命令为：\n\n```shell\n# 下载未经过分析的量化模型\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_wo_analysis.tar\ntar xf GPT_345M_QAT_wo_analysis.tar\n\nexport CUDA_VISIBLE_DEVICES=0\n\npython ./tools/export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/export_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Engine.save_load.ckpt_dir='./GPT_345M_QAT_wo_analysis/'\n```\n注意：此处导出的并非GenerationModule，而是可用于验证的GPTModule。\n\n具体步骤可参考\n[GPT量化训练敏感度分析示例](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/example/quantization_analysis/GPT/README.md)。\n\n\n\n### 模型验证\n```shell\n# 下载验证数据\nwget https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl\n\n# 下载已经训练好的量化模型\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_w_analysis.tar\ntar xf GPT_345M_QAT_w_analysis.tar\n\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/eval.py \\\n    -c 
./ppfleetx/configs/nlp/gpt/eval_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Engine.save_load.ckpt_dir='./GPT_345M_QAT_w_analysis' \\\n    -o Offline_Eval.eval_path=./lambada_test.jsonl \\\n    -o Offline_Eval.cloze_eval=True \n```\n\n### 模型导出\n```shell\n# 下载已经训练好的量化模型，若已有量化模型，不需要下载\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_QAT_wo_analysis.tar\ntar xf GPT_345M_QAT_wo_analysis.tar\n\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/generation_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Engine.save_load.ckpt_dir='./GPT_345M_QAT_wo_analysis/'\n```\n"
  },
  {
    "path": "projects/gpt/docs/single_card.md",
    "content": "# GPT 单卡模型训练\n\n## 运行方式\n\n本文档按照345M和1.3B规模大小，给出32G V100环境下GPT模型单卡训练的策略配置如下：\n\n| 模型规模 | 训练策略       | yaml文件                    | 显存占用 |\n|----------|----------------|-------------------------------|----------|\n| 345M     | fp16           | pretrain_gpt_345M_single_card.yaml | 30.9GB   |\n| 1.3B     | fp16+recompute | pretrain_gpt_1.3B_single_card.yaml | 26.0GB   |\n\n**启动命令**\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\n# 345M\npython tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml\n\n# 1.3B\npython tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_single_card.yaml\n```\n\n若要在显存容量更小的16G V100环境下进行GPT模型单机训练，可通过减小`Model.hidden_size`调整模型规模至合适大小，或使用重计算等显存优化策略再启动训练，命令如下：\n\n```shell\n# 345M\npython tools/train.py \\\n    -c ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml \\\n    -o Model.use_recompute=True\n\n# 1.3B\npython tools/train.py \\\n    -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_single_card.yaml \\\n    -o Model.hidden_size=1024\n```\n\n**运行日志**\n\n```\n[2022-09-21 05:45:27,009] [    INFO] - [train] epoch: 0, batch: 0, loss: 10.999595642, avg_batch_cost: 2.53083 sec, speed: 0.40 step/s, ips_total: 3237 tokens/s, ips: 3237 tokens/s, learning rate: 2.77778e-08\n[2022-09-21 05:45:27,518] [    INFO] - [train] epoch: 0, batch: 1, loss: 10.997043610, avg_batch_cost: 0.50907 sec, speed: 1.96 step/s, ips_total: 16092 tokens/s, ips: 16092 tokens/s, learning rate: 4.16667e-08\n[2022-09-21 05:45:28,021] [    INFO] - [train] epoch: 0, batch: 2, loss: 10.994422913, avg_batch_cost: 0.50265 sec, speed: 1.99 step/s, ips_total: 16298 tokens/s, ips: 16298 tokens/s, learning rate: 5.55556e-08\n[2022-09-21 05:45:28,526] [    INFO] - [train] epoch: 0, batch: 3, loss: 11.005314827, avg_batch_cost: 0.50378 sec, speed: 1.98 step/s, ips_total: 16261 tokens/s, ips: 16261 tokens/s, learning rate: 6.94444e-08\n[2022-09-21 05:45:29,029] [    INFO] - [train] epoch: 0, batch: 4, loss: 10.988020897, 
avg_batch_cost: 0.50237 sec, speed: 1.99 step/s, ips_total: 16307 tokens/s, ips: 16307 tokens/s, learning rate: 8.33333e-08\n[2022-09-21 05:45:29,531] [    INFO] - [train] epoch: 0, batch: 5, loss: 10.983006477, avg_batch_cost: 0.50179 sec, speed: 1.99 step/s, ips_total: 16326 tokens/s, ips: 16326 tokens/s, learning rate: 9.72222e-08\n[2022-09-21 05:45:30,035] [    INFO] - [train] epoch: 0, batch: 6, loss: 10.988540649, avg_batch_cost: 0.50379 sec, speed: 1.98 step/s, ips_total: 16261 tokens/s, ips: 16261 tokens/s, learning rate: 1.11111e-07\n[2022-09-21 05:45:30,540] [    INFO] - [train] epoch: 0, batch: 7, loss: 10.966930389, avg_batch_cost: 0.50387 sec, speed: 1.98 step/s, ips_total: 16258 tokens/s, ips: 16258 tokens/s, learning rate: 1.25000e-07\n[2022-09-21 05:45:31,044] [    INFO] - [train] epoch: 0, batch: 8, loss: 10.980175018, avg_batch_cost: 0.50365 sec, speed: 1.99 step/s, ips_total: 16265 tokens/s, ips: 16265 tokens/s, learning rate: 1.38889e-07\n[2022-09-21 05:45:31,562] [    INFO] - [train] epoch: 0, batch: 9, loss: 10.966150284, avg_batch_cost: 0.51796 sec, speed: 1.93 step/s, ips_total: 15816 tokens/s, ips: 15816 tokens/s, learning rate: 1.52778e-07\n```\n\n\n# GPT 单卡模型评估\n\n我们提供了对[WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip)、[LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl)两种数据集的评估脚本，其中数据集WikiText采用的是PPL(perplexity)评估指标，LAMBADA采用的是ACC(accuracy)指标。\n\n## 参数释义\n\n请在模型评估前将前述数据集下载到FleetX根目录下(WikiText数据集需要解压缩)，然后可以使用配置文件配置评估相关的参数，包括：\n\n```yaml\n  Offline_Eval:\n    eval_path: ./wikitext-103/wiki.valid.tokens\n    cloze_eval: False\n    overlapping_eval: 32\n    batch_size: 8\n    max_seq_len: 1024\n    logging_freq: 10\n```\n\n其中参数对应的释义如下：\n\n| **参数名**                      | **参数释义**          |\n|------------------------------|------------------------|\n| eval_path         | 评估数据集地址                      |\n| cloze_eval  | lambada数据集参数                     |\n| 
overlapping_eval  | wikitext数据集参数              |\n| batch_size         | 模型评估时batch size             |\n| max_seq_len        | 模型评估时文本序列长度           |\n| logging_freq     | 评估日志的打印频率                |\n\n## 运行方式\n\n以单卡345M模型评估为例，可以使用如下命令启动评估：\n\n### WikiText数据集评估\n\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n\nwget -O wikitext-103-v1.zip https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip\nunzip -q wikitext-103-v1.zip\n\nckpt_dir=ckpt/PaddleFleetX_GPT_345M_220826/\neval_dir=./wikitext-103\n\npython tools/eval.py -c ppfleetx/configs/nlp/gpt/eval_gpt_345M_single_card.yaml \\\n    -o Engine.save_load.ckpt_dir=$ckpt_dir \\\n    -o Offline_Eval.eval_path=$eval_dir/wiki.valid.tokens \\\n    -o Offline_Eval.overlapping_eval=32 \\\n    -o Offline_Eval.batch_size=16\n```\n\n评估日志如下：\n```shell\n[2022-09-21 05:28:26,263] [    INFO] - [eval] epoch: 0, batch: 0, loss: 0.170368048, speed: 0.29 step/s\n[2022-09-21 05:28:39,642] [    INFO] - [eval] epoch: 0, batch: 10, loss: 0.231640193, speed: 0.75 step/s\n[2022-09-21 05:28:53,469] [    INFO] - [eval] epoch: 0, batch: 20, loss: 0.292417919, speed: 0.72 step/s\n[2022-09-21 05:29:07,012] [    INFO] - [eval] epoch: 0, batch: 30, loss: 0.351391476, speed: 0.74 step/s\n[2022-09-21 05:29:27,359] [    INFO] - [eval] epoch: 0, batch: 40, loss: 0.415404772, speed: 0.49 step/s\n```\n\n评估结果如下：\n\n```shell\n[2022-09-21 05:40:32,820] [    INFO] - validation results on ./wikitext-103/wiki.valid.tokens | avg loss: 2.9554E+00 | ppl: 1.9210E+01 | adjusted ppl: 2.4948E+01 | token ratio: 1.0884484081583892\n```\n\n### LAMBADA数据集评估\n\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n\nwget -O lambada_test.jsonl 
https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl\n\nckpt_dir=ckpt/PaddleFleetX_GPT_345M_220826/\n\npython tools/eval.py -c ppfleetx/configs/nlp/gpt/eval_gpt_345M_single_card.yaml \\\n    -o Engine.save_load.ckpt_dir=$ckpt_dir \\\n    -o Offline_Eval.eval_path=./lambada_test.jsonl \\\n    -o Offline_Eval.cloze_eval=True \\\n    -o Offline_Eval.batch_size=16\n\n```\n\n评估日志如下：\n```shell\n[2022-09-21 05:18:24,152] [    INFO] - [eval] epoch: 0, batch: 0, number correct: 50.000000000, speed: 0.29 step/s\n[2022-09-21 05:18:37,264] [    INFO] - [eval] epoch: 0, batch: 10, number correct: 130.000000000, speed: 0.76 step/s\n[2022-09-21 05:18:50,408] [    INFO] - [eval] epoch: 0, batch: 20, number correct: 209.000000000, speed: 0.76 step/s\n[2022-09-21 05:19:03,578] [    INFO] - [eval] epoch: 0, batch: 30, number correct: 279.000000000, speed: 0.76 step/s\n[2022-09-21 05:19:16,760] [    INFO] - [eval] epoch: 0, batch: 40, number correct: 343.000000000, speed: 0.76 step/s\n```\n\n评估结果如下：\n\n```shell\n[2022-09-21 05:25:28,662] [    INFO] - validation results on ./lambada_test.jsonl | number correct: 2.1240E+03 | total examples: 5.1530E+03 | avg accuracy: 4.1219E-01\n```\n\n# GPT Zero-shot 文本生成\n\n## 参数释义\n\n```yaml\n  Generation:\n    top_k: 50\n    top_p: 0.75\n    temperature: 1.0\n    min_dec_len: 1\n    max_dec_len: 200\n    num_return_sequences: 1\n    decode_strategy: \"sampling\"\n```\n\n其中参数说明：\n\n| **参数名**      | **参数释义**                  |\n|--------------|---------------------------|\n| top_k | 每次为采样挑选保留分数最高的 k 个 token        |\n| top_p   | 如果设置小于 1.0 的小数，则保留加起来为 top_p 或更高的最可能的概率的 token。默认值为 1.0        |\n| temperature   |  调节下一个 token 的概率温度，logits = logits / temperature，默认值为 1.0           |\n| min_dec_len | 最小生成 token 长度              |\n| max_dec_len  | 最大生成 token 长度                     |\n| num_return_sequences  | 每个输入生成的序列个数，默认值为 1                  |\n| decode_strategy       | 解码策略，默认值为 \"sampling\"，目前只支持 \"sampling\"，未来会支持 
\"greedy_search\"，\"beam_search\" |\n\n## 文本生成\n\n下载预训练好的模型，快速体验文本生成\n\n### 快速体验文本生成\n\n\n```shell\ncd PaddleFleetX # 如果已在 PaddleFleetX 根目录下，则忽略\n\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n\npython tasks/gpt/generation.py \\\n    -c ppfleetx/configs/nlp/gpt/generation_gpt_345M_single_card.yaml \\\n    -o Engine.save_load.ckpt_dir=./ckpt/PaddleFleetX_GPT_345M_220826/\n\n# 生成的文本，由于 checkpoint 不同，超参不同，随机数不同，您执行可能会生成不一样的内容\n\nPrompt: Hi, GPT2. Tell me who Jack Ma is.\nGeneration: Hi, GPT2. Tell me who Jack Ma is. I don’t want to hear that.”\n\nFor now, the only question the crowd is asking is whether or not Jack Ma will step down from the board of directors of Alibaba.\n\nJack Ma on why he never wanted to run for President in 2016:\n\nThere were two reasons. One is that I wanted to spend more time with my family. I thought it was better to spend more time with my family and spend more time with my children. So it was a very personal reason. But the second reason was that I thought it would be difficult to get elected, because there are a lot of political interests in this country. So I thought it was better to spend more time with my family.\n\nOn how Alibaba will evolve into a new player in China’s transportation and logistics sector:\n\nI think that we are going to become a very important player in the logistics industry. 
So our strategy is to make it easy for people to travel.\n```\n\n### 剖析体验文本生成\n\n#### GPT 文本生成模块初始化\n\n```python\n    module = build_module(cfg)\n    module.model.eval()\n```\n\n#### 预训练模型加载\n\n```python\n    # 获取到预训练 checkpoint 的根目录\n    ckpt_dir = cfg.Engine.save_load.ckpt_dir\n\n    # 构造出具体路径\n    model_path = os.path.join(ckpt_dir, \"model.pdparams\")\n\n    # 加载模型参数\n    model_dict = paddle.load(model_path)\n\n    # FP16 模型参数转成 FP32 模型参数\n    for key, value in model_dict.items():\n        model_dict[key] = model_dict[key].astype(paddle.float32)\n\n    # 设置模型参数为预训练参数\n    module.model.set_state_dict(model_dict)\n```\n\n#### 文本生成与结果展示\n\n```python\n    input_text = \"Historical Records: Tell us about the history of the Great Wall.\"\n    result = module.generate(input_text)\n\n    print(f'Prompt: {input_text}')\n    print(f'Generation: {result[0]}')\n```\n"
  },
  {
    "path": "projects/gpt/docs/single_finetune.md",
    "content": "# GPT2 微调\n\n本教程主要针对于 GLUE (General Language Understanding Evaluation) benchmark 中的数据集进行微调，涉及到分类和回归任务。\n\n## 下载 GPT345M 预训练模型\n```\n# 如果已经下载可以忽略\nmkdir -p ckpt\nwget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar -xzf ckpt/GPT_345M.tar.gz -C ckpt/\n```\n\n## 快速体验运行\n\n```\n# cd path/to/PaddleFleetX\n# bash projects/gpt/finetune_gpt_345M_single_card.sh taskname [split]\n\n# taskname 可选: CoLA, SST2, MRPC, QQP, STSB, MNLI, QNLI, RTE, WNLI\n# 例如 bash projects/gpt/finetune_gpt_345M_single_card.sh CoLA\n\n# 注：当数据集为 MNLI 时，验证集有两种，分别是 dev_matched 和 dev_mismatched，\n# 其他数据集，只有一种验证集，因此不用选择\n# 可以通过 bash projects/gpt/finetune_gpt_345M_single_card.sh MNLI dev_matched\n# 或者 bash projects/gpt/finetune_gpt_345M_single_card.sh MNLI dev_mismatched\n# 进行 finetune 训练\n\nbash projects/gpt/finetune_gpt_345M_single_card.sh SST2\n```\n\n## GLUE benchmark 数据集\n\nGLUE benchmark 包含 9 个数据集，分别是 **CoLA**、**SST-2**、**MRPC**、**QQP**、**STS-B**、**MNLI**、**QNLI**、**RTE**、**WNLI**，涉及到 **自然语言推断**，**文本蕴含**，**情感分析**，**语义相似** 等任务，整体可以归位 3 类，分别是单句任务：CoLA、SST-2；相似性：MRPC、QQP、STS-B；释义：MNLI、QNLI、RTE、WNLI。\n\n以下介绍载自 [huggingface](https://huggingface.co/datasets/glue/blob/main/glue.py).\n\n* CoLA: The Corpus of Linguistic Acceptability consists of English acceptability judgments drawn from books and journal articles on linguistic theory. Each example is a sequence of words annotated with whether it is a grammatical English sentence.\n* SST-2: The Stanford Sentiment Treebank consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence. 
We use the two-way (positive/negative) class split, and use only sentence-level labels.\n* MRPC: The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.\n* QQP: The Quora Question Pairs2 dataset is a collection of question pairs from the community question-answering website Quora. The task is to determine whether a pair of questions are semantically equivalent.\n* STS-B: The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of sentence pairs drawn from news headlines, video and image captions, and natural language inference data. Each pair is human-annotated with a similarity score from 1 to 5.\n* MNLI: The Multi-Genre Natural Language Inference Corpus is a crowdsourced collection of sentence pairs with textual entailment annotations. Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The premise sentences are gathered from ten different sources, including transcribed speech, fiction, and government reports. We use the standard test set, for which we obtained private labels from the authors, and evaluate on both the matched (in-domain) and mismatched (cross-domain) section. We also use and recommend the SNLI corpus as 550k examples of auxiliary training data.\n* QNLI: The Stanford Question Answering Dataset is a question-answering dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question (written by an annotator). 
We convert the task into sentence pair classification by forming a pair between each question and each sentence in the corresponding context, and filtering out pairs with low lexical overlap between the question and the context sentence. The task is to determine whether the context sentence contains the answer to the question. This modified version of the original task removes the requirement that the model select the exact answer, but also removes the simplifying assumptions that the answer is always present in the input and that lexical overlap is a reliable cue.\n* RTE: The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli et al., 2009).4 Examples are constructed based on news and Wikipedia text. We convert all datasets to a two-class split, where for three-class datasets we collapse neutral and contradiction into not entailment, for consistency.\n* WNLI: The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task in which a system must read a sentence with a pronoun and select the referent of that pronoun from a list of choices. The examples are manually constructed to foil simple statistical methods: Each one is contingent on contextual information provided by a single word or phrase in the sentence. To convert the problem into sentence pair classification, we construct sentence pairs by replacing the ambiguous pronoun with each possible referent. The task is to predict if the sentence with the pronoun substituted is entailed by the original sentence. We use a small evaluation set consisting of new examples derived from fiction books that was shared privately by the authors of the original corpus. While the included training set is balanced between two classes, the test set is imbalanced between them (65% not entailment). 
Also, due to a data quirk, the development set is adversarial: hypotheses are sometimes shared between training and development examples, so if a model memorizes the training examples, they will predict the wrong label on corresponding development set example. As with QNLI, each example is evaluated separately, so there is not a systematic correspondence between a model's score on this task and its score on the unconverted original task. We call converted dataset WNLI (Winograd NLI).\n\n\n## 微调相关类\n\n### `GPTForSequenceClassification`\n在 GPT 模型输出的 logits 基础上，增加一个分类层，并且用正态分布对新增的层参数进行初始化。\n\n```\nself.score = nn.Linear(self.gpt.hidden_size, num_classes, bias_attr=False)\n\nfrom paddle.nn.initializer import Normal\nnormal_ = Normal(std=self.gpt.initializer_range)\nnormal_(self.score.weight)\n```\n\n### `GPTFinetuneModule`\n该类继承自`BasicModule`，负责微调模型的初始化以及逻辑计算的类，需要实现几个重要的函数，下面给出两个具体的示例。 \n\n* `__init__`: 负责初始化 loss 函数以及评测指标函数。\n* `get_model`: 负责微调类 `GPTForSequenceClassification`、`GPTTokenizer` 初始化以及预训练模型的加载。\n\n## 超参数\n微调训练也需要一套完整的超参数，但是微调涉及的核心超参数并不多。\n\n### Engine\n\n| 参数字段 | 参数含义 |\n| ------ | --------|\n|run_mode| 运行的模式，需要设置为 epoch 方式|\n|num_train_epochs| 需要 finetune 的 epoch 数 |\n\n```\nEngine:\n  run_mode: epoch\n  num_train_epochs: 3 # WNLI 和 MRPC 数据集比较小，因此 `num_train_epochs=5`。\n```\n\n### Model\n\n| 参数字段 | 参数含义 |\n| ------ | --------|\n|module| 需要设置为 \"GPTFinetuneModule\" |\n|name | 需要设置为 \"GPT\" |\n|num_classes | finetune 时的类别数，根据语料库以及任务来设定 |\n|pretrained | 预训练的权重文件路径前缀，去掉 \".pdparams\" |\n|loss.train.name | finetune 时的训练损失函数类名 |\n|loss.eval.name | finetune 时的验证损失函数类名 |\n|metric.eval.name | finetune 时的验证评估函数类名 |\n\n微调时，不同任务对应的类别数 和 loss 函数以及评测指标不同，因此需要通过配置来改变设置。\n```\nModel:\n  module: \"GPTFinetuneModule\"\n  name: \"GPT\"\n  num_classes: 2 # 1 or 2 or 3\n  pretrained: 'path/to/pretrained_model'\n  \n  loss:\n    train:\n      name: 'CrossEntropyLoss'\n    eval:\n      name: 'CrossEntropyLoss'\n  \n  metric:\n    eval:\n      name: 'Accuracy'\n```\n\n### 
Optimizer 和 LRScheduler\n\n| 参数字段 | 参数含义 |\n| ------ | --------|\n|name| 优化器类名 |\n|weight_decay| 权重衰减值 |\n|beta1| FusedAdamW 的 beta1 |\n|beta2| FusedAdamW 的 beta2 |\n|epsilon| FusedAdamW 的 epsilon |\n|multi_precision| 当使用 FP16 O2 级别时，是否开启参数使用多精度表示 |\n|tensor_fusion| 是否开启 tensor_fusion |\n|lr.name| 学习率调整策略类名 |\n|lr.warmup| 当参数时小数时，表示 warmup 步数占总步数的比例，如果是整数时，则表示 warmup 的步数 |\n|lr.learning_rate| 初始化学习率值 |\n\n注：这里的超参会跟随优化器类的不同而不同，可以自行查看优化器类和学习率调整策略类初始化函数需要设置的超参数设定。\n\n```\nOptimizer:\n  name: FusedAdamW\n  weight_decay: 0.0\n  beta1: 0.9\n  beta2: 0.999\n  epsilon: 1e-6\n  multi_precision: True\n  tensor_fusion: False\n  lr:\n    name: LinearDecayWithWarmup\n    warmup: 0.1\n    learning_rate: 2e-5\n```\n\n### Data\n\n| 参数字段 | 参数含义 |\n| ------ | --------|\n|Train.dataset| 描述 finetune 时的数据集 |\n|Train.sampler| 描述 dataloader 所需要的 batch sampler |\n|Train.loader| 描述 dataloader 所需要的相关信息，例如 num_workers 等 |\n\n注：数据集的设定会根据不同任务不同语料库不同而设定不同，例如 `split` 字段，不同数据集是有不同的设定，请参考所需要 finetune 的数据集初始化函数。\n\n```\nData:\n  Train:\n    dataset:\n      name: SST2\n      root: ./dataset/SST-2/\n      split: 'train'\n      max_length: 128\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32\n      shuffle: True\n      drop_last: True\n    loader:\n      num_workers: 4\n      return_list: False\n  \n  Eval:\n    dataset:\n      name: SST2\n      root: ./dataset/SST-2/\n      split: 'dev'\n      max_length: 128\n    sampler:\n      name: DistributedBatchSampler\n      batch_size: 32\n      shuffle: False\n      drop_last: False\n    loader:\n      num_workers: 4\n      return_list: False\n```\n\n## 运行\n\nGLUE benchmark 上的语料库 finetune，大部分设置相同，可以同享一份，只有少量区别处需要改变，因此可以通过超参数的覆盖方式来设置。\n\n数据集加载时会自动判断是否已经缓存下载，如果未缓存下载会自行下载，请保证网络的畅通。当自动下载失败时，可以尝试多次以及检查是否有代理设置等。当下载失败时，也可以自己下载及解压到对应的目录中。\n\n以下是 GLUE benchmark 上的每个语料库的 finetune 单机单卡启动命令：\n\n### CoLA 数据集\n```\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=CoLA 
\\\n  -o Data.Train.dataset.root=./dataset/cola_public/ \\\n  -o Data.Eval.dataset.name=CoLA \\\n  -o Data.Eval.dataset.root=./dataset/cola_public/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.metric.train.name=Mcc \\\n  -o Model.metric.eval.name=Mcc\n  -o Model.num_classes=2\n```\n\n### SST2 数据集\n```\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=SST2 \\\n  -o Data.Train.dataset.root=./dataset/SST-2/ \\\n  -o Data.Eval.dataset.name=SST2 \\\n  -o Data.Eval.dataset.root=./dataset/SST-2/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2\n```\n\n### MRPC 数据集\n```\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Engine.num_train_epochs=5 \\\n  -o Data.Train.dataset.name=MRPC \\\n  -o Data.Train.dataset.root=./dataset/MRPC/ \\\n  -o Data.Eval.dataset.name=MRPC \\\n  -o Data.Eval.dataset.root=./dataset/MRPC/ \\\n  -o Data.Eval.dataset.split=test \\\n  -o Model.num_classes=2 \\\n  -o Model.metric.train.name=AccuracyAndF1 \\\n  -o Model.metric.eval.name=AccuracyAndF1\n```\n\n### QQP 数据集\n```\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=QQP \\\n  -o Data.Train.dataset.root=./dataset/QQP/ \\\n  -o Data.Eval.dataset.name=QQP \\\n  -o Data.Eval.dataset.root=./dataset/QQP/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2 \\\n  -o Model.metric.train.name=AccuracyAndF1 \\\n  -o Model.metric.eval.name=AccuracyAndF1\n```\n\n### STSB 数据集\n```\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=STSB \\\n  -o Data.Train.dataset.root=./dataset/STS-B/ \\\n  -o Data.Eval.dataset.name=STSB \\\n  -o Data.Eval.dataset.root=./dataset/STS-B/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=1 \\\n  -o Model.metric.train.name=PearsonAndSpearman \\\n  
-o Model.metric.eval.name=PearsonAndSpearman \\\n  -o Model.loss.train.name=MSELoss \\\n  -o Model.loss.eval.name=MSELoss\n```\n\n### MNLI 数据集\n\n注：MNLI 数据集验证集分为 `dev_matched` 和 `dev_mismatched`，目前暂不支持两个集合同时评测，如果要评测两种验证集，有两种方法：\n\n* 分别 finetune 2次，Data.Eval.dataset.split 设置不同的验证集\n* 保存 finetune 后的 checkpoint，在不同验证集上离线评测。\n\n\n```\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=MNLI \\\n  -o Data.Train.dataset.root=./dataset/multinli_1.0 \\\n  -o Data.Eval.dataset.name=MNLI \\\n  -o Data.Eval.dataset.root=./dataset/multinli_1.0 \\\n  -o Data.Eval.dataset.split=dev_matched \\\n  -o Model.num_classes=3\n```\n\n### QNLI 数据集\n```\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=QNLI \\\n  -o Data.Train.dataset.root=./dataset/QNLI/ \\\n  -o Data.Eval.dataset.name=QNLI \\\n  -o Data.Eval.dataset.root=./dataset/QNLI/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2\n```\n\n### RTE 数据集\n```\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Data.Train.dataset.name=RTE \\\n  -o Data.Train.dataset.root=./dataset/RTE/ \\\n  -o Data.Eval.dataset.name=RTE \\\n  -o Data.Eval.dataset.root=./dataset/RTE/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2\n```\n\n### WNLI 数据集\n```\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n  -o Engine.num_train_epochs=5 \\\n  -o Data.Train.dataset.name=WNLI \\\n  -o Data.Train.dataset.root=./dataset/WNLI/ \\\n  -o Data.Eval.dataset.name=WNLI \\\n  -o Data.Eval.dataset.root=./dataset/WNLI/ \\\n  -o Data.Eval.dataset.split=dev \\\n  -o Model.num_classes=2\n```\n\n\n## 运行结果\n\n以下的指标是通过 [GPT_345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 预训练模型 finetune 得到的结果，仅作为参考。\n\n| Corpus | Task                | Domanin            | Metric        
               | Result                       |\n| ------ | ------------------- | ------------------ | ---------------------------- | ---------------------------- |\n| CoLA   | acceptability       | Misc.              | Matthews corr                | 0.60471                      |\n| SST-2  | sentiment           | Movie reviews      | Accuracy                     | 0.93005                      |\n| MNLI   | NLI                 | Misc.              | Matched acc./Mismatched acc. | 0.84238/0.84815              |\n| QNLI   | QA/NLI              | Wikipedia          | Accuracy                     | 0.90445                      |\n| RTE    | NLI                 | News, Wikipedia    | Accuracy                     | 0.70397                      |\n| WNLI   | coreference         | Books              | Accuracy                     | 0.40845                      |\n| MRPC   | paraphrase          | News               | Accuracy/F1                  | 0.81913/0.87022              |\n| QQP    | paraphrase          | social QA question | Accuracy/F1                  | 0.86087/0.81055              |\n| STS-B  | sentence similarity | Misc.              | Pearson/Spearman corr.       | 0.85797/0.85824              |\n"
  },
  {
    "path": "projects/gpt/docs/structured_pruning.md",
    "content": "# GPT模型结构化稀疏\n\n本项目对语言模型 GPT 进行结构化稀疏（以下简称稀疏）。在 GPT 模型中，我们对 fused-qkv、out-linear、ffn1 和 ffn2 四层的权重进行了通道稀疏，其中，fused-qkv 和 ffn1 是在输出通道进行稀疏，out-linear 和 ffn2 是在输入通道进行稀疏。如果您需要自定义稀疏的层和通道，可以通过重写 ppfleetx/utils/compression_helper.py 中的 get_pruned_params() 函数实现。\n\n下面是本例涉及的文件及说明：\n\n```text\n.\n├── prune_gpt_345M_single_card.sh            # 单卡345M稀疏训练入口\n├── eval_prune_gpt_345M_single_card.sh       # 单卡345M稀疏模型验证入口\n├── export_prune_gpt_345M_single_card.sh     # 单卡345M稀疏模型导出入口\n```\n\n\n### 环境依赖和数据准备\n环境依赖和数据准备请参考[GPT训练文档](./README.md)。\n\n特别的，本示例需要依赖 PaddleSlim develop版本。安装命令如下：\n\n```shell\ngit clone https://github.com/PaddlePaddle/PaddleSlim.git && cd PaddleSlim\npip install -r requirements.txt\npython setup.py install\n```\n\n\n### 预训练模型准备\n稀疏训练需加载[GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型。\n\n**预训练模型下载命令**\n```shell\nwget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz\ntar xf GPT_345M.tar.gz\n```\n\n### 稀疏训练\n\n- [345M模型稀疏训练](../gpt/prune_gpt_345M_single_card.sh)\n\n快速启动：\n```shell\nbash ./projects/gpt/prune_gpt_345M_single_card.sh\n```\n\n或如下启动：\n```shell\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/prune_gpt_345M_single_card.yaml \\\n    -o Engine.max_steps=100000 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o Optimizer.weight_decay=0.0 \\\n    -o Optimizer.lr.max_lr=2.5e-5 \\\n    -o Optimizer.lr.min_lr=5.0e-6 \\\n    -o Compress.pretrained='./PaddleFleetX_GPT_345M_220826'\n    \n```\n\n### 模型验证\n```shell\n# 下载验证数据\nwget https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/eval.py \\\n    -c ./ppfleetx/configs/nlp/gpt/eval_pruned_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Engine.save_load.ckpt_dir='./output' \\\n    -o Offline_Eval.eval_path=./lambada_test.jsonl \\\n    -o 
Offline_Eval.cloze_eval=True\n```\n\n### 模型导出\n```shell\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/generation_pruned_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Engine.save_load.ckpt_dir='./output'\n```\n"
  },
  {
    "path": "projects/gpt/eval_prune_gpt_345M_single_card.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0\n\npython ./tools/eval.py \\\n    -c ./ppfleetx/configs/nlp/gpt/eval_pruned_gpt_345M_single_card.yaml\n"
  },
  {
    "path": "projects/gpt/eval_qat_gpt_345M_single_card.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0\n\npython ./tools/eval.py \\\n    -c ./ppfleetx/configs/nlp/gpt/eval_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Engine.save_load.ckpt_dir='./GPT_345M_QAT_w_analysis/' \\\n    -o Offline_Eval.eval_path=./lambada_test.jsonl \\\n    -o Offline_Eval.cloze_eval=True\n"
  },
  {
    "path": "projects/gpt/evaluate_gpt_345M_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/eval.py -c ./ppfleetx/configs/nlp/gpt/eval_gpt_345M_single_card.yaml\n"
  },
  {
    "path": "projects/gpt/export_gpt_345M_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/export.py -c ./ppfleetx/configs/nlp/gpt/generation_gpt_345M_single_card.yaml\n"
  },
  {
    "path": "projects/gpt/export_prune_gpt_345M_single_card.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0\n\npython ./tools/export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/generation_pruned_gpt_345M_single_card.yaml\n"
  },
  {
    "path": "projects/gpt/export_qat_gpt_345M_single_card.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0\n\n\n# 导出可验证模型\n# python ./tools/export.py \\\n#     -c ./ppfleetx/configs/nlp/gpt/export_qat_gpt_345M_single_card.yaml \\\n#     -o Model.hidden_dropout_prob=0.0 \\\n#     -o Model.attention_probs_dropout_prob=0.0 \\\n#     -o Engine.save_load.ckpt_dir='./GPT_345M_QAT_w_analysis/'\n\n# 导出可生成句子模型\npython ./tools/export.py \\\n    -c ./ppfleetx/configs/nlp/gpt/generation_qat_gpt_345M_single_card.yaml \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Engine.save_load.ckpt_dir='./GPT_345M_QAT_wo_analysis/'\n"
  },
  {
    "path": "projects/gpt/finetune_gpt_345M_single_card.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0\n\n# Single-Sentence Tasks\nif [ $1 == \"CoLA\" ]\nthen\n    python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=CoLA \\\n      -o Data.Train.dataset.root=./dataset/cola_public/ \\\n      -o Data.Eval.dataset.name=CoLA \\\n      -o Data.Eval.dataset.root=./dataset/cola_public/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.metric.train.name=Mcc \\\n      -o Model.metric.eval.name=Mcc \\\n      -o Model.num_classes=2\nelif [ $1 == \"SST2\" ]\nthen\n    python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=SST2 \\\n      -o Data.Train.dataset.root=./dataset/SST-2/ \\\n      -o Data.Eval.dataset.name=SST2 \\\n      -o Data.Eval.dataset.root=./dataset/SST-2/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=2\n# Similarity and Paraphrase Tasks\nelif [ $1 == \"MRPC\" ]\nthen\n    python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Engine.num_train_epochs=5 \\\n      -o Data.Train.dataset.name=MRPC \\\n      -o Data.Train.dataset.root=./dataset/MRPC/ \\\n      -o Data.Eval.dataset.name=MRPC \\\n      -o Data.Eval.dataset.root=./dataset/MRPC/ \\\n      -o 
Data.Eval.dataset.split=test \\\n      -o Model.num_classes=2 \\\n      -o Model.metric.train.name=AccuracyAndF1 \\\n      -o Model.metric.eval.name=AccuracyAndF1\nelif [ $1 == \"QQP\" ]\nthen\n    python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=QQP \\\n      -o Data.Train.dataset.root=./dataset/QQP/ \\\n      -o Data.Eval.dataset.name=QQP \\\n      -o Data.Eval.dataset.root=./dataset/QQP/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=2 \\\n      -o Model.metric.train.name=AccuracyAndF1 \\\n      -o Model.metric.eval.name=AccuracyAndF1\nelif [ $1 == \"STSB\" ]\nthen\n    python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=STSB \\\n      -o Data.Train.dataset.root=./dataset/STS-B/ \\\n      -o Data.Eval.dataset.name=STSB \\\n      -o Data.Eval.dataset.root=./dataset/STS-B/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=1 \\\n      -o Model.metric.train.name=PearsonAndSpearman \\\n      -o Model.metric.eval.name=PearsonAndSpearman \\\n      -o Model.loss.train.name=MSELoss \\\n      -o Model.loss.eval.name=MSELoss\n# Inference Tasks\nelif [ $1 == \"MNLI\" ]\nthen\n    python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=MNLI \\\n      -o Data.Train.dataset.root=./dataset/multinli_1.0 \\\n      -o Data.Eval.dataset.name=MNLI \\\n      -o Data.Eval.dataset.root=./dataset/multinli_1.0 \\\n      -o Data.Eval.dataset.split=${2:-\"dev_matched\"} \\\n      -o Model.num_classes=3\nelif [ $1 == \"QNLI\" ]\nthen\n    python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=QNLI \\\n      -o Data.Train.dataset.root=./dataset/QNLI/ \\\n      -o Data.Eval.dataset.name=QNLI \\\n      -o 
Data.Eval.dataset.root=./dataset/QNLI/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=2\nelif [ $1 == \"RTE\" ]\nthen\n    python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Data.Train.dataset.name=RTE \\\n      -o Data.Train.dataset.root=./dataset/RTE/ \\\n      -o Data.Eval.dataset.name=RTE \\\n      -o Data.Eval.dataset.root=./dataset/RTE/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=2\nelif [ $1 == \"WNLI\" ]\nthen\n    python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml \\\n      -o Engine.num_train_epochs=5 \\\n      -o Data.Train.dataset.name=WNLI \\\n      -o Data.Train.dataset.root=./dataset/WNLI/ \\\n      -o Data.Eval.dataset.name=WNLI \\\n      -o Data.Eval.dataset.root=./dataset/WNLI/ \\\n      -o Data.Eval.dataset.split=dev \\\n      -o Model.num_classes=2\nelse\n   echo \"Task name not recognized, please input CoLA, SST2, MRPC, QQP, STSB, MNLI, QNLI, RTE, WNLI.\"\nfi\n"
  },
  {
    "path": "projects/gpt/inference.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport time\nimport argparse\nimport numpy as np\n\nimport paddle\nimport paddle.distributed.fleet as fleet\nfrom ppfleetx.data import build_dataloader, tokenizers\nfrom ppfleetx.core.engine.inference_engine import InferenceEngine\nimport ppfleetx_ops\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--mp_degree\", default=1, type=int, help=\"\")\n    parser.add_argument(\n        \"--model_dir\", default=\"output\", type=str, help=\"model directory\")\n\n    args = parser.parse_args()\n    return args\n\n\ndef main():\n\n    args = parse_args()\n\n    fleet.init(is_collective=True)\n    infer_engine = InferenceEngine(args.model_dir, args.mp_degree)\n\n    tokenizer = tokenizers.GPTTokenizer.from_pretrained(\"gpt2\")\n    input_text = 'Hi, GPT2. Tell me where is Beijing?'\n    ids = [tokenizer.encode(input_text)]\n\n    # run test\n\n    outs = infer_engine.predict([ids])\n\n    ids = list(outs.values())[0]\n    out_ids = [int(x) for x in ids[0]]\n    result = tokenizer.decode(out_ids)\n    result = input_text + result\n\n    print('Prompt:', input_text)\n    print('Generation:', result)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "projects/gpt/inference_gpt_6.7B_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_mp1\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir=$log_dir  projects/gpt/inference.py --mp_degree 1 --model_dir output\n"
  },
  {
    "path": "projects/gpt/inference_gpt_multigpu.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_mp1\nrm -rf $log_dir\n\nexport CUDA_VISIBLE_DEVICES=\"0,1,2,3,4,5,6,7\"\nexport MP=8\n\npython -m paddle.distributed.launch --devices \"0,1,2,3,4,5,6,7\" projects/gpt/inference.py --mp_degree ${MP} --model_dir output\n"
  },
  {
    "path": "projects/gpt/inference_gpt_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_mp1\nrm -rf $log_dir\n\nexport CUDA_VISIBLE_DEVICES=0\npython -m paddle.distributed.launch --devices \"0\"  projects/gpt/inference.py --mp_degree 1 --model_dir output\n"
  },
  {
    "path": "projects/gpt/pretrain_gpt_1.3B_dp8.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\n# 1.3B+dp8 run_pretrain\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml\n"
  },
  {
    "path": "projects/gpt/pretrain_gpt_1.3B_single_card.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_single_card.yaml \n"
  },
  {
    "path": "projects/gpt/pretrain_gpt_175B_mp8_pp16.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\n# 175B+mp8_pp16 run_pretrain\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_175B_mp8_pp16.yaml\n"
  },
  {
    "path": "projects/gpt/pretrain_gpt_345M_single_card.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0\npython ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml \n"
  },
  {
    "path": "projects/gpt/pretrain_gpt_6.7B_sharding16.sh",
    "content": "#! /bin/bash\n# Runs the \"6.7B\" parameter model\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\n# 6.7B+sharding16 run_pretrain\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml\n"
  },
  {
    "path": "projects/gpt/prune_gpt_345M_single_card.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0\n\npython ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/prune_gpt_345M_single_card.yaml\n"
  },
  {
    "path": "projects/gpt/qat_gpt_345M_mp8.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/qat_gpt_345M_mp8.yaml \\\n    -o Engine.max_steps=100000 \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o Optimizer.weight_decay=0.02 \\\n    -o Optimizer.lr.max_lr=5.0e-6 \\\n    -o Optimizer.lr.min_lr=1.0e-6 \\\n    -o Compress.pretrained='./PaddleFleetX_GPT_345M_220826'\n"
  },
  {
    "path": "projects/gpt/qat_gpt_345M_single_card.sh",
    "content": "\n#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nexport CUDA_VISIBLE_DEVICES=0\n\npython ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/qat_gpt_345M_single_card.yaml \\\n    -o Engine.max_steps=100000 \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o Optimizer.weight_decay=0.02 \\\n    -o Optimizer.lr.max_lr=5.0e-6 \\\n    -o Optimizer.lr.min_lr=1.0e-6 \\\n    -o Compress.pretrained='./PaddleFleetX_GPT_345M_220826'\n"
  },
  {
    "path": "projects/gpt/qat_gpt_6.7B_sharding16.sh",
    "content": "#! /bin/bash\n# Runs the \"6.7B\" parameter model\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_hybrid\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/nlp/gpt/qat_gpt_6.7B_sharding16.yaml \\\n    -o Engine.max_steps=100000 \\\n    -o Model.hidden_dropout_prob=0.0 \\\n    -o Model.attention_probs_dropout_prob=0.0 \\\n    -o Optimizer.lr.decay_steps=72000 \\\n    -o Optimizer.weight_decay=0.02 \\\n    -o Optimizer.lr.max_lr=5.0e-6 \\\n    -o Optimizer.lr.min_lr=1.0e-6 \\\n    -o Compress.pretrained='./PaddleFleetX_GPT_6.7B'\n"
  },
  {
    "path": "projects/gpt/run_benchmark.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# for mp=8(GPT 175b)\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch --devices \"0,1,2,3,4,5,6,7\" projects/gpt/benchmark.py --seq_len 128 --iter 10 --mp_degree 8 --model_dir ./output\n\n# for mp=1(GPT 6.7B & GPT 345M)\nexport CUDA_VISIBLE_DEVICES=0\npython -m paddle.distributed.launch --devices \"0\" projects/gpt/benchmark.py --seq_len 128 --iter 10 --mp_degree 1 --model_dir ./output\n"
  },
  {
    "path": "projects/imagen/README.md",
    "content": "\n<h1>Imagen</h1>\n<h3>Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding</h3>\n\n\n*  Paddle implementation of [Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding](https://arxiv.org/pdf/2205.11487.pdf). Google's Text-to-Image Diffusion Models that beat DALL-E2.\n\n\n## Updates\n\n***20/September/2022:***  The code of Text-to-image and Super Resolution model is released.\n\n\n\n## Introduction\nImagen is a text-to-image diffusion model with an unprecedented degree of photorealism and a deep level of language understanding. Imagen builds on the power of large transformer language models in understanding text and hinges on the strength of diffusion models in high-fidelity image generation. Imagen utilizes a pipeline of a base 64 × 64 model, and two text-conditional super-resolution diffusion models to upsample a 64 × 64 generated image into a 256 × 256 image, and then to 1024 × 1024 image.\n<br />  \nIn comparison to previous text-to-image diffusion generation methods (e.g., DALL-E2) that take advantage of multi-modal embeddings such as CLIP, Imagen benefits largely from the use of large pre-trained language models.\n\n<div align=center><img src=\"./demo/Imagen_theme.png\" width=\"40%\"></div>\n\n## Usage\n\n### Data preparing\nImagen needs text-image pairs for the training loop. For scaling purpose, we provide a [demo dataset](https://paddlefleetx.bj.bcebos.com/data/laion400m/part-00079) in which textual embeddings and masks are precomputed.\n```\ncp part-00079 PaddleFleetX/projects/imagen\n``` \n### Imagen text encoder preparing\nImagen needs to load a pretrained text encoder model for the training loop. 
T5 and\nDeBERTa V2 are provided for Imagen.\n#### T5-11B\n``` \n# T5 tokenizer and model was converted from Huggingface.\nconfig.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/config.json\nspiece.model: wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/spiece.model\ntokenizer.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/tokenizer.json\nt5 model: wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.0\n          wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.1\n          wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.2\n          wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.3\n          wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.4\n          cat t5.pd.tar.gz.* |tar -xf - \nput them into t5 folder like this:\nPaddleFleetX/projects/imagen/t5\n                 ├── t5-11b\n                    ├── config.json\n                    ├── spiece.model\n                    ├── t5.pd\n                    └── tokenizer.json\n``` \n\n#### DeBERTa V2 1.5B\n```\n# DeBERTa V2 tokenizer and model was converted from Huggingface.\nconfig.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/config.json\nspm.model: wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/spm.model\ntokenizer_config.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/tokenizer_config.json\ndeberta v2 model: wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.0\n                  wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.1\n                  cat debertav2.pd.tar.gz.* | tar -xf -\nput them into cache folder like this:\nPaddleFleetX/projects/imagen/cache\n                  └── deberta-v-xxlarge\n                      ├── config.json\n                      ├── debertav2.pd\n                      ├── spm.model\n                      ├── tokenizer_config.json\n```\n### Train Imagen with T5-11B text encoder\n```\ncd PaddleFleetX/\n```\nTrain Imagen 
text-to-image 64×64 397M diffusion model with a single gpu.\n```\nsh projects/imagen/run_text2im_397M_64x64_single_card.sh\n```\nTrain Imagen text-to-image 64×64 397M diffusion model with 128 gpus.\n \n```\nsh projects/imagen/run_text2im_397M_64x64_dp128.sh\n```\nTrain Imagen text-to-image 64×64 2B diffusion model with 256 gpus.\n \n- The 2B parameters diffusion model uses Group Sharded data parallelism techniques to eliminate memory redundancies by partitioning the optimizer states, gradients, and parameters across multiple devices.\n\n \n```\ncd PaddleFleetX/\nsh projects/imagen/run_text2im_2B_64x64_T5-11B_sharding8_dp32.sh\n```\n### Train DeBERTaV2 1.5B Imagen diffusion model with 8 gpus.\n```\ncd PaddleFleetX/\nsh projects/imagen/run_text2im_64x64_DebertaV2_dp8.sh\n```\n### Train Imagen Super Resolution 256×256 diffusion model.\nTrain Imagen Super Resolution 256×256 diffusion model with a single gpu.\n```\ncd PaddleFleetX/\nsh projects/imagen/run_super_resolution_256_single_card.sh\n```\nTrain Imagen Super Resolution 256×256 diffusion model with 128 gpus.\n```\ncd PaddleFleetX/\nsh projects/imagen/run_super_resolution_256_dp128.sh\n```\nTrain Imagen Super Resolution 1024×1024 diffusion model with 128 gpus.\n- The 1024x1024 super resolution diffusion model uses checkpointing techniques to eliminate intermediate variable memory redundancies.\n```\ncd PaddleFleetX/\nsh projects/imagen/run_super_resolution_1024_sharding128.sh\n```\n\n## Citing Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding \n```\n@article{saharia2022photorealistic,\n  title={Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding},\n  author={Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S. Sara Mahdavi, Rapha Gontijo Lopes, Tim Salimans, Jonathan Ho, David J Fleet, Mohammad Norouzi},\n  journal={arXiv preprint arXiv:2205.11487},\n  year={2022}\n}\n```\n"
  },
  {
    "path": "projects/imagen/filelist/laion_400M/train",
    "content": "projects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-000
79\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects
/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\nprojects/imagen/part-00079\n"
  },
  {
    "path": "projects/imagen/run_super_resolution_1024_sharding128.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_sharding\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml \\\n    -o Distributed.sharding.sharding_stage=2 \\\n    -o Distributed.sharding.sharding_degree=8 \\\n    -o Engine.mix_precision.enable=False \\\n    -o Data.Train.loader.batch_size=1 \\\n    -o Model.use_recompute=True \\\n"
  },
  {
    "path": "projects/imagen/run_super_resolution_256_dp128.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_sharding\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml \\\n    -o Distributed.dp_degree=128\n"
  },
  {
    "path": "projects/imagen/run_super_resolution_256_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython3 tools/train.py -c ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml\n"
  },
  {
    "path": "projects/imagen/run_text2im_2B_64x64_T5-11B_sharding8_dp32.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_sharding\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/train.py \\\n    -c ./ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml \\\n    -o Distributed.sharding.sharding_stage=2 \\\n    -o Distributed.dp_degree=32 \\\n    -o Distributed.sharding.sharding_degree=8 \n"
  },
  {
    "path": "projects/imagen/run_text2im_397M_64x64_dp128.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_dp128\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n  tools/train.py \\\n  -c ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml \\\n  -o Distributed.dp_degree=128 \n"
  },
  {
    "path": "projects/imagen/run_text2im_397M_64x64_single_card.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\npython3 tools/train.py -c ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml\n"
  },
  {
    "path": "projects/imagen/run_text2im_64x64_DebertaV2_dp8.sh",
    "content": "#! /bin/bash\n\n# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nlog_dir=log_dp8\nrm -rf $log_dir\n\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n  tools/train.py \\\n  -c ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_DebertaV2.yaml \\\n  -o Distributed.dp_degree=8\n"
  },
  {
    "path": "projects/moco/README.md",
    "content": "# MoCo\n![MoCo](https://user-images.githubusercontent.com/11435359/71603927-0ca98d00-2b14-11ea-9fd8-10d984a2de45.png)\n\nThis is a PaddlePaddle implementation of the \n[MoCov1](https://arxiv.org/abs/1911.05722), \n[MoCov2](https://arxiv.org/abs/2003.04297).\n\n\n## Install Preparation\n\nMoCo requires `PaddlePaddle >= 2.4`.\n```shell\n# git clone https://github.com/PaddlePaddle/PaddleFleetX.git\ncd /path/to/PaddleFleetX\n```\n\nAll commands are executed in the `PaddleFleetX` root directory.\n\n```shell\npython -m pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple\n```\n\n## Data Preparation\n\nThe imagenet 1k dataset needs to be prepared first and will be organized into the following directory structure.\n\n```shell\nILSVRC2012\n├── train/\n├── xxx\n├── val/\n└── xxx\n```\n\nThen configure the path.\n\n```shell\nmkdir -p dataset\nln -s /path/to/ILSVRC2012 dataset/ILSVRC2012\n```\n\n## Unsupervised Training\n\nTo do unsupervised pre-training of a ResNet-50 model on ImageNet in an 8-gpu machine, you can run the script: \n\n### MoCo V1 (Single Node with 8 GPUs)\n```shell\nexport PADDLE_NNODES=1\nexport PADDLE_MASTER=\"127.0.0.1:12538\"\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch \\\n    --nnodes=$PADDLE_NNODES \\\n    --master=$PADDLE_MASTER \\\n    --devices=$CUDA_VISIBLE_DEVICES \\\n    tools/train.py -c ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml\n```\n\n### MoCo V2 (Single Node with 8 GPUs)\n```shell\nexport PADDLE_NNODES=1\nexport PADDLE_MASTER=\"127.0.0.1:12538\"\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch \\\n    --nnodes=$PADDLE_NNODES \\\n    --master=$PADDLE_MASTER \\\n    --devices=$CUDA_VISIBLE_DEVICES \\\n    tools/train.py -c ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml\n```\n\n\nThe differences between MoCo v1 and MoCo v2 are as follows:\n* MoCo v2 has a projector\n* Data augmentation\n* Softmax temperature\n* Learning rate 
scheduler\n\n## Linear Classification\n\nWhen the unsupervised pre-training is complete, or directly download the provided pre-training checkpoint, you can use the following script to train a supervised linear classifier.\n\n### MoCo v1\n\n#### [Optional] Download checkpoint\n```shell\nmkdir -p pretrained/moco/\nwget -O ./pretrained/moco/mocov1_pt_imagenet2012_resnet50.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_pt_imagenet2012_resnet50.pdparams\n```\n\n#### Linear Classification Training (Single Node with 8 GPUs)\n\n```shell\nexport PADDLE_NNODES=1\nexport PADDLE_MASTER=\"127.0.0.1:12538\"\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch \\\n    --nnodes=$PADDLE_NNODES \\\n    --master=$PADDLE_MASTER \\\n    --devices=$CUDA_VISIBLE_DEVICES \\\n    tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \\\n    -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov1_pt_imagenet2012_resnet50\n\n```\n\n### MoCo v2\n\n#### [Optional] Download checkpoint\n```shell\nmkdir -p pretrained/moco/\nwget -O ./pretrained/moco/mocov2_pt_imagenet2012_resnet50.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.pdparams\n```\n\n#### Linear Classification Training (Single Node with 8 GPUs)\n\n```shell\nexport PADDLE_NNODES=1\nexport PADDLE_MASTER=\"127.0.0.1:12538\"\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch \\\n    --nnodes=$PADDLE_NNODES \\\n    --master=$PADDLE_MASTER \\\n    --devices=$CUDA_VISIBLE_DEVICES \\\n    tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \\\n    -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov2_pt_imagenet2012_resnet50\n\n```\n\n## Models\n\n| Model   | Phase                 | Epochs | Top1 Acc | Checkpoint                                                   | Log                                                          |\n| ------- | --------------------- | 
------ | -------- | ------------------------------------------------------------ | ------------------------------------------------------------ |\n| MoCo v1 | Unsupervised Training | 200    | -        | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_pt_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_pt_imagenet2012_resnet50.log) |\n| MoCo v1 | Linear Classification | 100    | 0.606141 | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_lincls_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_lincls_imagenet2012_resnet50.log) |\n| MoCo v2 | Unsupervised Training | 200    | -        | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.log) |\n| MoCo v2 | Linear Classification | 100    | 0.676595 | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_lincls_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_lincls_imagenet2012_resnet50.log) |\n\n\n## Citations\n\n```\n@Article{he2019moco,\n  author  = {Kaiming He and Haoqi Fan and Yuxin Wu and Saining Xie and Ross Girshick},\n  title   = {Momentum Contrast for Unsupervised Visual Representation Learning},\n  journal = {arXiv preprint arXiv:1911.05722},\n  year    = {2019},\n}\n\n@Article{chen2020mocov2,\n  author  = {Xinlei Chen and Haoqi Fan and Ross Girshick and Kaiming He},\n  title   = {Improved Baselines with Momentum Contrastive Learning},\n  journal = {arXiv preprint arXiv:2003.04297},\n  year    = {2020},\n}\n```\n"
  },
  {
    "path": "projects/moco/run_mocov1_lincls_in1k.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport PADDLE_NNODES=1\nexport PADDLE_MASTER=\"127.0.0.1:12538\"\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch \\\n    --nnodes=$PADDLE_NNODES \\\n    --master=$PADDLE_MASTER \\\n    --devices=$CUDA_VISIBLE_DEVICES \\\n    tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \\\n    -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov1_pt_imagenet2012_resnet50\n"
  },
  {
    "path": "projects/moco/run_mocov1_pretrain_in1k.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport PADDLE_NNODES=1\nexport PADDLE_MASTER=\"127.0.0.1:12538\"\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch \\\n    --nnodes=$PADDLE_NNODES \\\n    --master=$PADDLE_MASTER \\\n    --devices=$CUDA_VISIBLE_DEVICES \\\n    tools/train.py -c ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml\n"
  },
  {
    "path": "projects/moco/run_mocov2_lincls_in1k.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport PADDLE_NNODES=1\nexport PADDLE_MASTER=\"127.0.0.1:12538\"\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch \\\n    --nnodes=$PADDLE_NNODES \\\n    --master=$PADDLE_MASTER \\\n    --devices=$CUDA_VISIBLE_DEVICES \\\n    tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \\\n    -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov2_pt_imagenet2012_resnet50\n"
  },
  {
    "path": "projects/moco/run_mocov2_pretrain_in1k.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport PADDLE_NNODES=1\nexport PADDLE_MASTER=\"127.0.0.1:12538\"\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch \\\n    --nnodes=$PADDLE_NNODES \\\n    --master=$PADDLE_MASTER \\\n    --devices=$CUDA_VISIBLE_DEVICES \\\n    tools/train.py -c ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml\n"
  },
  {
    "path": "projects/protein_folding/README.md",
    "content": "# Protein Folding\n\n声明: 本项目不提供具体能运行的蛋白质结构预测程序，如果想体验直接能运行的蛋白质结构预测代码，请跳转到\n[HelixFold](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold) 中运行。\n\n\n本项目是一个教程，展示如何将数据并行、动态轴并行、分支并行（DP-DAP-BP）混合并行接入到 HelixFold 中。\n想要在 HelixFold 中使用混合并行，则涉及到以下几个方面：\n\n* 依赖安装\n* 通信初始化\n* 混合并行网络模型使用\n* 优化器设置 DAP 和 BP 属性\n* 参数同步与梯度同步\n\n## 依赖安装\n```shell\npip install ppfleetx\n```\n\n## 通信初始化\n\n```python\nfrom ppfleetx.distributed.protein_folding import dp\nfrom ppfleetx.distributed.protein_folding.scg import scg\n\ndef init_distributed_env(args):\n    dp_rank = 0 # ID for current device in distributed data parallel collective communication group\n    dp_nranks = 1 # The number of devices in distributed data parallel collective communication group\n    if args.distributed:\n        # init bp, dap, dp hybrid distributed environment\n        scg.init_process_group(parallel_degree=[('dp', None), ('dap', args.dap_degree), ('bp', args.bp_degree)])\n\n        dp_nranks = dp.get_world_size()\n        dp_rank = dp.get_rank_in_group() if dp_nranks > 1 else 0\n\n        if args.bp_degree > 1 or args.dap_degree > 1:\n            assert args.seed is not None, \"BP and DAP should be set seed!\"\n\n    return dp_rank, dp_nranks\n```\n\n## 混合并行网络模型使用\n\n目前，在 HelixFold 网络模型中涉及到混合并行的有 Embedding 和 Evoformer 类，因此可以将原来 HelixFold 中的 `EmbeddingsAndEvoformer`\n修改为 `DistEmbeddingsAndEvoformer`。在网络模型中涉及 `DAP` 和 `BP` 的网络模型修改都在 [DistEmbeddingsAndEvoformer](../../ppfleetx/models/protein_folding/evoformer.py) 中封装，\n\n```python\nfrom ppfleetx.models.protein_folding.evoformer import DistEmbeddingsAndEvoformer \nevoformer = DistEmbeddingsAndEvoformer(\n    self.channel_num, self.config.embeddings_and_evoformer,\n    self.global_config)\n```\n\n## 优化器设置 DAP 和 BP 属性\n\n由于 `DAP` 和 `BP` 在网络模型中分别切分的是中间激活值和网络计算分支，参数是没有切分的，因此在梯度同步的时候，\n是需要区分同步的。我们将 `dap` 和 `bp` 属性设置在优化器参数分组中作为区分，并在后续梯度同步的时候使用。\n\n```python\nevoformer_params = []\ntemplate_and_pair_transition_params = 
[]\nother_params = []\nfor name, p in model.named_parameters():\n    if 'template_pair_stack' in name or 'pair_transition' in name:\n        template_and_pair_transition_params.append(p)\n    elif 'evoformer_iteration' in name or 'extra_msa_stack' in name:\n        evoformer_params.append(p)\n    else:\n        other_params.append(p)\nparameters = []\n\nif args.dap_degree > 1 or args.bp_degree > 1:\n    parameters.append({'params': get_fused_params(other_params)})\n    parameters.append({'params': get_fused_params(evoformer_params), 'dap': True, 'bp': True})\n    parameters.append({'params': get_fused_params(template_and_pair_transition_params), 'dap': True})\nelse:\n    parameters.append({'params': get_fused_params(other_params + evoformer_params + template_and_pair_transition_params)})\n\noptimizer = paddle.optimizer.Adam(\n        learning_rate=lr_scheduler, \n        epsilon=1e-06,\n        grad_clip=grad_clip,\n        parameters = parameters\n    )\n```\n\n## 参数同步与梯度同步\n\n### 参数同步\n\n虽然是 `DP-DAP-BP` 混合并行，但是每个设备上的模型参数是没有切分的，因为在模型训练之前也需要做一次参数同步。\n\n```python\nfrom ppfleetx.distributed.protein_folding import dp\n\nmodel = RunModel(train_config, model_config)\ndp.param_sync(model, src_rank=0)\n```\n\n### 梯度同步\n\n如上节所述，在梯度同步的时候需要分别对 `DP`，`DAP`，`BP` 并行策略相关的模型参数的梯度进行同步。\n\n```python\nfrom ppfleetx.distributed.protein_folding import dap, bp, dp\n\nloss.backward()\n\n# sync the gradient for branch parallel firstly\nbp.grad_sync(optimizer._param_groups)\n# then sync the gradient for dap\ndap.grad_sync(optimizer._param_groups)\n# finally sync the gradient for ddp\ndp.grad_sync(optimizer._param_groups)\n\noptimizer.step()\noptimizer.clear_grad()\n```\n\n## 论文引用\n```\n@article{wang2022helixfold,\n  title={HelixFold: An Efficient Implementation of AlphaFold2 using PaddlePaddle},\n  author={Wang, Guoxia and Fang, Xiaomin and Wu, Zhihua and Liu, Yiqun and Xue, Yang and Xiang, Yingfei and Yu, Dianhai and Wang, Fan and Ma, Yanjun},\n  journal={arXiv preprint 
arXiv:2207.05477},\n  year={2022}\n}\n\n@article{wang2022efficient_alphafold2,\n  title={Efficient AlphaFold2 Training using Parallel Evoformer and Branch Parallelism},\n  author={Wang, Guoxia and Wu, Zhihua and Fang, Xiaomin and Xiang, Yingfei and Liu, Yiqun and Yu, Dianhai and Ma, Yanjun},\n  journal={arXiv preprint arXiv:2211.00235},\n  year={2022}\n}\n```\n"
  },
  {
    "path": "projects/ufo2.0/README.md",
    "content": "# VIMER-UFO 2.0 (文心-CV大模型)\n## 整体概述\n近年来预训练大模型一次次刷新记录，展现出惊人的效果，但对于产业界而言，势必要面对如何应用落地的问题。当前预训练模型的落地流程可被归纳为：针对只有少量标注数据的特定任务，使用任务数据 fine-tune 预训练模型并部署上线。然而，当预训练模型参数量不断增大后，该流程面临两个严峻的挑战。首先，随着模型参数量的急剧增加，大模型 fine-tuning 所需要的计算资源将变得非常巨大，普通开发者通常无法负担。其次，随着 AIoT 的发展，越来越多 AI 应用从云端往边缘设备、端设备迁移，而大模型却无法直接部署在这些存储和算力都极其有限的硬件上。\n\n针对预训练大模型落地所面临的问题，百度提出统一特征表示优化技术（UFO：Unified Feature Optimization），在充分利用大数据和大模型的同时，兼顾大模型落地成本及部署效率。VIMER-UFO 2.0 技术方案的主要内容包括：\n  * Task MoE: 飞桨多任务超网络分布式训练架构，支持训练任务动态扩展，特定任务任意切分，保证多任务之间信息有效借鉴，负载均衡，高效协同。\n  * All in One：行业最大 170 亿参数视觉多任务模型，覆盖人脸、人体、车辆、商品、食物细粒度分类等 20+ CV 基础任务，单模型 28 个公开测试集效果 SOTA。\n  * One for All：首创针对视觉多任务的超网络与训练方案，支持各类任务、各类硬件的灵活部署，解决大模型参数量大，推理性能差的问题。\n\n![图1:UFO整体架构](./img/UFO_v2_1.png)\n\n## 模型效果\n文心VIMER-UFO 2.0大模型是基于飞桨的Task MoE架构构建多任务超网络，模型参数量达到170亿，单模型28项公开数据集SOTA。基于飞桨Task MoE架构，可以根据任务的不同自动选择激活最优的区域，从而实现100倍参数压缩，同时支持下游任务快速扩展，是行业最大的视觉多任务统一大模型。尽管 VIMER-UFO 2.0 大模型参数量达到了170 亿，得益于 Task-MoE 稀疏结构，每个任务推理时只需激活部分参数，计算量相当于 6 亿参数模型规模，加速比接近 30 倍。更多细节请参看[VIMER-UFO 2.0](https://github.com/PaddlePaddle/VIMER/tree/develop/UFO)。\n\n![图2:UFO_Result](./img/UFO_v2_2.png)\n\n## 飞桨Task MoE分布式训练架构\n如此大的参数规模和任务数，给模型的训练带来了巨大的挑战。文心VIMER-UFO 2.0大模型采用稀疏门控混合专家设计，仅参数存储就需要68G，给训练时的模型存储带来了压力；该模型在前向反向时所有计算节点间会进行同步等待的All-to-All通信，使得通信负担明显加大；此外，该模型的多任务数目是动态的，且多个任务之间样本严重不均衡，使得计算节点之间的同步等待较长，影响并发效率。\n\n针对这些挑战，飞桨提出了Task MoE分布式训练架构，不仅实现多级并行存储稀疏参数，还支持硬件拓扑感知通信，使得层次化All-to-All通信效率提升20%。同时飞桨还创新性地提出了基于Task的负载均衡机制，支持任务数量的动态扩展、特定任务的任意切分以及多个任务在不同的专家下的并发训练，同等实验环境下训练性能比PyTorch提升66%。同时，该方案保障多任务之间信息借鉴机制的有效性，使得VIMER-UFO 2.0模型精度大幅提升。此外，在推理阶段，基于飞桨Task MoE架构构建的多任务多路径的超网络，可支持任务粒度的路径选择，方便灵活部署。\n\n![图3:UFO_Perf](./img/UFO_Perf.png)\n\n\n## 使用方案\n1. 有关UFO的更多细节原理请参看[VIMER-UFO 2.0](https://github.com/PaddlePaddle/VIMER/tree/develop/UFO)。\n2. VIMER-UFO 2.0 相关的模型、训练代码和评测脚本均已开源，更多细节正在逐渐完善中，了解详细信息可访问：https://github.com/PaddlePaddle/VIMER/tree/main/UFO/OneForAll\n"
  },
  {
    "path": "projects/vit/README.md",
    "content": "# Vision Transformer\n\nThis project implements the (Vision Transformer) proposed by google [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929).\n\n\n## How to pretrain from scratch on imagenet2012\n\n### Go to the main repo directory\nAll commands are executed in the home directory.\n```\ncd /path/to/PaddleFleetX\n```\n\n### Data\nThe imagenet 1k dataset needs to be prepared first and will be organized into the following directory structure.\n\n```\nILSVRC2012\n├── train/\n├── train_list.txt\n├── val/\n└── val_list.txt\n```\n\nThen configure the path.\n\n```shell\nmkdir -p dataset\nln -s /path/to/ILSVRC2012 dataset/ILSVRC2012\n```\n\n### Train ViT-B/16\n\nNote: ViT-B/16 needs run on 2 nodes with 16 A100 GPUs. If you only have a low-memory GPU, you can use gradient accumulation by setting `accumulate_steps` in yaml.\n\n\nThe following commands need to be run on each node.\n```shell\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml\n```\n\n## Finetune ViT-B/16\n\n### [Optional] Download checkpoint\n```shell\nmkdir -p pretrained/vit/\nwget -O ./pretrained/vit/imagenet2012-ViT-B_16-224.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams\n```\n\n\n### Finetune on imagenet2012\nFinetune is similar to pre-training on ImageNet2012 dataset, we have provided the configured yaml file.\n\n```shell\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml\n```\n\n### Finetune on cifar10\n\nNote: CIFAR10 dataset is automatically downloaded and cached.\n\n```shell\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_cifar10_1n8c_dp_fp16o2.yaml\n```\n\n### Quantization Aware Training 
on ImageNet2012\n\n\n```shell\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py \\\n    -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml \\\n    -o Model.model.drop_rate=0.0 \\\n    -o Data.Train.sampler.batch_size=16 \\\n    -o Optimizer.lr.learning_rate=5e-05 \\\n    -o Optimizer.weight_decay=0.0002 \n```\n\n量化训练的参数详细介绍见[模型压缩介绍](../../../docs/compression.md)。\n\n\n## Model\n\n| Model    | Phase    | Size   | Dataset      | Resolution | GPUs        | Img/sec | Top1 Acc | Pre-trained checkpoint                                                                             | Fine-tuned checkpoint | Log                                                                                      |\n|----------|----------|--------|--------------|------------|-------------|---------|----------|----------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------|\n| ViT-B_16 | pretrain | 167MiB | ImageNet2012 | 224        | A100*N2C16  | 7350    | 74.75%   | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams) | -                                                                                               | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.log) |\n| ViT-B_16 | finetune | 167MiB | ImageNet2012 | 384        | A100*N2C16  | 1580    | 77.68%   | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams) | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-384.pdparams)          | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-384.log) |\n| ViT-L_16 | finetune | 582MiB | ImageNet2012 | 384        | 
A100*N2C16  | 519     | 85.13%   | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet21k-jax-ViT-L_16-224.pdparams) | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet21k+imagenet2012-ViT-L_16-384.pdparams)          | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet21k+imagenet2012-ViT-L_16-384.log) |\n| Quantized ViT-B_16 | finetune | 167MiB | ImageNet2012 | 384         | A100*N2C16  | 1580     |  77.71%  | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-384.pdparams) | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/quantized_imagenet2012-ViT-B_16-384.pdparams)          | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/quantized_imagenet2012-ViT-B_16-384.log) |\n\n\n\n# 推理部署\n\n参考[这里](./docs/inference.md)\n\n"
  },
  {
    "path": "projects/vit/auto_vit_patch16_224_dp8.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nlog_dir=log_auto\nrm -rf $log_dir\n\n# tiny_patch16_224+dp8 run_pretrain\npython -m paddle.distributed.launch --log_dir $log_dir --devices \"0,1,2,3,4,5,6,7\" \\\n    ./tools/auto.py \\\n    -c ppfleetx/configs/vis/vit/auto/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml\n"
  },
  {
    "path": "projects/vit/docs/inference.md",
    "content": "# 推理部署\n\n模型训练完成后，可使用飞桨高性能推理引擎Paddle Inference通过如下方式进行推理部署。\n\n```bash\nsh projects/vit/run_inference_base_patch16_224.sh\n```\n分解步骤如下：\n\n## 1. 模型导出\n\n首先将模型导出为用于部署的推理模型，可通过`tools/export.py`进行模型导出，通过`-c`指定需要导出的模型的配置文件，通过`-o Engine.save_load.ckpt_dir=`指定导出模型时使用的权重。\n\n以`VIT-224`模型为例，通过如下方式下载PaddleFleetX发布的训练好的权重。若你已下载或使用训练过程中的权重，可跳过此步。\n\n```bash\nmkdir -p ckpt\nwget -O ckpt/model.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams\n```\n\n通过如下方式进行推理模型导出\n\n```bash\npython tools/export.py \\\n    -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml \\\n    -o Engine.save_load.ckpt_dir=./ckpt/\n```\n\n导出的模型默认保存在`./output`目录，可通过配置文件中`Engine.save_load.output_dir`或通过`-o Engine.save_load.output_dir=`指定\n\n\n## 2. 推理部署\n\n模型导出后，可通过`projects/vit/inference.py`脚本进行推理部署。\n\n```bash\npython projects/vit/inference.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml\n```"
  },
  {
    "path": "projects/vit/export_qat.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/export.py \\\n    -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml \\\n    -o Model.model.drop_rate=0.0 \\\n    -o Data.Train.sampler.batch_size=16 \\\n    -o Optimizer.lr.learning_rate=5e-05 \\\n    -o Optimizer.weight_decay=0.0002\n"
  },
  {
    "path": "projects/vit/inference.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport numpy as np\nfrom PIL import Image\nimport paddle\n\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))\n\nfrom ppfleetx.utils import config\nfrom ppfleetx.distributed.apis import env\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.data import build_dataloader, tokenizers\nfrom ppfleetx.models import build_module\nfrom ppfleetx.core import EagerEngine\n\ndef softmax(x):\n    exp_x = np.exp(x)\n    return exp_x/np.sum(exp_x)\n\ndef preprocess(img_path):\n        \"\"\"preprocess\n        Preprocess to the input.\n        Args: img_path: Image path.\n        Returns: Input data after preprocess.\n        \"\"\"\n        with open(img_path, \"rb\") as f:\n            img = Image.open(f)\n            img = img.convert(\"RGB\")\n        # ResizeImage\n        img = img.resize((224,224), Image.BILINEAR)\n\n        # NormalizeImage\n        scale = np.float32(1.0/255.0)\n        mean = [0.5, 0.5, 0.5]\n        std = [0.5, 0.5, 0.5]\n        shape = (1, 1, 3)\n        mean = np.array(mean).reshape(shape).astype('float32')\n        std = 
np.array(std).reshape(shape).astype('float32')\n        img = (img * scale - mean) / std\n\n        # ToNCHW\n        img = img.transpose((2, 0, 1))\n        img = np.expand_dims(img, axis=0)\n        return img\n\nif __name__ == \"__main__\":\n    args = config.parse_args()\n    cfg = config.get_config(args.config, overrides=args.override, show=False)\n    env.set_seed(cfg.Global.seed)\n    np.random.seed(1)\n    img_path = 'projects/vit/images/demo.jpg'\n    img = preprocess(img_path)\n    \n    if(os.path.exists('shape.pbtxt')==False):\n        cfg.Inference.TensorRT.collect_shape = True\n        module = build_module(cfg)\n        engine = EagerEngine(configs=cfg,module=module, mode='inference')\n        outs = engine.inference([img])\n\n    cfg.Inference.TensorRT.collect_shape = False\n    module = build_module(cfg)\n    config.print_config(cfg)\n    engine = EagerEngine(configs=cfg,module=module, mode='inference')\n    outs = engine.inference([img])\n    res = softmax(outs['linear_99.tmp_1'])\n    max_index = np.argmax(res, axis=-1)\n    print(\"类型: \", max_index[0],)\n    print(\"概率: \", res[0][max_index[0]])\n\n    \n"
  },
  {
    "path": "projects/vit/run_finetune.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml\n#python -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py -c ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml\n"
  },
  {
    "path": "projects/vit/run_finetune_fused_attention.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py \\\n       -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml \\\n       -o Model.model.use_fused_attn=True\n"
  },
  {
    "path": "projects/vit/run_inference_base_patch16_224.sh",
    "content": "echo \"step 1: download parameters\"\nmkdir -p ckpt\nwget -O ckpt/model.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams\n\necho \"step 2: export model\"\npython tools/export.py \\\n    -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml \\\n    -o Engine.save_load.ckpt_dir=./ckpt/\n\necho \"step 3: run VIT inference\"\npython projects/vit/inference.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml\n"
  },
  {
    "path": "projects/vit/run_pretrain.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml\n"
  },
  {
    "path": "projects/vit/run_pretrained_fused_attention.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py \\\n       -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml \\\n       -o Model.model.use_fused_attn=True\n"
  },
  {
    "path": "projects/vit/run_qat.sh",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nexport CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\npython -m paddle.distributed.launch --gpus=\"0,1,2,3,4,5,6,7\" tools/train.py \\\n    -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml \\\n    -o Model.model.drop_rate=0.0 \\\n    -o Data.Train.sampler.batch_size=16 \\\n    -o Optimizer.lr.learning_rate=5e-05 \\\n    -o Optimizer.weight_decay=0.0002\n"
  },
  {
    "path": "requirements.txt",
    "content": "paddleslim @ https://paddle-qa.bj.bcebos.com/PaddleSlim/paddleslim-0.0.0.dev0-py3-none-any.whl\npaddlenlp @ https://paddlenlp.bj.bcebos.com/wheels/paddlenlp-ci-py3-none-any.whl\nrequests==2.25.1\nregex==2022.7.25\ncolorlog==6.6.0\ncolorama==0.4.5\nomegaconf==2.2.2\ntqdm>=4.62.1\npybind11==2.10.0\nnumpy>=1.19.5,<=1.21.6\nopencv-python>=4.2.0.32\nPillow==9.3.0\nblobfile==1.3.3\n"
  },
  {
    "path": "setup.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom setuptools import setup, Extension, find_packages\n\nfrom ppfleetx.data.data_tools.cpp.compile import compile_helper\ncompile_helper()\n\n\ndef fetch_requirements(path):\n    with open(path, 'r') as fd:\n        return [r.strip() for r in fd.readlines()]\n\n\ninstall_requires = fetch_requirements('requirements.txt')\n\nsetup(\n    name='ppfleetx',\n    version='0.0.0',\n    description='PaddleFleetX',\n    author='PaddlePaddle Authors',\n    url='https://github.com/PaddlePaddle/PaddleFleetX',\n    install_requires=install_requires,\n    package_data={\n        'ppfleetx.data.data_tools.cpp': ['fast_index_map_helpers.so']\n    },\n    packages=find_packages())\n"
  },
  {
    "path": "tasks/gpt/generation.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nimport math\nimport os\nimport random\nimport time\nimport sys\nimport yaml\nimport numpy as np\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))\n\nfrom ppfleetx.utils import config\nfrom ppfleetx.models import build_module\nfrom ppfleetx.distributed.apis import env\n\nif __name__ == \"__main__\":\n    args = config.parse_args()\n    cfg = config.get_config(args.config, overrides=args.override, show=False)\n\n    if dist.get_world_size() > 1:\n        env.init_dist_env(cfg)\n\n    env.set_seed(cfg.Global.seed)\n\n    module = build_module(cfg)\n    config.print_config(cfg)\n\n    module.model.eval()\n\n    ckpt_dir = cfg.Engine.save_load.ckpt_dir\n    if ckpt_dir is not None:\n        model_path = os.path.join(ckpt_dir, \"model.pdparams\")\n        model_dict = paddle.load(model_path)\n\n        for key, value in model_dict.items():\n            model_dict[key] = model_dict[key].astype(paddle.float32)\n\n        module.model.set_state_dict(model_dict)\n\n    input_text = 'Hi, GPT2. Tell me who Jack Ma is.'\n    result = module.generate(input_text)\n\n    print(f'Prompt: {input_text}')\n    print(f'Generation: {result[0]}')\n"
  },
  {
    "path": "tasks/gpt/inference.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))\n\nfrom ppfleetx.utils import config\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.data import build_dataloader, tokenizers\nfrom ppfleetx.models import build_module\nfrom ppfleetx.core import EagerEngine\nfrom ppfleetx.distributed.apis import env\n\nif __name__ == \"__main__\":\n    args = config.parse_args()\n    cfg = config.get_config(args.config, overrides=args.override, show=False)\n\n    if dist.get_world_size() > 1:\n        env.init_dist_env(cfg)\n\n    env.set_seed(cfg.Global.seed)\n\n    module = build_module(cfg)\n    config.print_config(cfg)\n\n    tokenizer = tokenizers.GPTTokenizer.from_pretrained(\"gpt2\")\n    engine = EagerEngine(configs=cfg, module=module, mode='inference')\n\n    input_text = 'Hi, GPT2. 
Tell me who Jack Ma is.'\n    input_ids = [tokenizer.encode(input_text)]\n\n    outs = engine.inference([input_ids])\n\n    ids = list(outs.values())[0]\n    out_ids = [int(x) for x in ids[0]]\n    result = tokenizer.decode(out_ids)\n    result = input_text + result\n\n    print('Prompt:', input_text)\n    print('Generation:', result)\n"
  },
  {
    "path": "tasks/gpt/run_generation.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# for single card generation\n\nexport CUDA_VISIBLE_DEVICES=0\npython tasks/gpt/generation.py -c ./ppfleetx/configs/nlp/gpt/generation_gpt_345M_single_card.yaml\n"
  },
  {
    "path": "tools/auto.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport copy\nimport random\nimport paddle\nimport numpy as np\nimport paddle.distributed as dist\n\nfrom paddle.distributed import fleet\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\n\nfrom ppfleetx.utils import config\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.models import build_module\nfrom ppfleetx.data import build_auto_dataset\nfrom ppfleetx.core import AutoEngine\n\n#init_logger()\n\nif __name__ == \"__main__\":\n    args = config.parse_args()\n    cfg = config.get_auto_config(\n        args.config, overrides=args.override, show=False)\n\n    if dist.get_world_size() > 1:\n        fleet.init(is_collective=True)\n\n    module = build_module(cfg)\n    config.print_config(cfg)\n\n    train_data = build_auto_dataset(cfg.Data, \"Train\")\n    eval_data = build_auto_dataset(cfg.Data, \"Eval\")\n\n    cfg.Optimizer.lr.update({\n        'epochs': cfg.Engine.num_train_epochs,\n        'step_each_epoch': len(train_data)\n    })\n\n    engine = AutoEngine(configs=cfg, module=module)\n\n    if cfg.Engine.save_load.ckpt_dir is not None:\n        engine.load()\n\n    if cfg.get('Tuning', None) and cfg.Tuning.enable:\n    
    engine.tune(train_data)\n    else:\n        engine.fit(train_dataset=train_data,\n                   valid_dataset=eval_data,\n                   epoch=cfg.Engine.num_train_epochs)\n"
  },
  {
    "path": "tools/auto_export.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport copy\nimport random\nimport paddle\nimport numpy as np\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\n\nfrom ppfleetx.utils import config\nfrom ppfleetx.models import build_module\nfrom ppfleetx.core import AutoEngine\n\nif __name__ == \"__main__\":\n    args = config.parse_args()\n    cfg = config.get_auto_config(\n        args.config, overrides=args.override, show=False)\n\n    if cfg.get('Model', None) is not None:\n        module = build_module(cfg)\n        config.print_config(cfg)\n\n        engine = AutoEngine(configs=cfg, module=module, mode=\"export\")\n\n        if cfg.Engine.save_load.ckpt_dir is not None:\n            engine.load()\n\n        engine.export()\n    else:\n        engine = AutoEngine(configs=cfg, mode=\"export\")\n        if cfg.Engine.save_load.ckpt_dir is None:\n            raise ValueError(\"invalid ckpt_dir.\")\n\n        engine.export_from_prog()\n"
  },
  {
    "path": "tools/eval.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\n\nfrom ppfleetx.utils import config\nfrom ppfleetx.data import build_dataloader\nfrom ppfleetx.models import build_module\nfrom ppfleetx.core import EagerEngine\nfrom ppfleetx.distributed.apis import env\n\nif __name__ == \"__main__\":\n    args = config.parse_args()\n    cfg = config.get_config(args.config, overrides=args.override, show=False)\n\n    if dist.get_world_size() > 1:\n        env.init_dist_env(cfg)\n\n    env.set_seed(cfg.Global.seed)\n\n    module = build_module(cfg)\n    config.print_config(cfg)\n\n    engine = EagerEngine(configs=cfg, module=module, mode='eval')\n\n    valid_data_loader = build_dataloader(cfg.Data, \"Eval\")\n\n    if cfg.Engine.save_load.ckpt_dir is not None:\n        engine.load()\n\n    engine.evaluate(\n        valid_data_loader=valid_data_loader, epoch=cfg.Engine.num_train_epochs)\n"
  },
  {
    "path": "tools/export.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\n\nfrom ppfleetx.utils import config\nfrom ppfleetx.models import build_module\nfrom ppfleetx.core import EagerEngine\nfrom ppfleetx.distributed.apis import env\n\nif __name__ == \"__main__\":\n    args = config.parse_args()\n    cfg = config.get_config(args.config, overrides=args.override, show=False)\n\n    if dist.get_world_size() > 1:\n        env.init_dist_env(cfg)\n\n    env.set_seed(cfg.Global.seed)\n\n    module = build_module(cfg)\n    config.print_config(cfg)\n\n    engine = EagerEngine(configs=cfg, module=module, mode='export')\n\n    if cfg.Engine.save_load.ckpt_dir is not None:\n        engine.load()\n\n    engine.export()\n"
  },
  {
    "path": "tools/inference.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n#     http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\n\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\n\nfrom ppfleetx.utils import config\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.data import build_dataloader\nfrom ppfleetx.models import build_module\nfrom ppfleetx.core import EagerEngine\nfrom ppfleetx.distributed.apis import env\n\n# init_logger()\n\nif __name__ == \"__main__\":\n    args = config.parse_args()\n    cfg = config.get_config(args.config, overrides=args.override, show=False)\n\n    if dist.get_world_size() > 1:\n        env.init_dist_env(cfg)\n\n    env.set_seed(cfg.Global.seed)\n\n    module = build_module(cfg)\n    config.print_config(cfg)\n\n    engine = EagerEngine(configs=cfg, module=module, mode='inference')\n\n    test_data_loader = build_dataloader(cfg.Data, \"Test\")\n    for iter_id, data in enumerate(test_data_loader()):\n        outs = engine.inference(data)\n\n        if iter_id >= cfg.Engine.test_iters:\n            break\n\n    logger.info(\"The inference process is complete.\")\n    del test_data_loader\n"
  },
  {
    "path": "tools/train.py",
    "content": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport copy\n\nimport paddle\nfrom paddle.distributed import fleet\nimport paddle.distributed as dist\n\n__dir__ = os.path.dirname(os.path.abspath(__file__))\nsys.path.append(os.path.abspath(os.path.join(__dir__, '../')))\n\nfrom ppfleetx.utils import config\nfrom ppfleetx.utils.log import logger\nfrom ppfleetx.data import build_dataloader\nfrom ppfleetx.models import build_module\nfrom ppfleetx.core import EagerEngine\nfrom ppfleetx.distributed.apis import env\n\n\ndef set_default_flags(flags):\n    for flag_name, flag_value in flags.items():\n        if os.getenv(flag_name) is None:\n            paddle.set_flags({flag_name: flag_value})\n\n\nif __name__ == \"__main__\":\n    args = config.parse_args()\n    cfg = config.get_config(args.config, overrides=args.override, show=False)\n\n    paddle.set_device(cfg[\"Global\"][\"device\"])\n    if dist.get_world_size() > 1:\n        env.init_dist_env(cfg)\n\n    env.set_seed(cfg.Global.seed)\n\n    module = build_module(cfg)\n    config.print_config(cfg)\n\n    train_data_loader = build_dataloader(cfg.Data, \"Train\")\n    eval_data_loader = build_dataloader(cfg.Data, \"Eval\")\n\n    cfg.Optimizer.lr.update({\n        'epochs': 
cfg.Engine.num_train_epochs,\n        'step_each_epoch': len(train_data_loader),\n        'total_steps': cfg.Engine.max_steps,\n    })\n\n    engine = EagerEngine(configs=cfg, module=module)\n\n    if cfg.Engine.save_load.ckpt_dir is not None:\n        engine.load()\n\n    engine.fit(train_data_loader=train_data_loader,\n               valid_data_loader=eval_data_loader,\n               epoch=cfg.Engine.num_train_epochs)\n"
  }
]