Repository: zai-org/CogVLM
Branch: main
Commit: f7283b2c8d26
Files: 50
Total size: 370.8 KB

Directory structure:
gitextract_7gke4omx/

├── .deepspeed_env
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.yaml
│   │   └── feature-request.yaml
│   └── PULL_REQUEST_TEMPLATE/
│       └── pr_template.md
├── .gitignore
├── LICENSE
├── MODEL_LICENSE
├── README.md
├── README_zh.md
├── assets/
│   └── WECHAT.md
├── basic_demo/
│   ├── cli_demo_hf.py
│   ├── cli_demo_sat.py
│   └── web_demo.py
├── composite_demo/
│   ├── client.py
│   ├── conversation.py
│   ├── demo_agent_cogagent.py
│   ├── demo_chat_cogagent.py
│   ├── demo_chat_cogvlm.py
│   ├── main.py
│   └── utils.py
├── dataset.md
├── dataset_zh.md
├── finetune_demo/
│   ├── evaluate_cogagent.sh
│   ├── evaluate_cogagent_demo.py
│   ├── evaluate_cogvlm.sh
│   ├── evaluate_cogvlm_demo.py
│   ├── finetune_cogagent_demo.py
│   ├── finetune_cogagent_lora.sh
│   ├── finetune_cogvlm_demo.py
│   ├── finetune_cogvlm_lora.sh
│   └── test_config_bf16.json
├── openai_demo/
│   ├── openai_api.py
│   └── openai_api_request.py
├── requirements.txt
└── utils/
    ├── __init__.py
    ├── merge_model.py
    ├── models/
    │   ├── __init__.py
    │   ├── cogagent_model.py
    │   ├── cogvlm_model.py
    │   ├── eva_clip_L_hf.py
    │   ├── eva_clip_model.py
    │   └── mixin.py
    ├── split_dataset.py
    └── utils/
        ├── __init__.py
        ├── chat.py
        ├── dataset.py
        ├── grounding_parser.py
        ├── language.py
        ├── template.py
        └── vision.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .deepspeed_env
================================================
SAT_HOME=~/.sat_models
LOCAL_WORLD_SIZE=8

================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.yaml
================================================
# GitHub issue form used for bug reports in the CogVLM repository.
# Labels and descriptions are bilingual (English / Chinese).
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve CogVLM / 提交一个 Bug 问题报告来帮助我们改进 CogVLM
body:
  # Required: the reporter's runtime environment.
  - type: textarea
    id: system-info
    attributes:
      label: System Info / 系统信息
      description: Your operating environment / 您的运行环境信息
      placeholder: Includes Cuda version, Transformers version, Python version, operating system, hardware information (if you suspect a hardware problem)... / 包括Cuda版本，Transformers版本，Python版本，操作系统，硬件信息(如果您怀疑是硬件方面的问题)...
    validations:
      required: true

  # Maintainers to @-mention; this field has no `validations` block, so it is
  # not marked as required.
  # NOTE(review): the English text asks for "fewer than 3 people" while the
  # Chinese text says "no more than 1 person" - confirm the intended limit.
  - type: textarea
    id: who-can-help
    attributes:
      label: Who can help? / 谁可以帮助到您？
      description: |
        Your issue will be replied to more quickly if you can figure out the right person to tag with @
        All issues are read by one of the maintainers, so if you don't know who to tag, just leave this blank and our maintainer will ping the right person.

        Please tag fewer than 3 people.

        如果您能找到合适的标签 @，您的问题会更快得到回复。
        所有问题都会由我们的维护者阅读，如果您不知道该标记谁，只需留空，我们的维护人员会找到合适的开发组成员来解决问题。

        标记的人数应该不超过 1 个人。

        Related demo leader / 相关demo负责人 :
        - finetune_demo: @1049451037
        - composite_demo: @zR
        - openai_demo: @zR


        If it's not a bug in these three subsections, you may not specify the helper. Our maintainer will find the right person in the development group to solve the problem.

        如果不是这三个子版块的bug，您可以不指明帮助者，我们的维护人员会找到合适的开发组成员来解决问题。

      placeholder: "@Username ..."

  # Whether the problem occurs with the official scripts or with modified ones.
  - type: checkboxes
    id: information-scripts-examples
    attributes:
      label: Information / 问题信息
      description: 'The problem arises when using: / 问题出现在'
      options:
        - label: "The official example scripts / 官方的示例脚本"
        - label: "My own modified scripts / 我自己修改的脚本和任务"

  # Required: steps or code that reproduce the problem.
  - type: textarea
    id: reproduction
    validations:
      required: true
    attributes:
      label: Reproduction / 复现过程
      description: |
        Please provide a code example that reproduces the problem you encountered, preferably with a minimal reproduction unit.
        If you have code snippets, error messages, stack traces, please provide them here as well.
        Please format your code correctly using code tags. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
        Do not use screenshots, as they are difficult to read and (more importantly) do not allow others to copy and paste your code.

        请提供能重现您遇到的问题的代码示例,最好是最小复现单元。
        如果您有代码片段、错误信息、堆栈跟踪，也请在此提供。
        请使用代码标签正确格式化您的代码。请参见 https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
        请勿使用截图，因为截图难以阅读，而且（更重要的是）不允许他人复制粘贴您的代码。
      placeholder: |
        Steps to reproduce the behavior/复现Bug的步骤:

          1.
          2.
          3.

  # Required: what the reporter expected to happen instead.
  - type: textarea
    id: expected-behavior
    validations:
      required: true
    attributes:
      label: Expected behavior / 期待表现
      description: "A clear and concise description of what you would expect to happen. /简单描述您期望发生的事情。"

================================================
FILE: .github/ISSUE_TEMPLATE/feature-request.yaml
================================================
# GitHub issue form used for feature requests in the CogVLM repository.
# Labels and descriptions are bilingual (English / Chinese).
name: "\U0001F680 Feature request"
description: Submit a request for a new CogVLM feature / 提交一个新的 CogVLM 的功能建议
labels: ["feature"]
body:
  # Required: short summary of the proposed feature.
  - type: textarea
    id: feature-request
    validations:
      required: true
    attributes:
      label: Feature request / 功能建议
      description: |
        A brief description of the functional proposal. Links to corresponding papers and code are desirable.
        对功能建议的简述。最好提供对应的论文和代码链接。

  # Required: why the feature is wanted.
  - type: textarea
    id: motivation
    validations:
      required: true
    attributes:
      label: Motivation / 动机
      description: |
        Your motivation for making the suggestion. If that motivation is related to another GitHub issue, link to it here.
        您提出建议的动机。如果该动机与另一个 GitHub 问题有关，请在此处提供对应的链接。

  # Required: links to a PR or other material the requester can contribute.
  - type: textarea
    id: contribution
    validations:
      required: true
    attributes:
      label: Your contribution / 您的贡献
      description: |
        Your PR link or any other link you can help with.
        您的PR链接或者其他您能提供帮助的链接。

================================================
FILE: .github/PULL_REQUEST_TEMPLATE/pr_template.md
================================================
#  Raise valuable PR / 提出有价值的PR

## Caution/ 注意事项:
Users should keep the following points in mind when submitting PRs:

1. The proposed PR should be about this project. 
2. The proposed PR should be focused: if you have multiple ideas or optimizations, submit them as separate PRs.

用户在提交PR时候应该注意以下几点:

1. 提出的PR应该是关于本项目的。
2. 提出的PR应该具有针对性，如果具有多个不同的想法和优化方案，应该分配到不同的PR中。

## 不应该提出的PR / PRs that should not be proposed

If a developer proposes a PR about any of the following, it may be closed or rejected.

1. PRs that do not describe the proposed improvement.
2. Multiple issues of different types combined in one PR.
3. The proposed PR is highly duplicative of already existing PRs.

如果开发者提出关于以下方面的PR，则可能会被直接关闭或拒绝通过。

1. 没有说明改进方案的。
2. 多个不同类型的问题合并在一个PR中的。
3. 提出的PR与已经存在的PR高度重复的。


# 检查您的PR / Check your PR
- [ ] Have you read the Contributor Guidelines, Pull Request section? / 您是否阅读了贡献者指南、Pull Request 部分？
- [ ] Has this been discussed/approved via a Github issue or forum? If so, add a link. / 是否通过 Github 问题或论坛讨论/批准过？如果是，请添加链接。
- [ ] Did you make sure you updated the documentation with your changes? Here are the Documentation Guidelines, and here are the Documentation Formatting Tips. /您是否确保根据您的更改更新了文档？这里是文档指南，这里是文档格式化技巧。
- [ ] Did you write new required tests? / 您是否编写了新的必要测试？
- [ ] Does your PR address only one issue? / 您的PR是否仅针对一个问题？

================================================
FILE: .gitignore
================================================
.hypothesis/
__pycache__
output.png
fewshot-data/
checkpoints/
records.db
server.py
examples/*grounding.png
archive*
hostfile
runs/
*.idea/
.DS_Store

================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2024 CogVLM team @ Zhipu AI

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

================================================
FILE: MODEL_LICENSE
================================================
The CogVLM License

1. Definitions

“Licensor” means the CogVLM Model Team that distributes its Software.

“Software” means the CogVLM model parameters made available under this license.

2. License Grant

Under the terms and conditions of this license, the Licensor hereby grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license.
This license permits you to use all open-source models in this repository for academic research free. Users who wish to use the models for commercial purposes must register [here](https://open.bigmodel.cn/mla/form).
Registered users may use the models for commercial activities free of charge, but must comply with all terms and conditions of this license.
The license notice shall be included in all copies or substantial portions of the Software.

3. Restriction

You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any military, or illegal purposes.

You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.

4. Disclaimer

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

5. Limitation of Liability

EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

6. Dispute Resolution

This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.

Note that the license is subject to update to a more comprehensive version.  For any questions related to the license and copyright, please contact us at license@zhipuai.cn.

7. Llama2 and EVA-CLIP2 License

For CogVLM-17B version, Llama2 license conditions (https://ai.meta.com/llama/license/) and EVA license conditions (MIT, https://github.com/baaivision/EVA/blob/master/LICENSE) Also applies to model weights.


1. 定义

“许可方”是指分发其软件的 CogVLM 模型团队。

“软件”是指根据本许可提供的 CogVLM 模型参数。

2. 许可授予

根据本许可的条款和条件，许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可。
本许可允许您免费使用本仓库中的所有开源模型进行学术研究，对于希望将模型用于商业目的的用户，需在[这里](https://open.bigmodel.cn/mla/form)完成登记。
经过登记的用户可以免费使用本模型进行商业活动，但必须遵守本许可的所有条款和条件。
上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。

3.限制

您不得出于任何军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。

您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。

4.免责声明

本软件“按原样”提供，不提供任何明示或暗示的保证，包括但不限于对适销性、特定用途的适用性和非侵权性的保证。 在任何情况下，作者或版权持有人均不对任何索赔、损害或其他责任负责，无论是在合同诉讼、侵权行为还是其他方面，由软件或软件的使用或其他交易引起、由软件引起或与之相关 软件。

5. 责任限制

除适用法律禁止的范围外，在任何情况下且根据任何法律理论，无论是基于侵权行为、疏忽、合同、责任或其他原因，任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、 或间接损害，或任何其他商业损失，即使许可人已被告知此类损害的可能性。

6.争议解决

本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。

请注意，许可证可能会更新到更全面的版本。 有关许可和版权的任何问题，请通过 license@zhipuai.cn 与我们联系。

7. Llama2 和 EVA-CLIP2 许可

针对 CogVLM-17B 版本， Llama2 许可条件 (https://ai.meta.com/llama/license/) 和 EVA 许可条件 (MIT, https://github.com/baaivision/EVA/blob/master/LICENSE) 同时适用于模型权重。

================================================
FILE: README.md
================================================
# CogVLM & CogAgent

📗 [中文版README](./README_zh.md)

🌟 **Jump to detailed introduction: [Introduction to CogVLM](#introduction-to-cogvlm),
🆕 [Introduction to CogAgent](#introduction-to-cogagent)**

📔 For more detailed usage information, please refer to: [CogVLM & CogAgent's technical documentation (in Chinese)](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g) 

<table>
  <tr>
    <td>
      <h2> CogVLM </h2>
      <p> 📖  Paper: <a href="https://arxiv.org/abs/2311.03079">CogVLM: Visual Expert for Pretrained Language Models</a></p>
      <p><b>CogVLM</b> is a powerful open-source visual language model (VLM). CogVLM-17B has 10 billion visual parameters and 7 billion language parameters, <b>supporting image understanding and multi-turn dialogue with a resolution of 490*490</b>.</p>
      <p><b>CogVLM-17B achieves state-of-the-art performance on 10 classic cross-modal benchmarks</b>, including NoCaps, Flickr30k captioning, RefCOCO, RefCOCO+, RefCOCOg, Visual7W, GQA, ScienceQA, VizWiz VQA and TDIUC.</p>
    </td>
    <td>
      <h2> CogAgent </h2>
      <p> 📖  Paper: <a href="https://arxiv.org/abs/2312.08914">CogAgent: A Visual Language Model for GUI Agents </a></p>
      <p><b>CogAgent</b> is an open-source visual language model improved based on CogVLM. CogAgent-18B has 11 billion visual parameters and 7 billion language parameters, <b>supporting image understanding at a resolution of 1120*1120</b>. <b>On top of the capabilities of CogVLM, it further possesses GUI image Agent capabilities</b>.</p>
      <p> <b>CogAgent-18B achieves state-of-the-art generalist performance on 9 classic cross-modal benchmarks</b>, including VQAv2, OK-VQA, TextVQA, ST-VQA, ChartQA, InfoVQA, DocVQA, MM-Vet, and POPE. <b>It significantly surpasses existing models on GUI operation datasets</b> including AITW and Mind2Web.</p>
    </td>
  </tr>
  <tr>
    <td colspan="2" align="center">
      <p>🌐 Web Demo for CogVLM2: <a href="http://36.103.203.44:7861">this link</a></p>
    </td>
  </tr>
</table>


**Table of Contents**

- [CogVLM \& CogAgent](#cogvlm--cogagent)
    - [Release](#release)
    - [Get Started](#get-started)
        - [Option 1: Inference Using Web Demo.](#option-1-inference-using-web-demo)
        - [Option 2：Deploy CogVLM / CogAgent by yourself](#option-2deploy-cogvlm--cogagent-by-yourself)
            - [Situation 2.1 CLI (SAT version)](#situation-21-cli-sat-version)
            - [Situation 2.2 CLI (Huggingface version)](#situation-22-cli-huggingface-version)
            - [Situation 2.3 Web Demo](#situation-23-web-demo)
        - [Option 3：Finetuning CogAgent / CogVLM](#option-3finetuning-cogagent--cogvlm)
        - [Option 4: OpenAI Vision format](#option-4-openai-vision-format)
        - [Hardware requirement](#hardware-requirement)
        - [Model checkpoints](#model-checkpoints)
    - [Introduction to CogVLM](#introduction-to-cogvlm)
        - [Examples](#examples)
    - [Introduction to CogAgent](#introduction-to-cogagent)
        - [GUI Agent Examples](#gui-agent-examples)
    - [Cookbook](#cookbook)
        - [Task Prompts](#task-prompts)
        - [Which --version to use](#which---version-to-use)
        - [FAQ](#faq)
    - [License](#license)
    - [Citation \& Acknowledgements](#citation--acknowledgements)

## Release
- 🔥🔥🔥  **News**: ```2024/5/20```: We released the **next-generation model, [CogVLM2](https://github.com/THUDM/CogVLM2)**, which is based on Llama3-8B and is on par with (or better than) GPT-4V in most cases! Download it and give it a try!
- 🔥🔥  **News**: ```2024/4/5```: [CogAgent](https://arxiv.org/abs/2312.08914) was selected as a CVPR 2024 Highlight!
- 🔥  **News**: ```2023/12/26```: We have released the [CogVLM-SFT-311K](dataset.md) dataset,
  which contains over 150,000 samples that we used to train **CogVLM v1.0 only**. You are welcome to follow and use it.
- **News**: ```2023/12/18```: **New Web UI Launched!** We have launched a new web UI based on Streamlit.
  Users can now chat with CogVLM and CogAgent directly in the UI for a better user experience.
- **News**: ```2023/12/15```: **CogAgent Officially Launched!** CogAgent is an image understanding model developed
  based on CogVLM. It features **visual-based GUI Agent capabilities** and has further enhancements in image
  understanding. It supports image input with a resolution of 1120*1120, and possesses multiple abilities including
  multi-turn dialogue with images, GUI Agent, Grounding, and more.

- **News**: ```2023/12/8``` We have updated the checkpoint of cogvlm-grounding-generalist to
  cogvlm-grounding-generalist-v1.1, with image augmentation during training, therefore more robust.
  See [details](#introduction-to-cogvlm).

- **News**: ```2023/12/7``` CogVLM supports **4-bit quantization** now! You can run inference with just **11GB** of GPU memory!

- **News**: ```2023/11/20``` We have updated the checkpoint of cogvlm-chat to cogvlm-chat-v1.1, unified the versions of
  chat and VQA, and refreshed the SOTA on various datasets. See [details](#introduction-to-cogvlm)

- **News**: ```2023/11/20``` We released **[cogvlm-chat](https://huggingface.co/THUDM/cogvlm-chat-hf)**, **[cogvlm-grounding-generalist](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)/[base](https://huggingface.co/THUDM/cogvlm-grounding-base-hf)**, **[cogvlm-base-490](https://huggingface.co/THUDM/cogvlm-base-490-hf)/[224](https://huggingface.co/THUDM/cogvlm-base-224-hf)** on 🤗Huggingface. You can now run inference with transformers in [a few lines of code](#situation-22-cli-huggingface-version)!

- ```2023/10/27``` CogVLM bilingual version is available [online](https://chatglm.cn/)! Welcome to try it out!

- ```2023/10/5``` CogVLM-17B released.

## Get Started

### Option 1: Inference Using Web Demo.

* Click here to open the [CogVLM2 Demo](http://36.103.203.44:7861/).

If you need to use Agent and Grounding functions, please refer to [Cookbook - Task Prompts](#task-prompts)

### Option 2：Deploy CogVLM / CogAgent by yourself

We support two interfaces for model inference: **CLI** and **web demo**. If you want to use it in your Python code, it is
easy to modify the CLI scripts for your case.

First, we need to install the dependencies.

```bash
# CUDA >= 11.8
pip install -r requirements.txt
python -m spacy download en_core_web_sm
```

**All code for inference is located under the ``basic_demo/`` directory. Please switch to this directory first before
proceeding with further operations.**

#### Situation 2.1 CLI (SAT version)

Run CLI demo via:

```bash
# CogAgent
python cli_demo_sat.py --from_pretrained cogagent-chat --version chat --bf16  --stream_chat
python cli_demo_sat.py --from_pretrained cogagent-vqa --version chat_old --bf16  --stream_chat

# CogVLM
python cli_demo_sat.py --from_pretrained cogvlm-chat --version chat_old --bf16  --stream_chat
python cli_demo_sat.py --from_pretrained cogvlm-grounding-generalist --version base --bf16  --stream_chat
```

The program will automatically download the sat model and interact in the command line. You can generate replies by
entering instructions and pressing enter.
Enter `clear` to clear the conversation history and `stop` to stop the program.

We also support model-parallel inference, which splits the model across multiple (2/4/8) GPUs. `--nproc-per-node=[n]` in the
following command controls the number of GPUs used.

```
torchrun --standalone --nnodes=1 --nproc-per-node=2 cli_demo_sat.py --from_pretrained cogagent-chat --version chat --bf16
```

- If you want to manually download the weights, you can replace the path after ``--from_pretrained`` with the model
  path.

- Our model supports SAT's **4-bit quantization** and **8-bit quantization**.
  You can change ``--bf16`` to ``--fp16``, or ``--fp16 --quant 4``, or ``--fp16 --quant 8``.

  For example

    ```bash
    python cli_demo_sat.py --from_pretrained cogagent-chat --fp16 --quant 8 --stream_chat
    python cli_demo_sat.py --from_pretrained cogvlm-chat-v1.1 --fp16 --quant 4 --stream_chat
    # In the SAT version, --quant should be used together with --fp16
    ```

- The program provides the following hyperparameters to control the generation process:
    ```
    usage: cli_demo_sat.py [-h] [--max_length MAX_LENGTH] [--top_p TOP_P] [--top_k TOP_K] [--temperature TEMPERATURE]

    optional arguments:
    -h, --help            show this help message and exit
    --max_length MAX_LENGTH
                            max length of the total sequence
    --top_p TOP_P         top p for nucleus sampling
    --top_k TOP_K         top k for top k sampling
    --temperature TEMPERATURE
                            temperature for sampling
    ```

- Click [here](#which---version-to-use) to view the correspondence between different models and the ``--version``
  parameter.

#### Situation 2.2 CLI (Huggingface version)

Run CLI demo via:

```bash
# CogAgent
python cli_demo_hf.py --from_pretrained THUDM/cogagent-chat-hf --bf16
python cli_demo_hf.py --from_pretrained THUDM/cogagent-vqa-hf --bf16

# CogVLM
python cli_demo_hf.py --from_pretrained THUDM/cogvlm-chat-hf --bf16
python cli_demo_hf.py --from_pretrained THUDM/cogvlm-grounding-generalist-hf --bf16
```

- If you want to manually download the weights, you can replace the path after ``--from_pretrained`` with the model
  path.

- You can change ``--bf16`` to ``--fp16``, or ``--quant 4``. For example, our model supports Huggingface's **4-bit
  quantization**:

    ```bash
    python cli_demo_hf.py --from_pretrained THUDM/cogvlm-chat-hf --quant 4
    ```

#### Situation 2.3 Web Demo

We also offer a local web demo based on Gradio. First, install Gradio by running: `pip install gradio`. Then download
and enter this repository and run `web_demo.py`. See the next section for detailed usage:

```bash
python web_demo.py --from_pretrained cogagent-chat --version chat --bf16
python web_demo.py --from_pretrained cogagent-vqa --version chat_old --bf16
python web_demo.py --from_pretrained cogvlm-chat-v1.1 --version chat_old --bf16
python web_demo.py --from_pretrained cogvlm-grounding-generalist --version base --bf16
```

The GUI of the web demo looks like:

<div align="center">
    <img src=assets/web_demo-min.png width=70% />
</div>

### Option 3：Finetuning CogAgent / CogVLM

You may want to use CogVLM in your own task, which needs a **different output style or domain knowledge**. **All code
for finetuning is located under the ``finetune_demo/`` directory.**

We here provide a finetuning example for **Captcha Recognition** using lora.

1. Start by downloading the [Captcha Images dataset](https://www.kaggle.com/datasets/aadhavvignesh/captcha-images). Once
   downloaded, extract the contents of the ZIP file.

2. To create a train/validation/test split in the ratio of 80/5/15, execute the following:
    ```bash
    python utils/split_dataset.py
    ```

3. Start the fine-tuning process with this command:

    ```bash
    bash finetune_demo/finetune_(cogagent/cogvlm)_lora.sh
    ```

4. Merge the model to `model_parallel_size=1`: (replace the 4 below with your training `MP_SIZE`)

    ```bash
    torchrun --standalone --nnodes=1 --nproc-per-node=4 utils/merge_model.py --version base --bf16 --from_pretrained ./checkpoints/merged_lora_(cogagent/cogvlm490/cogvlm224)
    ```

5. Evaluate the performance of your model.
    ```bash
    bash finetune_demo/evaluate_(cogagent/cogvlm).sh
    ```

### Option 4: OpenAI Vision format

We provide the same API examples as `GPT-4V`, which you can view in `openai_demo`.

1. First, start the node

```
python openai_demo/openai_api.py
```

2. Next, run the request example node, which is an example of a continuous dialogue

```
python openai_demo/openai_api_request.py
```

3. You will get output similar to the following

```
This image showcases a tranquil natural scene with a wooden pathway leading through a field of lush green grass. In the distance, there are trees and some scattered structures, possibly houses or small buildings. The sky is clear with a few scattered clouds, suggesting a bright and sunny day.
```

### Hardware requirement

* Model Inference:

  For INT4 quantization: 1 * RTX 3090(24G)   (CogAgent takes ~ 12.6GB, CogVLM takes ~ 11GB)

  For FP16: 1 * A100(80G) or 2 * RTX 3090(24G)

* Finetuning:

  For FP16: 4 * A100(80G) *[Recommended]* or 8 * RTX 3090(24G).

### Model checkpoints

If you run the `basic_demo/cli_demo*.py` from the code repository, it will automatically download SAT or Hugging Face
weights. Alternatively, you can choose to manually download the necessary weights.

- CogAgent

  |   Model name    | Input resolution |                             Introduction                             | Huggingface model | SAT model |
  | :-----------: | :----: | :----------------------------------------------------------: | :------: | :-------: |
  | cogagent-chat |  1120  | Chat version of CogAgent. Supports GUI Agent, multiple-round  chat and visual grounding. |  [HF link](https://huggingface.co/THUDM/cogagent-chat-hf) <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/cogagent-chat-hf)    |   [HF link](https://huggingface.co/THUDM/CogAgent/tree/main)<br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/CogAgent)           |
  | cogagent-vqa |  1120  | VQA version of CogAgent. Has stronger capabilities in single-turn visual dialogue. Recommended for VQA benchmarks. |  [HF link](https://huggingface.co/THUDM/cogagent-vqa-hf)<br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/cogagent-vqa-hf)        |    [HF link](https://huggingface.co/THUDM/CogAgent/tree/main) <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/CogAgent)      |

- CogVLM

  |          Model name           | Input resolution |                           Introduction                            | Huggingface model | SAT model |
  | :-------------------------: | :----: | :-------------------------------------------------------: | :------: | :-------: |
  |         cogvlm-chat-v1.1         |  490   |  Supports multiple rounds of chat and vqa simultaneously, with different prompts.   |  [HF link](https://huggingface.co/THUDM/cogvlm-chat-hf) <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/cogvlm-chat-hf)        |    [HF link](https://huggingface.co/THUDM/CogVLM/tree/main)  <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/CogVLM)       |
  |       cogvlm-base-224       |  224   |               The original checkpoint after text-image pretraining.               |   [HF link](https://huggingface.co/THUDM/cogvlm-base-224-hf) <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/cogvlm-base-224-hf)       |     [HF link](https://huggingface.co/THUDM/CogVLM/tree/main) <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/CogVLM)       |
  |       cogvlm-base-490       |  490   |      Amplify the resolution to 490 through position encoding interpolation from `cogvlm-base-224`.      |   [HF link](https://huggingface.co/THUDM/cogvlm-base-490-hf) <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/cogvlm-base-490-hf)       |     [HF link](https://huggingface.co/THUDM/CogVLM/tree/main) <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/CogVLM)       |
  | cogvlm-grounding-generalist |  490   | This checkpoint supports different visual grounding tasks, e.g. REC, Grounding Captioning, etc.  |    [HF link](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)  <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/cogvlm-grounding-generalist-hf)       |     [HF link](https://huggingface.co/THUDM/CogVLM/tree/main)   <br> [OpenXLab link](https://openxlab.org.cn/models/detail/THUDM/CogVLM)     |

## Introduction to CogVLM

- CogVLM is a powerful **open-source visual language model** (**VLM**). CogVLM-17B has 10 billion vision parameters and
  7 billion language parameters.

- CogVLM-17B achieves state-of-the-art performance on 10 classic cross-modal benchmarks, including NoCaps, Flickr30k
  captioning, RefCOCO, RefCOCO+, RefCOCOg, Visual7W, GQA, ScienceQA, VizWiz VQA and TDIUC, and ranks 2nd on VQAv2,
  OKVQA, TextVQA, COCO captioning, etc., **surpassing or matching PaLI-X 55B**. CogVLM can
  also [chat with you](http://36.103.203.44:7861) about images.

<div align="center">
    <img src=assets/metrics-min.png width=50% />
</div>

<details>
<summary>Click to view results on MM-VET, POPE, TouchStone. </summary>

<table>
    <tr>
        <td>Method</td>
        <td>LLM</td>
        <td>MM-VET</td>
        <td>POPE(adversarial)</td>
        <td>TouchStone</td>
    </tr>
    <tr>
        <td>BLIP-2</td>
        <td>Vicuna-13B</td>
        <td>22.4</td>
        <td>-</td>
        <td>-</td>
    </tr>
    <tr>
        <td>Otter</td>
        <td>MPT-7B</td>
        <td>24.7</td>
        <td>-</td>
        <td>-</td>
    </tr>
    <tr>
        <td>MiniGPT4</td>
        <td>Vicuna-13B</td>
        <td>24.4</td>
        <td>70.4</td>
        <td>531.7</td>
    </tr>
    <tr>
        <td>InstructBLIP</td>
        <td>Vicuna-13B</td>
        <td>25.6</td>
        <td>77.3</td>
        <td>552.4</td>
    </tr>
    <tr>
        <td>LLaMA-Adapter v2</td>
        <td>LLaMA-7B</td>
        <td>31.4</td>
        <td>-</td>
        <td>590.1</td>
    </tr>
    <tr>
        <td>LLaVA</td>
        <td>LLaMA2-7B</td>
        <td>28.1</td>
        <td>66.3</td>
        <td>602.7</td>
    </tr>
    <tr>
        <td>mPLUG-Owl</td>
        <td>LLaMA-7B</td>
        <td>-</td>
        <td>66.8</td>
        <td>605.4</td>
    </tr>
    <tr>
        <td>LLaVA-1.5</td>
        <td>Vicuna-13B</td>
        <td>36.3</td>
        <td>84.5</td>
        <td>-</td>
    </tr>
    <tr>
        <td>Emu</td>
        <td>LLaMA-13B</td>
        <td>36.3</td>
        <td>-</td>
        <td>-</td>
    </tr>
    <tr>
        <td>Qwen-VL-Chat</td>
        <td>-</td>
        <td>-</td>
        <td>-</td>
        <td>645.2</td>
    </tr>
    <tr>
        <td>DreamLLM</td>
        <td>Vicuna-7B</td>
        <td>35.9</td>
        <td>76.5</td>
        <td>-</td>
    </tr>
    <tr>
        <td>CogVLM</td>
        <td>Vicuna-7B</td>
        <td> <b>52.8</b> </td>
        <td><b>87.6</b></td>
        <td><b>742.0</b></td>
    </tr>
</table>

</details>

<details>
<summary>Click to view results of cogvlm-grounding-generalist-v1.1. </summary>

<table>
    <tr>
        <td></td>
        <td>RefCOCO</td>
        <td></td>
        <td></td>
        <td>RefCOCO+</td>
        <td></td>
        <td></td>
        <td>RefCOCOg</td>
        <td></td>
        <td>Visual7W</td>
    </tr>
    <tr>
        <td></td>
        <td>val</td>
        <td>testA</td>
        <td>testB</td>
        <td>val</td>
        <td>testA</td>
        <td>testB</td>
        <td>val</td>
        <td>test</td>
        <td>test</td>
    </tr>
    <tr>
        <td>cogvlm-grounding-generalist</td>
        <td>92.51</td>
        <td>93.95</td>
        <td>88.73</td>
        <td>87.52</td>
        <td>91.81</td>
        <td>81.43</td>
        <td>89.46</td>
        <td>90.09</td>
        <td>90.96</td>
    </tr>
    <tr>
        <td>cogvlm-grounding-generalist-v1.1</td>
        <td><b>92.76</b></td>
        <td><b>94.75</b></td>
        <td><b>88.99</b></td>
        <td><b>88.68</b></td>
        <td><b>92.91</b></td>
        <td><b>83.39</b></td>
        <td><b>89.75</b></td>
        <td><b>90.79</b></td>
        <td><b>91.05</b></td>
    </tr>
</table>
</details>

### Examples

<!-- CogVLM is powerful for answering various types of visual questions, including **Detailed Description & Visual Question Answering**,  **Complex Counting**, **Visual Math Problem Solving**, **OCR-Free Reasonging**, **OCR-Free Visual Question Answering**, **World Knowledge**, **Referring Expression Comprehension**, **Programming with Visual Input**, **Grounding with Caption**, **Grounding Visual Question Answering**, etc. -->

* CogVLM can accurately describe images in detail with **very few hallucinations**.
    <details>
    <summary>Click for comparison with LLAVA-1.5 and MiniGPT-4.</summary>

    <img src=assets/llava-comparison-min.png width=50% />

    </details>
    <br>

* CogVLM can understand and answer various types of questions, and has a **visual grounding** version.

<div align="center">
    <img src=assets/pear_grounding.png width=50% />
</div>

<br>

* CogVLM sometimes captures more detailed content than GPT-4V(ision).

<div align="center">
    <img src=assets/compare-min.png width=50% />
</div>

<!-- ![compare](assets/compare.png) -->
<br> 

<details>
<summary>Click to expand more examples.</summary>

![Chat Examples](assets/chat.png)

</details>

## Introduction to CogAgent

CogAgent is an open-source visual language model improved based on CogVLM. CogAgent-18B has 11 billion visual parameters
and 7 billion language parameters.

CogAgent-18B achieves state-of-the-art generalist performance on 9 classic cross-modal benchmarks, including VQAv2,
OK-VQA, TextVQA, ST-VQA, ChartQA, infoVQA, DocVQA, MM-Vet, and POPE. It significantly surpasses existing models on GUI
operation datasets such as AITW and Mind2Web.

In addition to all the features already present in CogVLM (visual multi-round dialogue, visual grounding), CogAgent:

1. Supports higher resolution visual input and dialogue question-answering. **It supports ultra-high-resolution image
   inputs of 1120x1120.**

2. **Possesses the capabilities of a visual Agent**, being able to return a plan, next action, and specific operations
   with coordinates for any given task on any GUI screenshot.

3. **Enhanced GUI-related question-answering capabilities**, allowing it to handle questions about any GUI screenshot,
   such as web pages, PC apps, mobile applications, etc.

4. Enhanced capabilities in OCR-related tasks through improved pre-training and fine-tuning.

<div align="center">
    <img src=assets/cogagent_function.jpg width=60% />
</div>

### GUI Agent Examples

<div align="center">
    <img src=assets/cogagent_main_demo.jpg width=90% />
</div>

## Cookbook

### Task Prompts

1. **General Multi-Round Dialogue**: Say whatever you want.

2. **GUI Agent Task**: Use the [Agent template](https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L761)
   and replace \<TASK\> with the task instruction enclosed in double quotes. This query can make CogAgent infer Plan and
   Next Action. If adding ``(with grounding)`` at the end of the query, the model will return a formalized action
   representation with coordinates.

For example, to ask the model how to complete the task "Search for CogVLM" on a current GUI screenshot, follow these
steps:

1. Randomly select a template from
   the [Agent template](https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L761). Here, we
   choose ``What steps do I need to take to <TASK>?``.

2. Replace \<TASK\> with the task instruction enclosed in double quotes, for
   example, ``What steps do I need to take to "Search for CogVLM"?`` . Inputting this to the model yields:

> Plan: 1. Type 'CogVLM' into the Google search bar. 2. Review the search results that appear. 3. Click on a relevant
> result to read more about CogVLM or access further resources.
>
> Next Action: Move the cursor to the Google search bar, and type 'CogVLM' into it.

3. If adding ``(with grounding)`` at the end, i.e. changing the input
   to ``What steps do I need to take to "Search for CogVLM"?(with grounding)``, the output of CogAgent would be:

> Plan: 1. Type 'CogVLM' into the Google search bar. 2. Review the search results that appear. 3. Click on a relevant
> result to read more about CogVLM or access further resources.
>
> Next Action: Move the cursor to the Google search bar, and type 'CogVLM' into it.
> Grounded Operation:[combobox] Search -> TYPE: CogVLM at the box [[212,498,787,564]]

Tip: For GUI Agent tasks, it is recommended to conduct only single-round dialogues for each image for better results.

3. **Visual Grounding**. Three modes of grounding are supported:

    - Image description with grounding coordinates (bounding box). Use any template
      from [caption_with_box template](https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L537) as model
      input. For example:

   > Can you provide a description of the image and include the coordinates [[x0,y0,x1,y1]] for each mentioned object?

    - Returning grounding coordinates (bounding box) based on the description of objects. Use any template
      from [caption2box template](https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L345),
      replacing ``<expr>`` with the object's description. For example:

   > Can you point out *children in blue T-shirts* in the image and provide the bounding boxes of their location?

    - Providing a description based on bounding box coordinates. Use a template
      from [box2caption template](https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L400),
      replacing ``<objs>`` with the position coordinates. For example:

   > Tell me what you see within the designated area *[[086,540,400,760]]* in the picture.

**Format of coordinates:** The bounding box coordinates in the model's input and output use the
format ``[[x1, y1, x2, y2]]``, with the origin at the top left corner, the x-axis to the right, and the y-axis
downward. (x1, y1) and (x2, y2) are the top-left and bottom-right corners, respectively, with values as relative
coordinates multiplied by 1000 (prefixed with zeros to three digits).

### Which --version to use

Due to differences in model functionalities, different model versions may have distinct ``--version`` specifications for
the text processor, meaning the format of the prompts used varies.

|         model name          | --version |
|:---------------------------:|:---------:|
|        cogagent-chat        |   chat    |
|        cogagent-vqa         | chat_old  |
|         cogvlm-chat         | chat_old  |
|      cogvlm-chat-v1.1       | chat_old  |
| cogvlm-grounding-generalist |   base    |
|       cogvlm-base-224       |   base    |
|       cogvlm-base-490       |   base    |

### FAQ

* If you have trouble in accessing huggingface.co, you can add `--local_tokenizer /path/to/vicuna-7b-v1.5` to load the
  tokenizer.
* If you have trouble in automatically downloading model with 🔨[SAT](https://github.com/THUDM/SwissArmyTransformer), try
  downloading from 🤖[modelscope](https://www.modelscope.cn/models/ZhipuAI/CogVLM/summary) or
  🤗[huggingface](https://huggingface.co/THUDM/CogVLM) or 💡[wisemodel](https://www.wisemodel.cn/models/ZhipuAI/CogVLM)
  manually.
* When you download a model using 🔨[SAT](https://github.com/THUDM/SwissArmyTransformer), it is saved to the default
  location `~/.sat_models`. Change the default location by setting the environment variable `SAT_HOME`. For example, if
  you want to save the model to `/path/to/my/models`, you can run `export SAT_HOME=/path/to/my/models` before running
  the python command.

## License

The code in this repository is open source under the [Apache-2.0 license](./LICENSE), while the use of the CogVLM model
weights must comply with the [Model License](./MODEL_LICENSE).

## Citation & Acknowledgements

If you find our work helpful, please consider citing the following papers

```
@misc{wang2023cogvlm,
      title={CogVLM: Visual Expert for Pretrained Language Models}, 
      author={Weihan Wang and Qingsong Lv and Wenmeng Yu and Wenyi Hong and Ji Qi and Yan Wang and Junhui Ji and Zhuoyi Yang and Lei Zhao and Xixuan Song and Jiazheng Xu and Bin Xu and Juanzi Li and Yuxiao Dong and Ming Ding and Jie Tang},
      year={2023},
      eprint={2311.03079},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{hong2023cogagent,
      title={CogAgent: A Visual Language Model for GUI Agents}, 
      author={Wenyi Hong and Weihan Wang and Qingsong Lv and Jiazheng Xu and Wenmeng Yu and Junhui Ji and Yan Wang and Zihan Wang and Yuxiao Dong and Ming Ding and Jie Tang},
      year={2023},
      eprint={2312.08914},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

```

In the instruction fine-tuning phase of CogVLM, we used some English image-text data from
the [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4), [LLaVA](https://github.com/haotian-liu/LLaVA), [LRV-Instruction](https://github.com/FuxiaoLiu/LRV-Instruction), [LLaVAR](https://github.com/SALT-NLP/LLaVAR)
and [Shikra](https://github.com/shikras/shikra) projects, as well as many classic cross-modal datasets. We
sincerely thank them for their contributions.


================================================
FILE: README_zh.md
================================================
# CogVLM & CogAgent

📗 [README in English](./README.md)

🌟 **跳转到详细介绍: [CogVLM介绍](#introduction-to-cogvlm)，
🆕 [CogAgent的介绍](#introduction-to-cogagent)**

📔 如需获取更详细的使用信息，请参阅: [CogVLM&CogAgent技术文档](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g)

<table>
  <tr>
    <td>
      <h2> CogVLM </h2>
      <p> 📖  Paper: <a href="https://arxiv.org/abs/2311.03079">CogVLM: Visual Expert for Pretrained Language Models</a></p>
      <p><b>CogVLM</b> 是一个强大的开源视觉语言模型（VLM）。CogVLM-17B拥有100亿的视觉参数和70亿的语言参数，支持490*490分辨率的图像理解和多轮对话。</p>
      <p><b>CogVLM-17B 在10个经典的跨模态基准测试中取得了最先进的性能，</b>包括 NoCaps, Flickr30k captioning, RefCOCO, RefCOCO+, RefCOCOg, Visual7W, GQA, ScienceQA, VizWiz VQA 和 TDIUC 基准测试。</p>
    </td>
    <td>
      <h2> CogAgent </h2>
      <p> 📖  Paper: <a href="https://arxiv.org/abs/2312.08914">CogAgent: A Visual Language Model for GUI Agents </a></p>
      <p><b>CogAgent</b> 是一个基于CogVLM改进的开源视觉语言模型。CogAgent-18B拥有110亿的视觉参数和70亿的语言参数, <b>支持1120*1120分辨率的图像理解。在CogVLM的能力之上，它进一步拥有了GUI图像Agent的能力。</b></p>
      <p> <b>CogAgent-18B 在9个经典的跨模态基准测试中实现了最先进的通用性能，</b>包括 VQAv2, OK-VQA, TextVQA, ST-VQA, ChartQA, infoVQA, DocVQA, MM-Vet, 和 POPE 测试基准。它在包括AITW和Mind2Web在内的GUI操作数据集上显著超越了现有的模型。</p>
    </td>
  </tr>
  <tr>
    <td colspan="2" align="center">
      <p>🌐 CogVLM2 在线体验: <a href="http://36.103.203.44:7861">this link</a></p>
    </td>
  </tr>
</table>


**目录**

- [CogVLM \& CogAgent](#cogvlm--cogagent)
    - [Release](#发布)
    - [开始使用](#开始使用)
        - [选项1：使用网页演示进行推理](#选项1使用网页演示进行推理)
        - [选项2：自行部署CogVLM / CogAgent](#选项2自行部署cogvlm--cogagent)
            - [Situation 2.1 CLI (SAT version)](#situation-21-cli-sat-version)
            - [Situation 2.2 CLI (Huggingface version)](#situation-22-cli-huggingface-version)
            - [Situation 2.3 Web Demo](#situation-23-web-demo)
        - [选项3：微调 CogAgent / CogVLM](#选项3微调-cogagent--cogvlm)
        - [选项4：OpenAI格式](#选项4openai格式)
        - [硬件需求](#硬件需求)
        - [Model checkpoints](#model-checkpoints)
    - [Introduction to CogVLM](#introduction-to-cogvlm)
        - [示例](#示例)
    - [Introduction to CogAgent](#introduction-to-cogagent)
        - [GUI Agent Examples](#gui-agent-examples)
    - [Cookbook](#cookbook)
        - [Task Prompts](#task-prompts)
        - [选择适合的模型](#选择适合的模型)
    - [License](#license)
    - [Citation \& Acknowledgements](#citation--acknowledgements)

## 发布
- 🔥🔥🔥  **News**: ```2024/4/5```: [CogAgent](https://arxiv.org/abs/2312.08914) 成功被评选为CVPR 2024 Highlights!
- 🔥🔥 **News**: ```2023/12/26```:我们公开了 [CogVLM-SFT-311K](dataset_zh.md) 数据集，它包含了超过15万条我们用于训练 **CogVLM v1.0(仅该模型)** 的数据。欢迎关注和使用。
- 🔥 **News**: ```2023/12/18```: **新的Streamlit用户界面**已经上线！我们已经基于Streamlit推出了新的网页用户界面，用户可以在我们的界面上轻松与CogVLM，CogAgent交谈。带来更好的用户体验。
- 🔥 **News**: ```2023/12/15```: **CogAgent 正式发布！** CogAgent是基于CogVLM开发的图像理解模型。它具有基于视觉的GUI
  Agent功能，并在图像理解方面进行了进一步的增强。它支持分辨率为1120*1120的图像输入，并具有包括与图像进行多轮对话、GUI
  Agent、Grounding等多种能力。

- **News**: ```2023/12/8```:
  我们已将cogvlm-grounding-generalist的检查点更新为cogvlm-grounding-generalist-v1.1，训练过程中增加了图像增强，因此更加稳健。查看[详情](#introduction-to-cogvlm)。

- **News**: ```2023/12/7``` CogVLM现在支持**4-bit**量化！您只需要11GB的GPU内存就可以进行推理！

- **News**: ```2023/11/20```我们已将cogvlm-chat的检查点更新为cogvlm-chat-v1.1，统一了聊天和VQA的版本，并刷新了各种数据集上的SOTA，查看[详情](#introduction-to-cogvlm)。

- **News**: ```2023/11/20``` 我们在🤗Huggingface上发布了 **[cogvlm-chat](https://huggingface.co/THUDM/cogvlm-chat-hf)**, **[cogvlm-grounding-generalist](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)/[base](https://huggingface.co/THUDM/cogvlm-grounding-base-hf)**, **[cogvlm-base-490](https://huggingface.co/THUDM/cogvlm-base-490-hf)/[224](https://huggingface.co/THUDM/cogvlm-base-224-hf)**，使用transformers 快速 [推理](#situation-22-cli-huggingface-version)。

- ```2023/10/27``` CogVLM双语版本已经在线上可用！欢迎[试用](https://chatglm.cn/)。

- ```2023/10/5``` CogVLM-17B v1.0 发布。

## 开始使用

### 选项1：使用网页演示进行推理

* 点击此处进入 [CogVLM2 Web Demo](http://36.103.203.44:7861/)。

如果您需要使用 Agent 和 Grounding 功能，请参考[Cookbook - Task Prompts](#task-prompts)。

### 选项2：自行部署CogVLM / CogAgent

我们支持两种模型推理的图形用户界面，命令行界面和网络演示。如果你想在你的Python代码中使用它，修改命令行脚本以适应你的情况。
首先，我们需要安装依赖项。

```bash
# CUDA >= 11.8
pip install -r requirements.txt
python -m spacy download en_core_web_sm
```

**所有的推理代码都位于 `basic_demo/` 目录下。请在进行进一步操作之前，先切换到这个目录。**

#### Situation 2.1 CLI (SAT version)

通过以下方式运行CLI演示：

```bash
# CogAgent
python cli_demo_sat.py --from_pretrained cogagent-chat --version chat --bf16  --stream_chat
python cli_demo_sat.py --from_pretrained cogagent-vqa --version chat_old --bf16  --stream_chat

# CogVLM
python cli_demo_sat.py --from_pretrained cogvlm-chat --version chat_old --bf16  --stream_chat
python cli_demo_sat.py --from_pretrained cogvlm-grounding-generalist --version base --bf16  --stream_chat
```

该程序将自动下载SAT模型并在命令行中进行交互。您可以通过输入指令并按回车来生成回复。输入`clear` 以清除对话历史，输入`stop` 以停止程序。

我们也支持模型并行推理，该推理将模型分割到多个（2/4/8）GPU上。使用 `--nproc-per-node=[n]` 控制使用的GPU数量。

```
torchrun --standalone --nnodes=1 --nproc-per-node=2 cli_demo_sat.py --from_pretrained cogagent-chat --version chat --bf16
```

- 如果你想手动下载权重，你可以用模型路径替换 ``--from_pretrained`` 后的路径。

- 我们的模型支持SAT的4位量化和8位量化。你可以将 ``--bf16`` 更改为 ``--fp16``, 或 ``--fp16 --quant 4``, 或 ``--fp16 --quant 8``.

  例如

    ```bash
    python cli_demo_sat.py --from_pretrained cogagent-chat --fp16 --quant 8 --stream_chat
    python cli_demo_sat.py --from_pretrained cogvlm-chat-v1.1 --fp16 --quant 4 --stream_chat
    # In SAT version，--quant should be used with --fp16
    ```

- 该程序提供以下超参数来控制生成过程：
    ```
    usage: cli_demo_sat.py [-h] [--max_length MAX_LENGTH] [--top_p TOP_P] [--top_k TOP_K] [--temperature TEMPERATURE]

    optional arguments:
    -h, --help            show this help message and exit
    --max_length MAX_LENGTH
                            max length of the total sequence
    --top_p TOP_P         top p for nucleus sampling
    --top_k TOP_K         top k for top k sampling
    --temperature TEMPERATURE
                            temperature for sampling
    ```

- 点击 [这里](#which---version-to-use) 查看不同模型与 ``--version`` 参数之间的对应关系。

#### Situation 2.2 CLI (Huggingface version)

通过以下方式运行CLI演示：

```bash
# CogAgent
python cli_demo_hf.py --from_pretrained THUDM/cogagent-chat-hf --bf16
python cli_demo_hf.py --from_pretrained THUDM/cogagent-vqa-hf --bf16

# CogVLM
python cli_demo_hf.py --from_pretrained THUDM/cogvlm-chat-hf --bf16
python cli_demo_hf.py --from_pretrained THUDM/cogvlm-grounding-generalist-hf --bf16
```

- 如果你想手动下载权重，你可以将 ``--from_pretrained`` 后的路径替换为模型路径。

- 你可以将 ``--bf16`` 更改为 ``--fp16``, 或者 ``--quant 4``。例如，我们的模型支持Huggingface的**4-bit quantization**:
    ```bash
    python cli_demo_hf.py --from_pretrained THUDM/cogvlm-chat-hf --quant 4
    ```

#### Situation 2.3 Web Demo

我们还提供了一个基于Gradio的本地网络演示。首先，通过运行 `pip install gradio` 来安装Gradio。然后下载并进入这个仓库，运行 `web_demo.py`。
详细的使用方法请参见下一节：

```bash
python web_demo.py --from_pretrained cogagent-chat --version chat --bf16
python web_demo.py --from_pretrained cogagent-vqa --version chat_old --bf16
python web_demo.py --from_pretrained cogvlm-chat-v1.1 --version chat_old --bf16
python web_demo.py --from_pretrained cogvlm-grounding-generalist --version base --bf16
```

网页演示的图形用户界面如下：

<div align="center">
    <img src=assets/web_demo-min.png width=70% />
</div>

### 选项3：微调 CogAgent / CogVLM

你可能想在你自己的任务中使用CogVLM，这需要 **不同的输出风格或领域知识**. **所有用于微调的代码都位于  ``finetune_demo/`` 目录中。**

我们在这里提供了一个使用lora进行 **验证码识别** 的微调示例。

1. 首先下载 [Captcha Images](https://www.kaggle.com/datasets/aadhavvignesh/captcha-images)数据集。下载完成后，解压ZIP文件的内容。

2. 要创建一个以80/5/15的比例进行训练/验证/测试划分，请执行以下操作：
    ```bash
    python utils/split_dataset.py
    ```

3. 使用此命令开始微调：

    ```bash
    bash finetune_demo/finetune_(cogagent/cogvlm)_lora.sh
    ```

4. 将模型合并到  `model_parallel_size=1`: (用你的训练 `MP_SIZE` 替换下面的4)

    ```bash
    torchrun --standalone --nnodes=1 --nproc-per-node=4 utils/merge_model.py --version base --bf16 --from_pretrained ./checkpoints/merged_lora_(cogagent/cogvlm490/cogvlm224)
    ```

5. 评估你的模型的性能。
    ```bash
    bash finetune_demo/evaluate_(cogagent/cogvlm).sh
    ```

### 选项4：OpenAI格式

我们提供了与 `GPT-4V` 相同的API示例，你可以在 `openai_demo` 中查看。

1. 首先，启动节点

```
python openai_demo/openai_api.py
```

2. 接下来，运行请求示例节点，这是一个连续对话的例子

```
python openai_demo/openai_api_request.py
```

3. 你将得到类似于以下的输出

```
This image showcases a tranquil natural scene with a wooden pathway leading through a field of lush green grass. In the distance, there are trees and some scattered structures, possibly houses or small buildings. The sky is clear with a few scattered clouds, suggesting a bright and sunny day.
```

### 硬件需求

* 模型推理:

  For INT4 quantization: 1 * RTX 3090(24G)   (CogAgent takes ~ 12.6GB, CogVLM takes ~ 11GB)

  For FP16: 1 * A100(80G) or 2 * RTX 3090(24G)

* 微调:

  For FP16: 4 * A100(80G) *[Recommend]* or 8* RTX 3090(24G).

### Model checkpoints

如果你从代码仓库运行 `basic_demo/cli_demo*.py`，它将自动下载SAT或Hugging Face的权重。或者，你也可以选择手动下载必要的权重。

- CogAgent

  |   模型名称    | 输入分辨率 |                             介绍                             | Huggingface model | SAT model |
  | :-----------: | :----: | :----------------------------------------------------------: | :------: | :-------: |
  | cogagent-chat |  1120  | CogAgent的聊天版本。支持GUI代理，多轮聊天和视觉定位。 |  [link](https://huggingface.co/THUDM/cogagent-chat-hf)       |    [link](https://huggingface.co/THUDM/CogAgent/tree/main)       |
  | cogagent-vqa |  1120  | CogAgent的VQA版本。在单轮视觉对话中具有更强的能力。推荐用于VQA基准测试。 |  [link](https://huggingface.co/THUDM/cogagent-vqa-hf)       |    [link](https://huggingface.co/THUDM/CogAgent/tree/main)       |

- CogVLM

  |          模型名称            | 输入分辨率 |                                               介绍                                                | Huggingface model | SAT model |
  | :-------------------------: | :----: |:-----------------------------------------------------------------------------------------------:| :------: | :-------: |
  |         cogvlm-chat-v1.1         |  490   |                    支持同时进行多轮聊天和视觉问答，支持自由的提示词。                                                    |  [link](https://huggingface.co/THUDM/cogvlm-chat-hf)        |    [link](https://huggingface.co/THUDM/CogVLM/tree/main)        |
  |       cogvlm-base-224       |  224   |      文本-图像预训练后的原始检查点。             |   [link](https://huggingface.co/THUDM/cogvlm-base-224-hf)      |     [link](https://huggingface.co/THUDM/CogVLM/tree/main)       |
  |       cogvlm-base-490       |  490   |  通过从 cogvlm-base-224 进行位置编码插值，将分辨率提升到490。  |   [link](https://huggingface.co/THUDM/cogvlm-base-490-hf)      |     [link](https://huggingface.co/THUDM/CogVLM/tree/main)       |
  | cogvlm-grounding-generalist |  490   | 此检查点支持不同的视觉定位任务，例如REC，定位字幕等。 |    [link](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)     |     [link](https://huggingface.co/THUDM/CogVLM/tree/main)       |

## Introduction to CogVLM

- CogVLM是一个强大的开源视觉语言模型（VLM）。CogVLM-17B拥有100亿的视觉参数和70亿的语言参数。
- CogVLM-17B在10个经典的跨模态基准测试中取得了最佳性能，包括 NoCaps, Flickr30k captioning, RefCOCO, RefCOCO+, RefCOCOg, Visual7W, GQA, ScienceQA, VizWiz VQA 和 TDIUC，并在 VQAv2, OKVQA, TextVQA, COCO 字幕等方面排名第二，**超越或匹敌 PaLI-X 55B**。CogVLM还可以和你聊关于图片的话题。

<div align="center">
    <img src=assets/metrics-min.png width=50% />
</div>

<details>
<summary>点击查看MM-VET，POPE，TouchStone的结果。 </summary>

<table>
    <tr>
        <td>Method</td>
        <td>LLM</td>
        <td>MM-VET</td>
        <td>POPE(adversarial)</td>
        <td>TouchStone</td>
    </tr>
    <tr>
        <td>BLIP-2</td>
        <td>Vicuna-13B</td>
        <td>22.4</td>
        <td>-</td>
        <td>-</td>
    </tr>
    <tr>
        <td>Otter</td>
        <td>MPT-7B</td>
        <td>24.7</td>
        <td>-</td>
        <td>-</td>
    </tr>
    <tr>
        <td>MiniGPT4</td>
        <td>Vicuna-13B</td>
        <td>24.4</td>
        <td>70.4</td>
        <td>531.7</td>
    </tr>
    <tr>
        <td>InstructBLIP</td>
        <td>Vicuna-13B</td>
        <td>25.6</td>
        <td>77.3</td>
        <td>552.4</td>
    </tr>
    <tr>
        <td>LLaMA-Adapter v2</td>
        <td>LLaMA-7B</td>
        <td>31.4</td>
        <td>-</td>
        <td>590.1</td>
    </tr>
    <tr>
        <td>LLaVA</td>
        <td>LLaMA2-7B</td>
        <td>28.1</td>
        <td>66.3</td>
        <td>602.7</td>
    </tr>
    <tr>
        <td>mPLUG-Owl</td>
        <td>LLaMA-7B</td>
        <td>-</td>
        <td>66.8</td>
        <td>605.4</td>
    </tr>
    <tr>
        <td>LLaVA-1.5</td>
        <td>Vicuna-13B</td>
        <td>36.3</td>
        <td>84.5</td>
        <td>-</td>
    </tr>
    <tr>
        <td>Emu</td>
        <td>LLaMA-13B</td>
        <td>36.3</td>
        <td>-</td>
        <td>-</td>
    </tr>
    <tr>
        <td>Qwen-VL-Chat</td>
        <td>-</td>
        <td>-</td>
        <td>-</td>
        <td>645.2</td>
    </tr>
    <tr>
        <td>DreamLLM</td>
        <td>Vicuna-7B</td>
        <td>35.9</td>
        <td>76.5</td>
        <td>-</td>
    </tr>
    <tr>
        <td>CogVLM</td>
        <td>Vicuna-7B</td>
        <td> <b>52.8</b> </td>
        <td><b>87.6</b></td>
        <td><b>742.0</b></td>
    </tr>
</table>

</details>

<details>
<summary>点击查看cogvlm-grounding-generalist-v1.1的结果。</summary>

<table>
    <tr>
        <td></td>
        <td>RefCOCO</td>
        <td></td>
        <td></td>
        <td>RefCOCO+</td>
        <td></td>
        <td></td>
        <td>RefCOCOg</td>
        <td></td>
        <td>Visual7W</td>
    </tr>
    <tr>
        <td></td>
        <td>val</td>
        <td>testA</td>
        <td>testB</td>
        <td>val</td>
        <td>testA</td>
        <td>testB</td>
        <td>val</td>
        <td>test</td>
        <td>test</td>
    </tr>
    <tr>
        <td>cogvlm-grounding-generalist</td>
        <td>92.51</td>
        <td>93.95</td>
        <td>88.73</td>
        <td>87.52</td>
        <td>91.81</td>
        <td>81.43</td>
        <td>89.46</td>
        <td>90.09</td>
        <td>90.96</td>
    </tr>
    <tr>
        <td>cogvlm-grounding-generalist-v1.1</td>
        <td><b>92.76</b></td>
        <td><b>94.75</b></td>
        <td><b>88.99</b></td>
        <td><b>88.68</b></td>
        <td><b>92.91</b></td>
        <td><b>83.39</b></td>
        <td><b>89.75</b></td>
        <td><b>90.79</b></td>
        <td><b>91.05</b></td>
    </tr>
</table>
</details>

### 示例

<!-- CogVLM is powerful for answering various types of visual questions, including **Detailed Description & Visual Question Answering**,  **Complex Counting**, **Visual Math Problem Solving**, **OCR-Free Reasonging**, **OCR-Free Visual Question Answering**, **World Knowledge**, **Referring Expression Comprehension**, **Programming with Visual Input**, **Grounding with Caption**, **Grounding Visual Question Answering**, etc. -->

* CogVLM能够准确地详细描述图像，几乎不会产生幻觉。
    <details>
    <summary>点击以与LLAVA-1.5和MiniGPT-4进行比较。</summary>

    <img src=assets/llava-comparison-min.png width=50% />

    </details>
    <br>

* CogVLM能理解并回答各种类型的问题，并且有一个视觉基础版本。

<div align="center">
    <img src=assets/pear_grounding.png width=50% />
</div>

<br>

* CogVLM有时比GPT-4V(ision)捕获更详细的内容。

<div align="center">
    <img src=assets/compare-min.png width=50% />
</div>

<!-- ![compare](assets/compare.png) -->
<br> 

<details>
<summary>点击以展开更多示例。</summary>

![Chat Examples](assets/chat.png)

</details>

## Introduction to CogAgent

CogAgent是一个基于CogVLM改进的开源视觉语言模型。CogAgent-18B拥有110亿的视觉参数和70亿的语言参数。

CogAgent-18B在9个经典的跨模态基准测试中实现了最先进的全能性能，包括VQAv2、OK-VQA、TextVQA、ST-VQA、ChartQA、infoVQA、DocVQA、MM-Vet和POPE。它在如AITW和Mind2Web等GUI操作数据集上显著超越了现有的模型。

除了CogVLM已有的所有功能（视觉多轮对话，视觉定位）之外，CogAgent：

1. 支持**更高分辨率**的视觉输入和对话式问答。它支持超高分辨率的图像输入，达到**1120x1120**。

2. **拥有视觉Agent的能力**，能够在任何图形用户界面截图上，为任何给定任务返回一个计划，下一步行动，以及带有坐标的特定操作。

3. **增强了与图形用户界面相关的问答能力**，使其能够处理关于任何图形用户界面截图的问题，例如网页、PC应用、移动应用等。

4. 通过改进预训练和微调，提高了OCR相关任务的能力。

<div align="center">
    <img src=assets/cogagent_function.jpg width=60% />
</div>

### GUI Agent Examples

<div align="center">
    <img src=assets/cogagent_main_demo.jpg width=90% />
</div>

## Cookbook

### Task Prompts

1. **通用多轮对话**: 随便你说什么.

2. **GUI代理任务**: 使用 [代理模板](https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L761)并用双引号括起来的任务指令替换 `\<TASK\>`。这个查询可以让CogAgent推断出计划和下一步行动。如果在查询的末尾添加`(with grounding)` 模型将返回一个带有坐标的正式化动作表示。

例如，要询问模型如何完成"在当前GUI截图上搜索CogVLM"的任务，请按照以下步骤操作：

1. 从代理模板中随机选择一个模板。这里，我们选择了``What steps do I need to take to <TASK>?``.

2. 请用双引号中的任务指令替换，例如， ``What steps do I need to take to "Search for CogVLM"?``。将此输入到模型会产生：

> Plan: 1. Type 'CogVLM' into the Google search bar. 2. Review the search results that appear. 3. Click on a relevant
> result to read more about CogVLM or access further resources.
>
> Next Action: Move the cursor to the Google search bar, and type 'CogVLM' into it.

3. 如果在末尾添加 ``(with grounding)`` 即将输入改为``What steps do I need to take to "Search for CogVLM"?(with grounding)``,那么CogAgent的输出将会是:

> Plan: 1. Type 'CogVLM' into the Google search bar. 2. Review the search results that appear. 3. Click on a relevant
> result to read more about CogVLM or access further resources.
>
> Next Action: Move the cursor to the Google search bar, and type 'CogVLM' into it.
> Grounded Operation:[combobox] Search -> TYPE: CogVLM at the box [[212,498,787,564]]

提示：对于GUI代理任务，建议每个图像只进行一轮对话以获得更好的结果。

3. **视觉定位**. 支持三种定位模式：

    - 带有定位坐标（边界框）的图像描述。使用caption_with_box模板中的任何模板作为模型输入。例如:

   > Can you provide a description of the image and include the coordinates [[x0,y0,x1,y1]] for each mentioned object?

    - 根据物体的描述返回定位坐标（边界框）。使用caption2box模板中的任何模板，将 `<expr>` 替换为物体的描述。例如:

   > Can you point out *children in blue T-shirts* in the image and provide the bounding boxes of their location?

    - 根据边界框坐标提供描述。使用box2caption模板中的模板，将 `<objs>` 替换为位置坐标。例如：

   > Tell me what you see within the designated area *[[086,540,400,760]]* in the picture.

**坐标格式:** 模型的输入和输出中的边界框坐标使用 `[[x1, y1, x2, y2]]` 格式，原点位于左上角，x轴向右，y轴向下。 (x1, y1) 和 (x2, y2) 分别是左上角和右下角，其值为相对坐标乘以1000（前缀为零，三位数）。

### 选择适合的模型

由于模型功能的差异，不同的模型版本可能会有不同的文本处理器 `--version`，这意味着使用的提示格式会有所不同。

|         model name          | --version |
|:---------------------------:|:---------:|
|        cogagent-chat        |   chat    |
|        cogagent-vqa         | chat_old  |
|         cogvlm-chat         | chat_old  |
|      cogvlm-chat-v1.1       | chat_old  |
| cogvlm-grounding-generalist |   base    |
|       cogvlm-base-224       |   base    |
|       cogvlm-base-490       |   base    |

### 常见问题

* 如果你在访问huggingface.co时遇到问题，你可以添加 `--local_tokenizer /path/to/vicuna-7b-v1.5` 来加载分词器。
* 如果你在使用🔨 [SAT](https://github.com/THUDM/SwissArmyTransformer)自动下载模型时遇到问题 , 尝试从 🤖[modelscope](https://www.modelscope.cn/models/ZhipuAI/CogVLM/summary) 或
  🤗[huggingface](https://huggingface.co/THUDM/CogVLM) 或 💡[wisemodel](https://www.wisemodel.cn/models/ZhipuAI/CogVLM) 手动下载。
* 使用🔨 SAT下载模型，模型将被保存到默认位置 `~/.sat_models` 。通过设置环境变量 `SAT_HOME` 来更改默认位置。例如，如果你想将模型保存到 `/path/to/my/models` ，你可以在运行python命令之前运行 `export SAT_HOME=/path/to/my/models`。

## License

此仓库中的代码基于[Apache-2.0 license](./LICENSE)开源，而使用CogVLM模型权重必须遵守[模型许可](./MODEL_LICENSE)。

## Citation & Acknowledgements

如果你发现我们的工作对你有所帮助，请引用以下论文
```
@misc{wang2023cogvlm,
      title={CogVLM: Visual Expert for Pretrained Language Models}, 
      author={Weihan Wang and Qingsong Lv and Wenmeng Yu and Wenyi Hong and Ji Qi and Yan Wang and Junhui Ji and Zhuoyi Yang and Lei Zhao and Xixuan Song and Jiazheng Xu and Bin Xu and Juanzi Li and Yuxiao Dong and Ming Ding and Jie Tang},
      year={2023},
      eprint={2311.03079},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{hong2023cogagent,
      title={CogAgent: A Visual Language Model for GUI Agents}, 
      author={Wenyi Hong and Weihan Wang and Qingsong Lv and Jiazheng Xu and Wenmeng Yu and Junhui Ji and Yan Wang and Zihan Wang and Yuxiao Dong and Ming Ding and Jie Tang},
      year={2023},
      eprint={2312.08914},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

```

在CogVLM的指令微调阶段，我们使用了来自 [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4), [LLAVA](https://github.com/haotian-liu/LLaVA), [LRV-Instruction](https://github.com/FuxiaoLiu/LRV-Instruction), [LLaVAR](https://github.com/SALT-NLP/LLaVAR) 和 [Shikra](https://github.com/shikras/shikra)项目的一些英文图像-文本数据，以及许多经典的跨模态工作数据集。我们衷心感谢他们的贡献。

================================================
FILE: assets/WECHAT.md
================================================
<div align="center">
<img src=wechat.jpg width="60%"/>

<p> 扫码关注公众号，加入「ChatGLM交流群」 </p>
<p> Scan the QR code to follow the official account and join the "ChatGLM Discussion Group" </p>
</div>


================================================
FILE: basic_demo/cli_demo_hf.py
================================================
"""
This is a demo for using CogAgent and CogVLM in CLI
Make sure you have installed vicuna-7b-v1.5 tokenizer model (https://huggingface.co/lmsys/vicuna-7b-v1.5), full checkpoint of vicuna-7b-v1.5 LLM is not required.
In this demo, we use the chat template; you can replace it with others such as 'vqa'.
Strongly suggest to use GPU with bfloat16 support, otherwise, it will be slow.
Note that only one picture can be processed per conversation, which means you cannot replace or insert another picture during the conversation.
"""

import argparse
import torch

from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer

# ---- Command-line options -------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--quant", choices=[4], type=int, default=None, help='quantization bits')
parser.add_argument("--from_pretrained", type=str, default="THUDM/cogagent-chat-hf", help='pretrained ckpt')
parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
parser.add_argument("--fp16", action="store_true")
parser.add_argument("--bf16", action="store_true")

args = parser.parse_args()
MODEL_PATH = args.from_pretrained
TOKENIZER_PATH = args.local_tokenizer
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_PATH)

# bfloat16 only when --bf16 is given; every other combination falls back to float16
# (note that --fp16 is parsed but never consulted).
torch_type = torch.bfloat16 if args.bf16 else torch.float16

print("========Use torch type as:{} with device:{}========\n\n".format(torch_type, DEVICE))

# ---- Model loading --------------------------------------------------------
# The quantized and non-quantized paths share every argument except
# `load_in_4bit`; only the non-quantized model is moved to DEVICE explicitly.
use_4bit = bool(args.quant)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch_type,
    low_cpu_mem_usage=True,
    load_in_4bit=use_4bit,
    trust_remote_code=True
)
if not use_4bit:
    model = model.to(DEVICE)
model = model.eval()

# Prompt wrapper used for the first turn of a text-only (no image) conversation.
text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"

# Outer loop: one iteration per conversation. Typing "clear" in the inner loop
# returns here so a new image (or a new text-only session) can be started.
while True:
    image_path = input("image path >>>>> ")
    if image_path == '':
        print('You did not enter image path, the following will be a plain text conversation.')
        image = None
        text_only_first_query = True
    else:
        image = Image.open(image_path).convert('RGB')

    # List of (query, response) tuples for the current conversation.
    history = []

    # Inner loop: one iteration per user turn.
    while True:
        query = input("Human:")
        if query == "clear":
            break

        if image is None:
            # Text-only mode: the whole dialogue is flattened into a single prompt.
            # Each history entry stores ONLY that turn's own segment (the templated
            # first query, or "USER: ... ASSISTANT:" for later turns). Storing the
            # already-concatenated prompt instead would repeat earlier turns every
            # time the prompt is rebuilt from history (visible from the third turn on).
            if text_only_first_query:
                turn_prompt = text_only_template.format(query)
                text_only_first_query = False
            else:
                turn_prompt = "USER: {} ASSISTANT:".format(query)

            old_prompt = ''.join(
                old_turn + " " + old_response + "\n" for old_turn, old_response in history
            )
            query = old_prompt + turn_prompt
            history_query = turn_prompt
            # NOTE(review): history is still passed through as before; with
            # template_version='base' the prompt is presumably taken from `query` alone -- confirm.
            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, template_version='base')
        else:
            history_query = query
            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])

        # Add a batch dimension and move every tensor to the target device.
        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
            'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]] if image is not None else None,
        }
        # 'cross_images' is only forwarded when the model's input builder produced it.
        if 'cross_images' in input_by_model and input_by_model['cross_images']:
            inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]

        # add any transformers params here.
        gen_kwargs = {"max_length": 2048,
                      "do_sample": False} # "temperature": 0.9
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
            # Drop the prompt tokens so only the newly generated part is decoded.
            outputs = outputs[:, inputs['input_ids'].shape[1]:]
            response = tokenizer.decode(outputs[0])
            response = response.split("</s>")[0]
            print("\nCog:", response)
        history.append((history_query, response))


================================================
FILE: basic_demo/cli_demo_sat.py
================================================
# -*- encoding: utf-8 -*-
import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import torch
import argparse
from sat.model.mixins import CachedAutoregressiveMixin
from sat.quantization.kernels import quantize
from sat.model import AutoModel


from utils.utils import chat, llama2_tokenizer, llama2_text_processor_inference, get_image_processor
from utils.models import CogAgentModel, CogVLMModel

def main():
    """Run the interactive SAT-based command-line demo.

    Parses the command-line options, loads the checkpoint through
    ``AutoModel.from_pretrained``, optionally quantizes it, and then loops:
    ask for an image path/URL, then repeatedly ask for a query and print the
    model's reply. Typing ``clear`` as a query starts over with a new image;
    typing ``stop`` (as image path or query) ends the program.

    Only rank 0 reads from stdin; when ``WORLD_SIZE`` > 1 the input is
    broadcast to the other ranks so that all of them run the same step.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--max_length", type=int, default=2048, help='max length of the total sequence')
    parser.add_argument("--top_p", type=float, default=0.4, help='top p for nucleus sampling')
    parser.add_argument("--top_k", type=int, default=1, help='top k for top k sampling')
    parser.add_argument("--temperature", type=float, default=.8, help='temperature for sampling')
    parser.add_argument("--chinese", action='store_true', help='Chinese interface')
    parser.add_argument("--version", type=str, default="chat", choices=['chat', 'vqa', 'chat_old', 'base'], help='version of language process. if there is \"text_processor_version\" in model_config.json, this option will be overwritten')
    parser.add_argument("--quant", choices=[8, 4], type=int, default=None, help='quantization bits')

    parser.add_argument("--from_pretrained", type=str, default="cogagent-chat", help='pretrained ckpt')
    parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--bf16", action="store_true")
    parser.add_argument("--stream_chat", action="store_true")
    args = parser.parse_args()
    rank = int(os.environ.get('RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))

    def read_shared_input(prompt_zh, prompt_en):
        """Read one line on rank 0 and make it available on every rank."""
        if rank == 0:
            holder = [input(prompt_zh if args.chinese else prompt_en)]
        else:
            holder = [None]
        if world_size > 1:
            # Fills `holder` in place on the non-source ranks.
            torch.distributed.broadcast_object_list(holder, 0)
        value = holder[0]
        assert value is not None
        return value

    # load model
    model, model_args = AutoModel.from_pretrained(
        args.from_pretrained,
        args=argparse.Namespace(
        deepspeed=None,
        local_rank=rank,
        rank=rank,
        world_size=world_size,
        model_parallel_size=world_size,
        mode='inference',
        skip_init=True,
        use_gpu_initialization=True if (torch.cuda.is_available() and args.quant is None) else False,
        # Quantized models are loaded on CPU first and moved to GPU after quantize().
        device='cpu' if args.quant else 'cuda',
        **vars(args)
    ), overwrite_args={'model_parallel_size': world_size} if world_size != 1 else {})
    model = model.eval()
    from sat.mpu import get_model_parallel_world_size
    assert world_size == get_model_parallel_world_size(), "world size must equal to model parallel size for cli_demo!"

    # The checkpoint's own text_processor_version, when present, wins over --version.
    language_processor_version = model_args.text_processor_version if 'text_processor_version' in model_args else args.version
    print("[Language processor version]:", language_processor_version)
    tokenizer = llama2_tokenizer(args.local_tokenizer, signal_type=language_processor_version)
    image_processor = get_image_processor(model_args.eva_args["image_size"][0])
    cross_image_processor = get_image_processor(model_args.cross_image_pix) if "cross_image_pix" in model_args else None

    if args.quant:
        quantize(model, args.quant)
        if torch.cuda.is_available():
            model = model.cuda()

    model.add_mixin('auto-regressive', CachedAutoregressiveMixin())

    text_processor_infer = llama2_text_processor_inference(tokenizer, args.max_length, model.image_length)

    if rank == 0:
        if args.chinese:
            print('欢迎使用 CogAgent-CLI ，输入图像URL或本地路径读图，继续输入内容对话，clear 重新开始，stop 终止程序')
        else:
            print('Welcome to CogAgent-CLI. Enter an image URL or local file path to load an image. Continue inputting text to engage in a conversation. Type "clear" to start over, or "stop" to end the program.')

    with torch.no_grad():
        # Outer loop: one iteration per image / conversation.
        while True:
            history = None
            cache_image = None
            image_path = read_shared_input("请输入图像路径或URL： ", "Please enter the image path or URL: ")

            if image_path == 'stop':
                break

            query = read_shared_input("用户：", "User: ")

            # Inner loop: one iteration per user turn on the current image.
            while True:
                if query == "clear":
                    break
                if query == "stop":
                    sys.exit(0)
                try:
                    response, history, cache_image = chat(
                        image_path,
                        model,
                        text_processor_infer,
                        image_processor,
                        query,
                        history=history,
                        cross_img_processor=cross_image_processor,
                        image=cache_image,
                        max_length=args.max_length,
                        top_p=args.top_p,
                        temperature=args.temperature,
                        top_k=args.top_k,
                        invalid_slices=text_processor_infer.invalid_slices,
                        args=args
                        )
                except Exception as e:
                    # Best effort: report the failure and go back to the image prompt.
                    print(e)
                    break
                if rank == 0 and not args.stream_chat:
                    if args.chinese:
                        print("模型："+response)
                    else:
                        print("Model: "+response)
                # After the first turn the image returned by chat() is reused via cache_image.
                image_path = None
                query = read_shared_input("用户：", "User: ")

# Script entry point: start the interactive CLI demo when run directly.
if __name__ == "__main__":
    main()


================================================
FILE: basic_demo/web_demo.py
================================================
"""
This script is a simple web demo of the CogVLM and CogAgent models, designed for easy and quick demonstrations.
For a more sophisticated user interface, users are encouraged to refer to the 'composite_demo',
which is built with a more aesthetically pleasing Streamlit framework.

Usage:
- Use the interface to upload images and enter text prompts to interact with the models.

Requirements:
- Gradio 3.x (4.x is not supported) and other necessary Python dependencies must be installed.
- Proper model checkpoints should be accessible as specified in the script.

Note: This demo is ideal for a quick showcase of the CogVLM and CogAgent models. For a more comprehensive and interactive
experience, refer to the 'composite_demo'.
"""
import gradio as gr
import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from PIL import Image
import torch
import time
from sat.model.mixins import CachedAutoregressiveMixin
from sat.mpu import get_model_parallel_world_size
from sat.model import AutoModel


from utils.utils import chat, llama2_tokenizer, llama2_text_processor_inference, get_image_processor, parse_response
from utils.models import CogAgentModel, CogVLMModel


# HTML heading rendered at the top of the page.
DESCRIPTION = '''<h1 style='text-align: center'> <a href="https://github.com/THUDM/CogVLM">CogVLM / CogAgent</a> </h1>'''

# Attribution note rendered below the heading.
NOTES = '<h3> This app is adapted from <a href="https://github.com/THUDM/CogVLM">https://github.com/THUDM/CogVLM</a>. It would be recommended to check out the repo if you want to see the detail of our model, CogVLM & CogAgent. </h3>'

# Troubleshooting hints rendered at the bottom of the page.
MAINTENANCE_NOTICE1 = 'Hint 1: If the app report "Something went wrong, connection error out", please turn off your proxy and retry.<br>Hint 2: If you upload a large size of image like 10MB, it may take some time to upload and process. Please be patient and wait.'


# Pointer to the agent prompt templates, rendered above the input box.
AGENT_NOTICE = 'Hint 1: To use <strong>Agent</strong> function, please use the <a href="https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L761">prompts for agents</a>.'

# Pointer to the grounding prompt templates, rendered above the input box.
GROUNDING_NOTICE = 'Hint 2: To use <strong>Grounding</strong> function, please use the <a href="https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L344">prompts for grounding</a>.'


# Chat history the chatbot is reset to by the clear callbacks.
default_chatbox = [("", "Hi, What do you want to know about this image?")]


# Module-level handles filled in by main() and read by post().
# NOTE(review): cross_image_processor is also declared global in main()/post()
# but is not initialised here; it only exists after main() has run.
model = image_processor = text_processor_infer = None

# Set by main() to True when the checkpoint name contains 'grounding'.
is_grounding = False

def process_image_without_resize(image_prompt):
    """Open an image file and derive the path for its grounding visualisation.

    Args:
        image_prompt: Path of the image file to open.

    Returns:
        A ``(image, filename_grounding)`` tuple: the opened PIL image (not
        resized) and a path of the form
        ``examples/<unix-timestamp>_grounding<original extension>``.
    """
    pil_image = Image.open(image_prompt)
    created_at = int(time.time())
    _, extension = os.path.splitext(image_prompt)
    grounding_path = "examples/{}_grounding{}".format(created_at, extension)
    return pil_image, grounding_path

from sat.quantization.kernels import quantize

def load_model(args):
    """Load the SAT checkpoint and build the processors used by the web demo.

    Args:
        args: Parsed command-line namespace. This function reads
            ``from_pretrained``, ``fp16``, ``bf16``, ``quant``, ``version``,
            ``local_tokenizer`` and ``max_length``.

    Returns:
        A ``(model, image_processor, cross_image_processor, text_processor_infer)``
        tuple. ``cross_image_processor`` is ``None`` when the checkpoint's
        arguments contain no ``cross_image_pix`` entry.
    """
    # Imported locally and derived locally so the function does not depend on
    # names that only exist when this file is executed as a script (argparse and
    # world_size are otherwise defined solely under the __main__ guard).
    import argparse
    world_size = int(os.environ.get('WORLD_SIZE', 1))

    model, model_args = AutoModel.from_pretrained(
        args.from_pretrained,
        args=argparse.Namespace(
        deepspeed=None,
        local_rank=0,
        rank=0,
        world_size=world_size,
        model_parallel_size=world_size,
        mode='inference',
        fp16=args.fp16,
        bf16=args.bf16,
        skip_init=True,
        use_gpu_initialization=True if (torch.cuda.is_available() and args.quant is None) else False,
        # Quantized models are loaded on CPU first and moved to GPU after quantize().
        device='cpu' if args.quant else 'cuda'),
        overwrite_args={'model_parallel_size': world_size} if world_size != 1 else {}
    )
    model = model.eval()
    assert world_size == get_model_parallel_world_size(), "world size must equal to model parallel size for web_demo!"

    # The checkpoint's own text_processor_version, when present, wins over --version.
    language_processor_version = model_args.text_processor_version if 'text_processor_version' in model_args else args.version
    tokenizer = llama2_tokenizer(args.local_tokenizer, signal_type=language_processor_version)
    image_processor = get_image_processor(model_args.eva_args["image_size"][0])
    cross_image_processor = get_image_processor(model_args.cross_image_pix) if "cross_image_pix" in model_args else None

    if args.quant:
        quantize(model, args.quant)
        if torch.cuda.is_available():
            model = model.cuda()
    model.add_mixin('auto-regressive', CachedAutoregressiveMixin())

    text_processor_infer = llama2_text_processor_inference(tokenizer, args.max_length, model.image_length)

    return model, image_processor, cross_image_processor, text_processor_infer


def post(
        input_text,
        temperature,
        top_p,
        top_k,
        image_prompt,
        result_previous,
        hidden_image,
        state
        ):
    """Gradio callback: run one chat turn and return the updated chat history.

    Args:
        input_text: The user's prompt for this turn.
        temperature, top_p, top_k: Sampling parameters forwarded to ``chat``.
        image_prompt: Path of the uploaded image file.
        result_previous: Current chatbot content as a sequence of
            ``(user, assistant)`` entries.
        hidden_image: Passed through to the output unchanged.
        state: Dict holding the parsed CLI namespace under ``'args'``.

    Returns:
        ``("", history, hidden_image)`` -- an empty string to clear the input
        box, the new chat history, and ``hidden_image`` untouched. If anything
        raises during image loading or generation, the history gets a
        "Timeout!" reply for this turn instead of a model answer.
    """
    global model, image_processor, cross_image_processor, text_processor_infer, is_grounding

    # Normalise entries to tuples, then drop rows without a user message
    # (the greeting row and the image-only rows appended for grounding).
    all_pairs = [(entry[0], entry[1]) for entry in result_previous]
    result_text = [pair for pair in all_pairs if not (pair[0] == "" or pair[0] is None)]
    print(f"history {result_text}")

    try:
        with torch.no_grad():
            pil_img, image_path_grounding = process_image_without_resize(image_prompt)
            response, _, cache_image = chat(
                    image_path="",
                    model=model,
                    text_processor=text_processor_infer,
                    img_processor=image_processor,
                    query=input_text,
                    history=result_text,
                    cross_img_processor=cross_image_processor,
                    image=pil_img,
                    max_length=2048,
                    top_p=top_p,
                    temperature=temperature,
                    top_k=top_k,
                    invalid_slices=getattr(text_processor_infer, "invalid_slices", []),
                    no_prompt=False,
                    args=state['args']
            )
    except Exception as e:
        print("error message", e)
        result_text.append((input_text, 'Timeout! Please wait a few minutes and retry.'))
        return "", result_text, hidden_image

    if not is_grounding:
        result_text.append((input_text, response))
    else:
        # Grounding checkpoints: render the result to image_path_grounding and
        # show that image as an extra chatbot row after the text answer.
        parse_response(pil_img, response, image_path_grounding)
        result_text.append((input_text, response.replace(input_text, "")))
        result_text.append((None, (image_path_grounding,)))
    print(result_text)
    print('finished')
    return "", result_text, hidden_image


def clear_fn(value):
    """Reset callback: empty input text, default chat history, no image.

    ``value`` is supplied by Gradio and is not used.
    """
    cleared_text, cleared_image = "", None
    return cleared_text, default_chatbox, cleared_image

def clear_fn2(value):
    """Reset callback that restores only the default chat history.

    ``value`` is supplied by Gradio and is not used.
    """
    restored_history = default_chatbox
    return restored_history


def main(args):
    """Load the model, build the Gradio UI and launch the web demo.

    Args:
        args: Parsed command-line namespace; passed to ``load_model`` and
            stored in a ``gr.State`` so ``post`` can read it as ``state['args']``.
    """
    global model, image_processor, cross_image_processor, text_processor_infer, is_grounding
    model, image_processor, cross_image_processor, text_processor_infer = load_model(args)
    # Grounding mode is inferred from the checkpoint name.
    is_grounding = 'grounding' in args.from_pretrained
    
    gr.close_all()

    with gr.Blocks(css='style.css') as demo:
        state = gr.State({'args': args})

        gr.Markdown(DESCRIPTION)
        gr.Markdown(NOTES)
        

        with gr.Row():
            # Left column: hints, prompt box, buttons, image upload, sampling sliders.
            with gr.Column(scale=5):
                with gr.Group():
                    gr.Markdown(AGENT_NOTICE)
                    gr.Markdown(GROUNDING_NOTICE)
                    input_text = gr.Textbox(label='Input Text', placeholder='Please enter text prompt below and press ENTER.')
                    
                    with gr.Row():
                        run_button = gr.Button('Generate')
                        clear_button = gr.Button('Clear')

                    # type="filepath": post() receives the uploaded image as a file path.
                    image_prompt = gr.Image(type="filepath", label="Image Prompt", value=None)

                with gr.Row():
                    temperature = gr.Slider(maximum=1, value=0.8, minimum=0, label='Temperature')
                    top_p = gr.Slider(maximum=1, value=0.4, minimum=0, label='Top P')
                    top_k = gr.Slider(maximum=100, value=10, minimum=1, step=1, label='Top K')

            # Right column: conversation history.
            with gr.Column(scale=5):
                result_text = gr.components.Chatbot(label='Multi-round conversation History', value=[("", "Hi, What do you want to know about this image?")], height=600)
                # Invisible textbox that post() receives and returns unchanged.
                hidden_image_hash = gr.Textbox(visible=False)


        gr.Markdown(MAINTENANCE_NOTICE1)

        print(gr.__version__)
        # Both the Generate button and pressing ENTER in the textbox run one chat turn.
        run_button.click(fn=post,inputs=[input_text, temperature, top_p, top_k, image_prompt, result_text, hidden_image_hash, state],
                         outputs=[input_text, result_text, hidden_image_hash])
        input_text.submit(fn=post,inputs=[input_text, temperature, top_p, top_k, image_prompt, result_text, hidden_image_hash, state],
                         outputs=[input_text, result_text, hidden_image_hash])
        # Clear resets text, history and image; uploading or removing an image resets the history only.
        clear_button.click(fn=clear_fn, inputs=clear_button, outputs=[input_text, result_text, image_prompt])
        image_prompt.upload(fn=clear_fn2, inputs=clear_button, outputs=[result_text])
        image_prompt.clear(fn=clear_fn2, inputs=clear_button, outputs=[result_text])


    # demo.queue(concurrency_count=10)
    demo.launch()


# Script entry point: parse the command-line options and start the web demo.
if __name__ == '__main__':
    # argparse, rank and world_size are deliberately left as module-level names
    # here, because other functions in this file may reference them as globals.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--max_length", type=int, default=2048, help='max length of the total sequence')
    parser.add_argument("--top_p", type=float, default=0.4, help='top p for nucleus sampling')
    parser.add_argument("--top_k", type=int, default=1, help='top k for top k sampling')
    parser.add_argument("--temperature", type=float, default=.8, help='temperature for sampling')
    parser.add_argument("--version", type=str, default="chat", choices=['chat', 'vqa', 'chat_old', 'base'], help='version of language process. if there is \"text_processor_version\" in model_config.json, this option will be overwritten')
    parser.add_argument("--quant", choices=[8, 4], type=int, default=None, help='quantization bits')
    parser.add_argument("--from_pretrained", type=str, default="cogagent-chat", help='pretrained ckpt')
    parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--bf16", action="store_true")
    parser.add_argument("--stream_chat", action="store_true")
    # Parsed once; the original parsed the same argv a second time to no effect.
    args = parser.parse_args()
    rank = int(os.environ.get('RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))
    main(args)


================================================
FILE: composite_demo/client.py
================================================
from __future__ import annotations
from threading import Thread

import streamlit as st
import torch
import warnings
import os

from typing import Any, Protocol
from collections.abc import Iterable
from huggingface_hub.inference._text_generation import TextGenerationStreamResponse, Token
from transformers import AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
from conversation import Conversation

# Pick the model dtype: bfloat16 when a CUDA device with compute capability
# major version >= 8 is present, float16 (with a warning) in every other case.
_bf16_capable = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
torch_type = torch.bfloat16 if _bf16_capable else torch.float16
if not _bf16_capable:
    warnings.warn("Your GPU does not support bfloat16 type, use fp16 instead")

# Configuration for serving all of our models (cogagent-chat, cogvlm-chat and
# cogvlm-grounding) at once, each placed on a different device.
# Every path can be overridden through the environment variable named below.
models_info = {
    'tokenizer': {
        'path': os.environ.get('TOKENIZER_PATH', 'lmsys/vicuna-7b-v1.5'),
    },
    'agent_chat': {
        'path': os.environ.get('MODEL_PATH_AGENT_CHAT', 'THUDM/cogagent-chat-hf'),
        'device': ['cuda:0']
    },
    'vlm_chat': {
        'path': os.environ.get('MODEL_PATH_VLM_CHAT', 'THUDM/cogvlm-chat-hf'),
        'device': ['cuda:3']
    },
    'vlm_grounding': {
        'path': os.environ.get('MODEL_PATH_VLM_GROUNDING','THUDM/cogvlm-grounding-generalist-hf'),
        'device': ['cuda:6']
    }
}


# If you only use one model, configure it like this instead:
# models_info = {
#     'tokenizer': {
#         'path': os.environ.get('TOKENIZER_PATH', 'lmsys/vicuna-7b-v1.5'),
#     },
#     'agent_chat': {
#         'path': os.environ.get('MODEL_PATH_AGENT_CHAT', 'THUDM/cogagent-chat-hf'),
#         'device': ['cuda:0']
#     },
# }


@st.cache_resource
def get_client() -> Client:
    """
        Build the client used by the demo pages from the module-level `models_info`.

        The function is wrapped in `st.cache_resource`; the demo modules call it
        once each at import time.
    """
    return HFClient(models_info)


def process_history(history: list[Conversation]):
    """
        Split a conversation log into the pieces the model input builder needs.

        Args:
            history(list[Conversation]): Every turn of the dialogue, oldest first.
        Returns:
            query(str): Content of the final turn when that turn is a user turn,
                otherwise an empty string.
            history_pairs(list[(str,str)]): (user, assistant) pairs. An assistant turn
                is paired with the most recent unpaired user turn; assistant turns
                with no pending user turn are dropped.
            last_user_image(Image): The image of the latest user turn that carries
                one, or None.
    """
    pairs = []
    current_query = ""
    latest_image = None
    pending_user_text = None
    final_index = len(history) - 1

    for position, turn in enumerate(history):
        is_user_turn = turn.role == turn.role.USER
        if not is_user_turn:
            # Close the pending exchange, if a user turn is waiting for its answer.
            if pending_user_text is not None:
                pairs.append((pending_user_text, turn.content))
                pending_user_text = None
            continue

        pending_user_text = turn.content
        if turn.image:
            latest_image = turn.image
        if position == final_index:
            current_query = turn.content

    return current_query, pairs, latest_image


class Client(Protocol):
    """
        Structural interface of a streaming generation backend.

        Anything exposing a compatible `generate_stream` satisfies this protocol;
        `HFClient` is the implementation in this module.
    """

    def generate_stream(
            self,
            history: list[Conversation],
            grounding: bool = False,
            model_use: str = 'agent_chat',
            **parameters: Any,
    ) -> Iterable[TextGenerationStreamResponse]:
        """
            Stream the reply to the last user turn of `history`.

            Args:
                history: The dialogue so far.
                grounding: Whether a grounding answer is requested.
                model_use: Key naming which model should answer.
                **parameters: Extra generation options.
        """
        ...


class HFClient(Client):
    """
        The HFClient class manages the interaction with various large language models
        for text generation tasks. It supports handling multiple models, each designated
        for a specific task like chatting or grounding.

        Args:
            models_info (dict): A dictionary containing the configuration for each model.
                The dictionary format is:
                    - 'tokenizer': {'path': ...} for the tokenizer shared by all models.
                    - any other key (e.g. 'agent_chat', 'vlm_chat', 'vlm_grounding'):
                      {'path': ..., 'device': [...]}; one copy of the model is loaded
                      for every entry of 'device'.

        Attributes:
            tokenizer: The tokenizer loaded from models_info['tokenizer']['path'].
            models (dict[str, list]): Loaded model replicas, keyed by model name.
        """

    def __init__(self, models_info):
        """Load the tokenizer and every configured model replica."""
        self.models = {}
        self.tokenizer = AutoTokenizer.from_pretrained(models_info['tokenizer']['path'], trust_remote_code=True)
        for model_name, model_info in models_info.items():
            if model_name == 'tokenizer':
                continue
            self.models[model_name] = []
            for device in model_info['device']:
                model = AutoModelForCausalLM.from_pretrained(
                    model_info['path'],
                    torch_dtype=torch_type,
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                ).to(device).eval()
                self.models[model_name].append(model)

    def select_best_gpu(self, model_name):
        """
        Return the replica of `model_name` whose device currently has the least
        memory allocated (per `torch.cuda.memory_allocated`). Ties go to the replica
        that was loaded first.

        Raises:
            KeyError: If no replica is loaded under `model_name`. Failing here gives
                a clear message instead of an AttributeError on None further down.
        """
        replicas = self.models.get(model_name)
        if not replicas:
            raise KeyError(
                f"No model loaded for '{model_name}'; available models: {sorted(self.models)}"
            )

        def allocated_bytes(model):
            device = next(model.parameters()).device
            return torch.cuda.memory_allocated(device=device)

        # min() keeps the first minimum, matching a strict "<" scan.
        return min(replicas, key=allocated_bytes)

    def generate_stream(self,
                        history: list,
                        grounding: bool = False,
                        model_use: str = 'agent_chat',
                        **parameters: Any
                        ) -> Iterable[TextGenerationStreamResponse]:
        """
        Generates a stream of text responses based on the input history and selected model.

        Args:
            history (list[Conversation]): A list of Conversation objects representing the
                dialogue history.
            grounding (bool, optional): When True, "(with grounding)" is appended to the
                query. Defaults to False.
            model_use (str, optional): The key name of the model to be used for the generation.
                Defaults to 'agent_chat'.
            **parameters (Any): Extra keyword arguments forwarded to `model.generate`.

        Yields:
            TextGenerationStreamResponse: One response per text chunk produced by the
            streamer. Token id and logprob are placeholders (0); only `text` is meaningful.

        Raises:
            KeyError: If `model_use` names a model that is not loaded.

        Generation runs in a worker thread while this generator drains the streamer.
        """
        query, history, image = process_history(history)
        if grounding:
            query += "(with grounding)"

        model = self.select_best_gpu(model_use)
        device = next(model.parameters()).device

        # Print user input info

        print("\n== Input ==\n", query)
        print("\n==History==\n", history)
        print("\n== Model ==\n\n", model.config.name_or_path)
        print("\n== Device ==\n\n", device)

        input_by_model = model.build_conversation_input_ids(
            self.tokenizer,
            query=query,
            history=history,
            images=[image]
        )
        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(device),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(device),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(device),
            'images': [[input_by_model['images'][0].to(device).to(torch_type)]],
        }

        # CogVLM model do not have param 'cross_images', Only CogAgent have.

        if 'cross_images' in input_by_model and input_by_model['cross_images']:
            inputs['cross_images'] = [[input_by_model['cross_images'][0].to(device).to(torch_type)]]

        # Use TextIteratorStreamer for streaming generation like huggingface.

        streamer = TextIteratorStreamer(self.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = {**parameters, **inputs, 'streamer': streamer}

        def run_generation():
            # torch.no_grad() is thread-local, so it has to be entered inside the
            # worker. Entering it around the yields below would not cover this
            # thread and would leave grad mode disabled in the consumer between yields.
            with torch.no_grad():
                model.generate(**gen_kwargs)

        thread = Thread(target=run_generation)
        thread.start()
        for next_text in streamer:
            yield TextGenerationStreamResponse(
                token=Token(
                    id=0,
                    logprob=0,
                    text=next_text,
                    special=False,
                )
            )
        # The streamer is exhausted, so generation has finished; reap the worker.
        thread.join()


================================================
FILE: composite_demo/conversation.py
================================================
import requests
import re
import streamlit as st

from dataclasses import dataclass
from enum import auto, Enum
from PIL.Image import Image
from PIL import ImageDraw
from streamlit.delta_generator import DeltaGenerator


class Role(Enum):
    """
    Speaker of a conversation turn. CogVLM and CogAgent only know two roles.

    Members:
        USER: The person asking questions or starting the conversation.
        ASSISTANT: The model answering the user.

    Methods:
        get_message(self):
            Returns the Streamlit chat message container for this role: name and
            avatar "user" for USER, "assistant" for ASSISTANT. Any other value is
            reported through `st.error` and yields None.
    """

    USER = auto()
    ASSISTANT = auto()

    def get_message(self):
        role_value = self.value
        if role_value == Role.USER.value:
            return st.chat_message(name="user", avatar="user")
        if role_value == Role.ASSISTANT.value:
            return st.chat_message(name="assistant", avatar="assistant")
        st.error(f'Unexpected role: {self}')


@dataclass
class Conversation:
    """
    Represents a single conversation turn within a dialogue.
    Attributes:
        role (Role): The role of the speaker in the conversation (USER or ASSISTANT).
        content (str): The text exchanged with the model for this turn.
        image (Image, optional): An optional image associated with the conversation turn.
        content_show (str, optional): The content to be displayed in the WebUI. This may differ
            from `content` if translation or other processing is applied.
        translate (bool, optional): Whether to translate the content of the conversation turn.

    Methods:
        __str__(self) -> str:
            Returns "<role>\\n<content>" for the turn.

        show(self, placeholder: DeltaGenerator | None = None) -> None:
            Displays the conversation turn in the WebUI. If `placeholder` is provided, the content
            is shown in the specified Streamlit container. Otherwise, it uses the message style
            determined by the role.
    """

    role: Role = Role.USER
    content: str = ""
    image: Image | None = None
    content_show: str | None = None
    translate: bool = False

    def __str__(self) -> str:
        # Pure string conversion: no debug printing, and always a str so that
        # str()/f-strings never fail with "__str__ returned non-string".
        return f'{self.role}\n{self.content}'

    def show(self, placeholder: DeltaGenerator | None = None) -> None:
        """
        Render this turn as markdown, followed by its image if it has one.

        Besides rendering, this synchronises `content` and `content_show`:
          * USER: `content` is derived from `content_show` (translated zh -> en when
            `translate` is set). If only `content` was supplied, it is used as the
            displayed text instead of being overwritten with None.
          * ASSISTANT: `content_show` is derived from `content` (translated en -> zh
            when `translate` is set), with newlines turned into markdown line breaks.

        Args:
            placeholder: Streamlit container to render into; when falsy, a chat
                message container for the role is created.
        """
        if placeholder:
            message = placeholder
        else:
            message = self.role.get_message()

        # for Chinese WebUI show
        if self.role == Role.USER:
            if self.content_show is None:
                # Turn was created with `content` only; show that text as-is.
                self.content_show = self.content
            if self.translate:
                self.content = translate_baidu(self.content_show, source_lan="zh", target_lan="en")
                if self.content == "error":
                    self.content_show = "Please Enter your Baidu Translation API Key in function translate_baidu()"
            else:
                self.content = self.content_show
        elif self.role == Role.ASSISTANT:
            if self.translate:
                self.content_show = translate_baidu(self.content, source_lan="en", target_lan="zh")
            else:
                self.content_show = self.content

            # Two trailing spaces force a line break in markdown.
            self.content_show = self.content_show.replace('\n', '  \n')

        message.markdown(self.content_show)
        if self.image:
            message.image(self.image)


def preprocess_text(history: list[Conversation]) -> str:
    """
    Flattens a conversation history into one prompt string.
     Args:
        history (list[Conversation]): The conversation history, a list of Conversation objects.

    Returns:
        str: The string form of every turn, joined without any separator, followed by
        the ASSISTANT role indicator and a newline.
    """
    pieces = [f'{conversation}' for conversation in history]
    pieces.append(f'{Role.ASSISTANT}\n')
    return "".join(pieces)


def postprocess_text(template: str, text: str) -> str:
    """
    Inserts the user's text into a prompt template.
    Args:
        template (str): A template containing the "<TASK>" placeholder, or "" for no template.
        text (str): The text to insert.

    Returns:
        str: With an empty template, the stripped text. Otherwise the template with every
        "<TASK>" replaced by the stripped text in double quotes, itself stripped.
    """
    cleaned = text.strip()
    if template == "":
        return cleaned
    return template.replace("<TASK>", f'"{cleaned}"').strip()


def postprocess_image(text: str, img: Image) -> tuple[str, Image | None]:
    """
    Draws the boxes and points mentioned in `text` onto a copy of `img`.

    Coordinates are looked up with the pattern ``[[a,b,c,d]]`` (rectangle) or ``[[a,b]]``
    (point); several groups may share one pair of brackets, separated by ``;``.
    Values are on a 0-1000 scale and are multiplied by 0.001 * width/height. Every
    ``[[...]]`` match gets its own color, cycling through a fixed palette.

    The input image is never modified: callers keep the same object in the user turn
    of the history, so drawing in place would alter the stored image and break later
    same-image comparisons.

    Args:
        text (str): The text containing bounding box coordinates in the pattern above.
        img (Image): The image the coordinates refer to.
    Returns:
        tuple[str, Image | None]: The unchanged text, and an annotated copy of the image,
        or None when the text contains no coordinate pattern. Groups that are not 2 or 4
        numbers, or contain empty fields (e.g. "1,,3,4"), are skipped.
    """
    colors = ["red", "green", "blue", "yellow", "purple", "orange"]

    # Matches single or multiple coordinate groups
    pattern = r"\[\[([\d,]+(?:;[\d,]+)*)\]\]"
    matches = re.findall(pattern, text)

    if not matches:
        return text, None

    annotated = img.copy()
    draw = ImageDraw.Draw(annotated)

    for i, match in enumerate(matches):
        # Determining the color for the current match
        color = colors[i % len(colors)]

        # Splitting the matched string into individual coordinate groups
        for coords_str in match.split(';'):
            coords = coords_str.split(',')
            if len(coords) not in (2, 4):
                continue
            try:
                values = [float(value) for value in coords]
            except ValueError:
                # Empty field from a stray comma; ignore this group.
                continue

            if len(values) == 4:  # Rectangle
                x0 = int(values[0] * 0.001 * img.width)
                y0 = int(values[1] * 0.001 * img.height)
                x1 = int(values[2] * 0.001 * img.width)
                y1 = int(values[3] * 0.001 * img.height)
                # Pillow rejects boxes whose second corner precedes the first.
                left, right = sorted((x0, x1))
                top, bottom = sorted((y0, y1))
                draw.rectangle((left, top, right, bottom), outline=color, width=3)
            else:  # Point
                x = int(values[0] * 0.001 * img.width)
                y = int(values[1] * 0.001 * img.height)
                radius = 5
                draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color)

    return text, annotated

def translate_baidu(translate_text, source_lan, target_lan, timeout=10.0):
    """
        Translates text using Baidu's translation service. (if you are not use English)

        This function sends a request to the Baidu translation API to translate the provided text
        from the source language to the target language. The access token at the end of `url`
        is empty in this file and has to be filled in for the request to succeed.

        Args:
            translate_text (str): The text to be translated.
            source_lan (str): The source language code (e.g., "en" for English).
            target_lan (str): The target language code (e.g., "zh" for Chinese).
            timeout (float): Seconds to wait for the HTTP request before giving up, so
                that an unreachable service cannot block the UI indefinitely.

        Returns:
            str: The translated segments, each followed by a newline. Returns "error" in
            case of any exception (network failure, timeout, unexpected response shape).
        """
    url = "https://aip.baidubce.com/rpc/2.0/mt/texttrans/v1?access_token="
    headers = {'Content-Type': 'application/json'}
    payload = {
        'q': translate_text,
        'from': source_lan,
        'to': target_lan
    }
    try:
        r = requests.post(url, json=payload, headers=headers, timeout=timeout)
        result = r.json()
        final_translation = ''.join(
            item['dst'] + '\n' for item in result['result']['trans_result']
        )
    except Exception as e:
        # Deliberate best-effort: callers check for the "error" sentinel.
        print(e)
        return "error"
    return final_translation


================================================
FILE: composite_demo/demo_agent_cogagent.py
================================================
from io import BytesIO
import base64
import streamlit as st
import re

from streamlit.delta_generator import DeltaGenerator
from client import get_client
from conversation import postprocess_text, Conversation, Role, postprocess_image
from PIL import Image
from utils import images_are_same

client = get_client()


def append_conversation(
        conversation: Conversation,
        history: list[Conversation],
        placeholder: DeltaGenerator | None = None,
) -> None:
    """
    Store a turn in the history and render it.

    Args:
        conversation: The turn to record.
        history: The list the turn is appended to (mutated in place).
        placeholder: Optional Streamlit container handed to `Conversation.show`.
    """
    history.append(conversation)
    conversation.show(placeholder=placeholder)


def main(
        top_p: float = 0.8,
        temperature: float = 0.95,
        prompt_text: str = "",
        metadata: str = "",
        top_k: int = 2,
        max_new_tokens: int = 2048,
        grounding: bool = False,
        retry: bool = False,
        template: str = ""
):
    """
    Run one Streamlit pass of the CogAgent agent demo.

    Args:
        top_p, temperature, top_k, max_new_tokens: Sampling options forwarded to the client.
        prompt_text: The user's new message. An empty string without `retry` clears the
            stored history and returns.
        metadata: Base64-encoded image bytes; empty means no image.
        grounding: Forwarded to the client's `generate_stream`.
        retry: Re-ask the last user turn: it and everything after it are removed from
            the history and its displayed text becomes the prompt again.
        template: Agent template applied to new prompts through `postprocess_text`
            (not re-applied on retry).
    """
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    if prompt_text == "" and not retry:
        print("\n== Clean ==\n")
        st.session_state.chat_history = []
        return

    history: list[Conversation] = st.session_state.chat_history
    for conversation in history:
        conversation.show()

    if retry:
        print("\n== Retry ==\n")
        last_user_conversation_idx = None
        for idx, conversation in enumerate(history):
            if conversation.role == Role.USER:
                last_user_conversation_idx = idx
        if last_user_conversation_idx is not None:
            prompt_text = history[last_user_conversation_idx].content_show
            del history[last_user_conversation_idx:]

    if prompt_text:
        image = Image.open(BytesIO(base64.b64decode(metadata))).convert('RGB') if metadata else None
        # `image` is None when no metadata was given; only resize a real image.
        if image is not None:
            image.thumbnail((1120, 1120))
        image_input = image
        if history and image:
            last_user_image = next(
                (conv.image for conv in reversed(history) if conv.role == Role.USER and conv.image), None)
            # Do not attach the image again if it is the one already in the history.
            if last_user_image and images_are_same(image, last_user_image):
                image_input = None

            # Not necessary to clear history
            # else:
            #     # new picture means new conversation
            #     st.session_state.chat_history = []
            #     history = []

        # Set conversation: prompts containing CJK ideographs are translated.
        translate = bool(re.search('[\u4e00-\u9fff]', prompt_text))

        user_conversation = Conversation(
            role=Role.USER,
            translate=translate,
            content_show=prompt_text.strip() if retry else postprocess_text(template=template,
                                                                            text=prompt_text.strip()),
            image=image_input
        )
        append_conversation(user_conversation, history)
        placeholder = st.empty()
        # Inner slot that is overwritten with the partial answer while streaming.
        stream_slot = placeholder.chat_message(name="assistant", avatar="assistant").empty()

        # stream Answer
        output_text = ''
        for response in client.generate_stream(
                model_use='agent_chat',
                grounding=grounding,
                history=history,
                do_sample=True,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
        ):
            output_text += response.token.text
            stream_slot.markdown(output_text.strip() + '▌')

        ## Final Answer with image.
        print("\n==Output:==\n", output_text)
        if image is not None:
            content_output, image_output = postprocess_image(output_text, image)
        else:
            # Nothing to draw on.
            content_output, image_output = output_text, None
        assistant_conversation = Conversation(
            role=Role.ASSISTANT,
            content=content_output,
            image=image_output,
            translate=translate,
        )
        append_conversation(
            conversation=assistant_conversation,
            history=history,
            placeholder=placeholder.chat_message(name="assistant", avatar="assistant"),
        )


================================================
FILE: composite_demo/demo_chat_cogagent.py
================================================
import streamlit as st
import base64
import re

from PIL import Image
from io import BytesIO
from streamlit.delta_generator import DeltaGenerator
from client import get_client
from utils import images_are_same
from conversation import Conversation, Role, postprocess_image, postprocess_text

client = get_client()


def append_conversation(
        conversation: Conversation,
        history: list[Conversation],
        placeholder: DeltaGenerator | None = None,
) -> None:
    """
    Add a turn to the history, then display it.

    Args:
        conversation: The turn to record.
        history: The list the turn is appended to (mutated in place).
        placeholder: Optional Streamlit container handed to `Conversation.show`.
    """
    history.append(conversation)
    conversation.show(placeholder=placeholder)


def main(
        top_p: float = 0.8,
        temperature: float = 0.95,
        prompt_text: str = "",
        metadata: str = "",
        top_k: int = 2,
        max_new_tokens: int = 2048,
        grounding: bool = False,
        retry: bool = False,
        template: str = "",
):
    """
    Run one Streamlit pass of the CogAgent chat demo.

    Args:
        top_p, temperature, top_k, max_new_tokens: Sampling options forwarded to the client.
        prompt_text: The user's new message. An empty string without `retry` clears the
            stored history and returns.
        metadata: Base64-encoded image bytes; empty means no image.
        grounding: Forwarded to the client's `generate_stream`.
        retry: Re-ask the last user turn: it and everything after it are removed from
            the history and its displayed text becomes the prompt again.
        template: Template applied to new prompts through `postprocess_text`
            (not re-applied on retry).

    An image that differs from the one already in the history starts a new conversation.
    """
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    if prompt_text == "" and not retry:
        print("\n== Clean ==\n")
        st.session_state.chat_history = []
        return

    history: list[Conversation] = st.session_state.chat_history
    for conversation in history:
        conversation.show()
    if retry:
        last_user_conversation_idx = None
        for idx, conversation in enumerate(history):
            if conversation.role == Role.USER:
                last_user_conversation_idx = idx
        if last_user_conversation_idx is not None:
            prompt_text = history[last_user_conversation_idx].content_show
            del history[last_user_conversation_idx:]

    if prompt_text:
        image = Image.open(BytesIO(base64.b64decode(metadata))).convert('RGB') if metadata else None
        # `image` is None when no metadata was given; only resize a real image.
        if image is not None:
            image.thumbnail((1120, 1120))
        image_input = image
        if history and image:
            last_user_image = next(
                (conv.image for conv in reversed(history) if conv.role == Role.USER and conv.image), None)
            if last_user_image and images_are_same(image, last_user_image):
                # Same picture as before: do not attach it a second time.
                image_input = None
            else:
                # New picture means new conversation.
                st.session_state.chat_history = []
                history = []

        # Set conversation: prompts containing CJK ideographs are translated.
        translate = bool(re.search('[\u4e00-\u9fff]', prompt_text))

        user_conversation = Conversation(
            role=Role.USER,
            translate=translate,
            content_show=prompt_text.strip() if retry else postprocess_text(template=template,
                                                                            text=prompt_text.strip()),
            image=image_input
        )
        append_conversation(user_conversation, history)
        placeholder = st.empty()
        # Inner slot that is overwritten with the partial answer while streaming.
        stream_slot = placeholder.chat_message(name="assistant", avatar="assistant").empty()

        # stream Answer
        output_text = ''
        for response in client.generate_stream(
                model_use='agent_chat',
                grounding=grounding,
                history=history,
                do_sample=True,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
        ):
            output_text += response.token.text
            stream_slot.markdown(output_text.strip() + '▌')

        print("\n==Output:==\n", output_text)
        if image is not None:
            content_output, image_output = postprocess_image(output_text, image)
        else:
            # Nothing to draw on.
            content_output, image_output = output_text, None
        assistant_conversation = Conversation(
            role=Role.ASSISTANT,
            content=content_output,
            image=image_output,
            translate=translate
        )
        append_conversation(
            conversation=assistant_conversation,
            history=history,
            placeholder=placeholder.chat_message(name="assistant", avatar="assistant")
        )


================================================
FILE: composite_demo/demo_chat_cogvlm.py
================================================
import streamlit as st
import base64
import re

from PIL import Image
from io import BytesIO
from streamlit.delta_generator import DeltaGenerator
from client import get_client
from utils import images_are_same
from conversation import Conversation, Role, postprocess_image, postprocess_text

client = get_client()


def append_conversation(
        conversation: Conversation,
        history: list[Conversation],
        placeholder: DeltaGenerator | None = None,
) -> None:
    """
    Append a turn to the history and show it in the UI.

    Args:
        conversation: The turn to record.
        history: The list the turn is appended to (mutated in place).
        placeholder: Optional Streamlit container handed to `Conversation.show`.
    """
    history.append(conversation)
    conversation.show(placeholder=placeholder)


def main(
        top_p: float = 0.8,
        temperature: float = 0.95,
        prompt_text: str = "",
        metadata: str = "",
        top_k: int = 2,
        max_new_tokens: int = 2048,
        grounding: bool = False,
        retry: bool = False,
        template: str = "",
):
    """
    Run one Streamlit pass of the CogVLM chat demo.

    Args:
        top_p, temperature, top_k, max_new_tokens: Sampling options forwarded to the client.
        prompt_text: The user's new message. An empty string without `retry` clears the
            stored history and returns.
        metadata: Base64-encoded image bytes; empty means no image.
        grounding: Selects the 'vlm_grounding' model instead of 'vlm_chat'. The client's
            own `grounding` flag is always passed as False here.
        retry: Re-ask the last user turn: it and everything after it are removed from
            the history and its displayed text becomes the prompt again.
        template: Grounding template applied to new prompts through `postprocess_text`
            (not re-applied on retry).

    An image that differs from the one already in the history starts a new conversation.
    """
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    if prompt_text == "" and not retry:
        print("\n== Clean ==\n")
        st.session_state.chat_history = []
        return

    history: list[Conversation] = st.session_state.chat_history
    for conversation in history:
        conversation.show()
    if retry:
        last_user_conversation_idx = None
        for idx, conversation in enumerate(history):
            if conversation.role == Role.USER:
                last_user_conversation_idx = idx
        if last_user_conversation_idx is not None:
            prompt_text = history[last_user_conversation_idx].content_show
            del history[last_user_conversation_idx:]

    if prompt_text:
        image = Image.open(BytesIO(base64.b64decode(metadata))).convert('RGB') if metadata else None
        # `image` is None when no metadata was given; only resize a real image.
        if image is not None:
            image.thumbnail((1120, 1120))
        image_input = image
        if history and image:
            last_user_image = next(
                (conv.image for conv in reversed(history) if conv.role == Role.USER and conv.image), None)
            if last_user_image and images_are_same(image, last_user_image):
                # Same picture as before: do not attach it a second time.
                image_input = None
            else:
                # New picture means new conversation.
                st.session_state.chat_history = []
                history = []

        # Set conversation: prompts containing CJK ideographs are translated.
        translate = bool(re.search('[\u4e00-\u9fff]', prompt_text))

        user_conversation = Conversation(
            role=Role.USER,
            translate=translate,
            content_show=prompt_text.strip() if retry else postprocess_text(template=template,
                                                                            text=prompt_text.strip()),
            image=image_input
        )
        append_conversation(user_conversation, history)
        placeholder = st.empty()
        # Inner slot that is overwritten with the partial answer while streaming.
        stream_slot = placeholder.chat_message(name="assistant", avatar="assistant").empty()

        # stream Answer
        output_text = ''
        for response in client.generate_stream(
                model_use='vlm_grounding' if grounding else 'vlm_chat',
                grounding=False,
                history=history,
                do_sample=True,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
        ):
            output_text += response.token.text
            stream_slot.markdown(output_text.strip() + '▌')

        print("\n==Output:==\n", output_text)
        if image is not None:
            content_output, image_output = postprocess_image(output_text, image)
        else:
            # Nothing to draw on.
            content_output, image_output = output_text, None
        assistant_conversation = Conversation(
            role=Role.ASSISTANT,
            content=content_output,
            image=image_output,
            translate=translate
        )
        append_conversation(
            conversation=assistant_conversation,
            history=history,
            placeholder=placeholder.chat_message(name="assistant", avatar="assistant")
        )


================================================
FILE: composite_demo/main.py
================================================
"""
This is a web demo for the chat versions of CogAgent and CogVLM.

Make sure you have installed the vicuna-7b-v1.5 tokenizer model (https://huggingface.co/lmsys/vicuna-7b-v1.5),
and a full checkpoint of vicuna-7b-v1.5 LLM is not required.

Note that only one image can be processed in a conversation, which means you cannot replace or insert another image
during the conversation.


The models_info parameter is explained as follows
   tokenizer: tokenizer model using vicuna-7b-v1.5 model
   agent_chat: Use the CogAgent-chat-18B model to complete the conversation task
   vlm_chat: Use the CogVLM-chat-17B model to complete the conversation task
   vlm_grounding: Use CogVLM-grounding-17B model to complete the Grounding task

Web Demo user operation logic is as follows:
    CogVLM-Chat -> grounding? - yes -> Choose a template -> CogVLM-grounding-17B
                              - no  -> CogVLM-chat-17B (without grounding)

    CogAgent-Chat  -> CogAgent-chat-18B (Only QA,without Grounding)

    CogAgent-Agent -> CogAgent-chat-18B
                   -> Choose a template -> grounding? - yes -> prompt + (with grounding)
                                                      - no  -> prompt

    CogAgent-vqa-hf is not included in this demo, but you can use it in the same way as CogAgent-chat-18B
    and use it in CogAgent-Chat
"""

import streamlit as st

# Configure the page before anything else touches Streamlit: the demo modules
# imported below each call get_client() at import time.
# NOTE(review): Streamlit expects set_page_config to be the first Streamlit call — confirm.
st.set_page_config(
    page_title="CogVLM & CogAgent Demo",
    page_icon=":robot:",
    layout='centered',
    initial_sidebar_state='expanded',
)

# Deliberately imported after set_page_config (see above).
from enum import Enum
from utils import encode_file_to_base64, templates_agent_cogagent, template_grounding_cogvlm
import demo_chat_cogvlm, demo_agent_cogagent, demo_chat_cogagent

# Page heading, followed by a small note (in Chinese) linking to the usage documentation.
st.markdown("<h3>CogAgent & CogVLM Chat Demo</h3>", unsafe_allow_html=True)
st.markdown(
    "<sub>更多使用方法请参考文档: https://lslfd0slxc.feishu.cn/wiki/WvQbwIJ9tiPAxGk8ywDck6yfnof \n\n 请根据文档的引导说明来尝试demo，以便理解demo的布局设计 </sub> \n",
    unsafe_allow_html=True)


class Mode(str, Enum):
    """Demo modes; each value is the label shown in the mode selector."""

    CogVLM_Chat = '💬CogVLM-Chat'
    CogAgent_Chat = '🧑‍💻 CogAgent-Chat'
    CogAgent_Agent = '💡 CogAgent-Agent'


# Sidebar: sampling controls, image upload and the Retry / Clear History buttons.
# Note the slider defaults (temperature 0.90, top_k 5) are what the demos receive;
# the defaults declared on the demo `main` functions are not used from here.
with st.sidebar:
    top_p = st.slider(
        'top_p', 0.0, 1.0, 0.8, step=0.01
    )
    temperature = st.slider(
        'temperature', 0.01, 1.0, 0.90, step=0.01
    )
    top_k = st.slider(
        'top_k', 1, 20, 5, step=1
    )
    max_new_token = st.slider(
        'Output length', 1, 2048, 2048, step=1
    )

    uploaded_file = st.file_uploader("Choose an image...", type=['.jpg', '.png', '.jpeg'], accept_multiple_files=False)

    # Two side-by-side buttons: Retry in the left column, Clear History in the right.
    cols = st.columns(2)
    export_btn = cols[0]
    clear_history = cols[1].button("Clear History", use_container_width=True)
    retry = export_btn.button("Retry", use_container_width=True)

# Chat box for the user's message.
prompt_text = st.chat_input(
    'Chat with CogAgent | CogVLM',
    key='chat_input',
)

# Mode selector; `tab` holds the selected Mode value (a string).
tab = st.radio(
    'Mode',
    [mode.value for mode in Mode],
    horizontal=True,
    label_visibility='hidden',
)

# Grounding options. The grounding template stays "" unless grounding is ticked
# in one of the two chat modes.
selected_template_grounding_cogvlm = ""
with st.sidebar:
    grounding = st.checkbox("Grounding")
    if tab == Mode.CogVLM_Chat or tab == Mode.CogAgent_Chat:
        if grounding:
            selected_template_grounding_cogvlm = st.selectbox("Template For Grounding", template_grounding_cogvlm)

# The agent template only exists in agent mode; it is only read in that branch below.
if tab == Mode.CogAgent_Agent:
    with st.sidebar:
        selected_template_agent_cogagent = st.selectbox("Template For Agent", templates_agent_cogagent)

# An empty prompt tells the demo to clear its history (when `retry` is False)
# or to re-ask the last user turn (when `retry` is True).
if clear_history or retry:
    prompt_text = ""

# Dispatch to the demo for the selected mode. Every mode requires an uploaded image,
# which is handed over as a base64 string.
match tab:
    case Mode.CogVLM_Chat:
        st.info("This option uses cogvlm-chat and cogvlm-grounding model.")
        if uploaded_file is not None:
            demo_chat_cogvlm.main(
                retry=retry,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                prompt_text=prompt_text,
                metadata=encode_file_to_base64(uploaded_file),
                max_new_tokens=max_new_token,
                grounding=grounding,
                template=selected_template_grounding_cogvlm
            )
        else:
            st.error(f'Please upload an image to start')

    case Mode.CogAgent_Chat:
        st.info("This option uses cogagent-chat model.")
        if uploaded_file is not None:
            demo_chat_cogagent.main(
                retry=retry,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                prompt_text=prompt_text,
                metadata=encode_file_to_base64(uploaded_file),
                max_new_tokens=max_new_token,
                grounding=grounding,
                template=selected_template_grounding_cogvlm
            )
        else:
            st.error(f'Please upload an image to start')

    case Mode.CogAgent_Agent:
        st.info("This option uses cogagent-chat model with agent template.")
        if uploaded_file is not None:
            demo_agent_cogagent.main(
                retry=retry,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                prompt_text=prompt_text,
                metadata=encode_file_to_base64(uploaded_file),
                max_new_tokens=max_new_token,
                grounding=grounding,
                template=selected_template_agent_cogagent
            )
        else:
            st.error(f'Please upload an image to start')
    case _:
        st.error(f'Unexpected tab: {tab}')


================================================
FILE: composite_demo/utils.py
================================================
import base64
from io import BytesIO
from PIL import Image


def images_are_same(img1: "Image.Image", img2: "Image.Image") -> bool:
    """
    Compare two PIL images for exact equality.

    Two images are treated as the same when their ``size`` and ``mode`` match
    and the values yielded by ``getdata()`` are equal, element by element.

    Args:
        img1: First image to compare.
        img2: Second image to compare.

    Returns:
        True if size, mode and pixel data are all identical, False otherwise.
    """
    # Compare the cheap metadata first so that mismatched images never pay
    # for materialising their pixel data.
    if img1.size != img2.size or img1.mode != img2.mode:
        return False
    return list(img1.getdata()) == list(img2.getdata())


def encode_file_to_base64(file):
    """
    Read a binary file-like object and return its content as base64 text.

    Reading starts at the object's current position and runs to the end.

    Args:
        file: Object whose ``read()`` returns bytes.

    Returns:
        The base64 encoding of the bytes that were read, as a ``str``.
    """
    raw_bytes = file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()


# Prompt templates for the CogAgent_Agent mode; each one contains a <TASK> placeholder.
templates_agent_cogagent = [
    "Can you advise me on how to <TASK>?",
    "I'm looking for guidance on how to <TASK>.",
    "What steps do I need to take to <TASK>?",
    "Could you provide instructions for <TASK>?",
    "I'm wondering what the process is for <TASK>.",
    "How can I go about <TASK>?",
    "I need assistance with planning to <TASK>.",
    "Do you have any recommendations for <TASK>?",
    "Please share some tips for <TASK>.",
    "I'd like to know the best way to <TASK>.",
    "What's the most effective way to <TASK>?",
    "I'm seeking advice on accomplishing <TASK>.",
    "Could you guide me through the steps to <TASK>?",
    "I'm unsure how to start with <TASK>.",
    "Is there a strategy for successfully <TASK>?",
    "What's the proper procedure for <TASK>?",
    "How should I prepare for <TASK>?",
    "I'm not sure where to begin with <TASK>.",
    "I need some insights on <TASK>.",
    "Can you explain how to tackle <TASK>?",
    "I'm interested in the process of <TASK>.",
    "Could you enlighten me on <TASK>?",
    "What are the recommended steps for <TASK>?",
    "Is there a preferred method for <TASK>?",
    "I'd appreciate your advice on <TASK>.",
    "Can you shed light on <TASK>?",
    "What would be the best approach to <TASK>?",
    "How do I get started with <TASK>?",
    "I'm inquiring about the procedure for <TASK>.",
    "Could you share your expertise on <TASK>?",
    "I'd like some guidance on <TASK>.",
    "What's your recommendation for <TASK>?",
    "I'm seeking your input on how to <TASK>.",
    "Can you provide some insights into <TASK>?",
    "How can I successfully accomplish <TASK>?",
    "What steps are involved in <TASK>?",
    "I'm curious about the best way to <TASK>.",
    "Could you show me the ropes for <TASK>?",
    "I need to know how to go about <TASK>.",
    "What are the essential steps for <TASK>?",
    "Is there a specific method for <TASK>?",
    "I'd like to get some advice on <TASK>.",
    "Can you explain the process of <TASK>?",
    "I'm looking for guidance on how to approach <TASK>.",
    "What's the proper way to handle <TASK>?",
    "How should I proceed with <TASK>?",
    "I'm interested in your expertise on <TASK>.",
    "Could you walk me through the steps for <TASK>?",
    "I'm not sure where to begin when it comes to <TASK>.",
    "What should I prioritize when doing <TASK>?",
    "How can I ensure success with <TASK>?",
    "I'd appreciate some tips on <TASK>.",
    "Can you provide a roadmap for <TASK>?",
    "What's the recommended course of action for <TASK>?",
    "I'm seeking your guidance on <TASK>.",
    "Could you offer some suggestions for <TASK>?",
    "I'd like to know the steps to take for <TASK>.",
    "What's the most effective way to achieve <TASK>?",
    "How can I make the most of <TASK>?",
    "I'm wondering about the best approach to <TASK>.",
    "Can you share your insights on <TASK>?",
    "What steps should I follow to complete <TASK>?",
    "I'm looking for advice on <TASK>.",
    "What's the strategy for successfully completing <TASK>?",
    "How should I prepare myself for <TASK>?",
    "I'm not sure where to start with <TASK>.",
    "What's the procedure for <TASK>?",
    "Could you provide some guidance on <TASK>?",
    "I'd like to get some tips on how to <TASK>.",
    "Can you explain how to tackle <TASK> step by step?",
    "I'm interested in understanding the process of <TASK>.",
    "What are the key steps to <TASK>?",
    "Is there a specific method that works for <TASK>?",
    "I'd appreciate your advice on successfully completing <TASK>.",
    "Can you shed light on the best way to <TASK>?",
    "What would you recommend as the first step to <TASK>?",
    "How do I initiate <TASK>?",
    "I'm inquiring about the recommended steps for <TASK>.",
    "Could you share some insights into <TASK>?",
    "I'm seeking your expertise on <TASK>.",
    "What's your recommended approach for <TASK>?",
    "I'd like some guidance on where to start with <TASK>.",
    "Can you provide recommendations for <TASK>?",
    "What's your advice for someone looking to <TASK>?",
    "I'm seeking your input on the process of <TASK>.",
    "How can I achieve success with <TASK>?",
    "What's the best way to navigate <TASK>?",
    "I'm curious about the steps required for <TASK>.",
    "Could you show me the proper way to <TASK>?",
    "I need to know the necessary steps for <TASK>.",
    "What's the most efficient method for <TASK>?",
    "I'd appreciate your guidance on <TASK>.",
    "Can you explain the steps involved in <TASK>?",
    "I'm looking for recommendations on how to approach <TASK>.",
    "What's the right way to handle <TASK>?",
    "How should I manage <TASK>?",
    "I'm interested in your insights on <TASK>.",
    "Could you provide a step-by-step guide for <TASK>?",
    "I'm not sure how to start when it comes to <TASK>.",
    "What are the key factors to consider for <TASK>?",
    "How can I ensure a successful outcome with <TASK>?",
    "I'd like some tips and tricks for <TASK>.",
    "Can you offer a roadmap for accomplishing <TASK>?",
    "What's the preferred course of action for <TASK>?",
    "I'm seeking your expert advice on <TASK>.",
    "Could you suggest some best practices for <TASK>?",
    "I'd like to understand the necessary steps to complete <TASK>.",
    "What's the most effective strategy for <TASK>?",
]

# Grounding prompt templates; each one contains a <TASK> placeholder.
# The first group asks for the bounding boxes of <TASK>; the second group
# (starting at "Can you give me a description of the region ...") asks for a
# description of the region <TASK>.
# Every entry must end with a comma: adjacent string literals are silently
# concatenated by Python, which merges two templates into one.
template_grounding_cogvlm = [
    "Where is <TASK>?",
    "Where is <TASK> in the image?",
    "Where is <TASK>? answer in [[x0,y0,x1,y1]] format.",
    "Can you point out <TASK> in the image and provide the bounding boxes of its location?",
    "Help me to locate <TASK> in and give me its bounding boxes, please.",
    "In the given, could you find and tell me the bounding boxes of <TASK>?",
    "Guide me to the location of <TASK> within the image by providing its bounding boxes.",
    "I'd like to know the exact bounding boxes of <TASK> in the photo.",
    "Would you kindly provide the bounding boxes of <TASK> located in the picture?",
    "Can you find <TASK> in and give me the bounding boxes of where it is located?",
    "I'm trying to locate <TASK> in. Can you determine its bounding boxes for me?",
    "What are the bounding boxes of <TASK> in the image?",
    "Can you disclose the position of <TASK> in the photograph by stating its bounding boxes?",
    "In, could you let me know the location of <TASK> in the form of bounding boxes?",
    "I need the bounding boxes of <TASK> in, can you please assist me with that?",
    "Where in is <TASK> located? Provide me with its bounding boxes, please.",
    "May I have the bounding boxes of <TASK>?",
    "In the photograph, could you pinpoint the location of <TASK> and tell me its bounding boxes?",
    "Can you please search and find <TASK> in, then let me know its bounding boxes?",
    "Please, point out the position of <TASK> in the image by giving its bounding boxes.",
    "What are the exact bounding boxes of <TASK> in the provided picture?",
    "Detect the location of <TASK> in and share the bounding boxes with me, please.",
    "In the picture, I'd like you to locate <TASK> and provide its coordinates.",
    "Please indicate the location of <TASK> in the photo by giving bounding boxes.",
    "Find <TASK> in and share its coordinates with me.",
    "Could you please help me find the bounding boxes of <TASK> in the image?",
    "I am looking for the position of <TASK> in. Can you provide its bounding boxes?",
    "In the image, can you locate <TASK> and let me know its coordinates?",
    "I'd appreciate if you could find and tell me the bounding boxes of <TASK>.",
    "In, I need the bounding box bounding boxes of <TASK>.",
    "Point me to the location of <TASK> in the picture by providing its bounding boxes.",
    "Could you trace <TASK> in and tell me its bounding boxes?",
    "Can you assist me in locating <TASK> in, and then provide its bounding boxes?",
    "I'm curious, what are the bounding boxes of <TASK> in the photo?",
    "Kindly share the bounding boxes of <TASK> located in the image.",
    "I would like to find <TASK> in. Can you give me its bounding boxes?",
    "Can you spot <TASK> in and disclose its bounding boxes to me?",
    "Please, reveal the location of <TASK> in the provided photograph as coordinates.",
    "Help me locate and determine the bounding boxes of <TASK>.",
    "I request the bounding boxes of <TASK> in the image.",
    "In the given, can you find <TASK> and tell me its bounding boxes?",
    "I need to know the position of <TASK> in as bounding boxes.",
    "Locate <TASK> in and provide its bounding boxes, please.",
    "Assist me in finding <TASK> in the photo and provide the bounding box bounding boxes.",
    "In, can you guide me to the location of <TASK> by providing bounding boxes?",
    "I'd like the bounding boxes of <TASK> as it appears in the image.",
    "What location does <TASK> hold in the picture? Inform me of its bounding boxes.",
    "Identify the position of <TASK> in and share its bounding boxes.",
    "I'd like to request the bounding boxes of <TASK> within the photo.",
    "How can I locate <TASK> in the image? Please provide the bounding boxes.",
    "I am interested in knowing the bounding boxes of <TASK> in the picture.",
    "Assist me in locating the position of <TASK> in the photograph and its bounding box bounding boxes.",
    "In the image, I need to find <TASK> and know its bounding boxes. Can you please help?",
    "Can you give me a description of the region <TASK> in image?",
    "In the provided image, would you mind describing the selected area <TASK>?",
    "I need details about the area <TASK> located within image.",
    "Could you please share some information on the region <TASK> in this photograph?",
    "Describe what's happening within the coordinates <TASK> of the given image.",
    "What can you tell me about the selected region <TASK> in the photo?",
    "Please, can you help me understand what's inside the region <TASK> in image?",
    "Give me a comprehensive description of the specified area <TASK> in the picture.",
    "I'm curious about the area <TASK> in the following image. Can you describe it?",
    "Please elaborate on the area with the coordinates <TASK> in the visual.",
    "In the displayed image, help me understand the region defined by <TASK>.",
    "Regarding the image, what's going on in the section <TASK>?",
    "In the given photograph, can you explain the area with coordinates <TASK>?",
    "Kindly describe what I should be seeing in the area <TASK> of image.",
    "Within the input image, what can be found in the region defined by <TASK>?",
    "Tell me what you see within the designated area <TASK> in the picture.",
    "Please detail the contents of the chosen region <TASK> in the visual input.",
    "What's inside the area <TASK> of the provided graphic?",
    "I'd like some information about the specific region <TASK> in the image.",
    "Help me understand the details within the area <TASK> in photograph.",
    "Can you break down the region <TASK> in the image for me?",
    "What is taking place within the specified area <TASK> in this capture?",
    "Care to elaborate on the targeted area <TASK> in the visual illustration?",
    "What insights can you provide about the area <TASK> in the selected picture?",
    "What does the area <TASK> within the given visual contain?",
    "Analyze and describe the region <TASK> in the included photo.",
    "Please provide details for the area marked as <TASK> in this photographic.",
    "For the image, can you assess and describe what's happening at <TASK>?",
    "Fill me in about the selected portion <TASK> within the presented image.",
    "In the image, elaborate on the details found within the section <TASK>.",
    "Please interpret and describe the area <TASK> inside the given picture.",
    "What information can you give me about the coordinates <TASK> in image?",
    "Regarding the coordinates <TASK> in image, can you provide a description?",
    "In the photo, can you delve into the details of the region <TASK>?",
    "Please provide insights on the specified area <TASK> within the graphic.",
    "Detail the chosen region <TASK> in the depicted scene.",
    "Can you discuss the entities within the region <TASK> of image?",
    "I'd appreciate a breakdown of the area <TASK> in the displayed image.",
    "What's the story in the section <TASK> of the included visual?",
    "Please enlighten me about the region <TASK> in the given photo.",
    "Offer a thorough description of the area <TASK> within the illustration.",
    "What can you share about the area <TASK> in the presented image?",
    "Help me grasp the context of the region <TASK> within image.",
    "Kindly give an overview of the section <TASK> in photo.",
    "What details can you provide about the region <TASK> in the snapshot?",
    "Can you divulge the contents of the area <TASK> within the given image?",
    "In the submitted image, please give a synopsis of the area <TASK>.",
    "In the image, please describe the bounding box <TASK>.",
    "Please describe the region <TASK> in the picture.",
    "Describe the bbox <TASK> in the provided photo.",
    "What can you tell me about the area <TASK> within the image?",
    "Could you give me a description of the rectangular region <TASK> found in?",
    "In, what elements can be found within the coordinates <TASK>?",
    "Please provide details for the area within the bounding box <TASK> in.",
    "Can you generate a description for the selected region <TASK> in the image?",
    "Kindly describe the objects or scenery in the bounding box <TASK> within.",
    "What details can you provide for the rectangle defined by the coordinates <TASK> in?",
    "In relation to the picture, please describe the content of the area marked by <TASK>.",
    "I'd like to know more about the area <TASK> in the given image. Can you describe it?",
    "Can you help me by describing the part of that lies within the bounding box <TASK>?",
    "What's happening in the section of the photo enclosed by the coordinates <TASK>?",
    "Describe the image content present in the specified rectangular area <TASK> of.",
    "Please provide information about the area within the bounding box <TASK> in the picture.",
    "Could you offer a description of the contents in the selected area <TASK> of the image?",
    "I'm curious about the area <TASK> in. Can you provide a description of it?",
    "What can be observed in the rectangular region <TASK> in the photograph?",
    "Please explain what is contained in the portion of defined by the box <TASK>.",
    "In the photograph, can you describe the objects or scenery enclosed by <TASK>?",
    "Can you give a brief explanation of the specified area <TASK> in the image?",
    "What does the area <TASK> look like in the context of the image?",
    "Could you please describe the contents of the bounding box <TASK> in the given image?",
    "I would like to know more about the rectangular region <TASK> within the picture. Can you describe it?",
    "Please tell me about the area <TASK> in the image. What does it contain?",
    "Help me understand what's happening in the selected bounding box <TASK> within.",
    "Can you provide a description of the area <TASK> in the image?",
    "What sort of things can be seen in the region <TASK> of the photo?",
    "Describe what can be found within the bounds of <TASK> in the image.",
    "In, can you paint a picture of the area enclosed by coordinates <TASK>?",
    "Please provide a detailed account of the area covered by the bounding box <TASK> in.",
    "Give me a vivid description of what's happening in the area <TASK> within the snapshot.",
    "In the image, what do you observe within the rectangular box defined by the coordinates <TASK>?",
    "Could you give me a breakdown of the content in the specified area <TASK> of the picture?",
    "Please elucidate the area<TASK> of the image.",
    "I'd appreciate it if you could describe the portion of that lies within the rectangle <TASK>.",
    "Can you share some insights about the rectangular region <TASK> in the image?",
    "Help me visualize the section of the photo enclosed by the bounding box <TASK>.",
    "Would you kindly provide a description for the content within the rectangular area <TASK> of?",
    "In, can you tell me more about the area specified by the bounding box <TASK>?",
    "Please describe what can be seen in the rectangular region <TASK> of the image.",
    "Can you analyze the content of the area <TASK> within the photograph?",
    "In the provided image, please explain the content within the region <TASK>.",
    "I'm interested in the selected rectangle <TASK> in. Can you tell me more about it?",
    "Explain what can be found in the bounding box <TASK> in the context of the image.",
    "Kindly share your observations about the rectangular region <TASK> within.",
    "I'd like a thorough description of the area <TASK> in the image.",
    "Could you please provide a description of the rectangular area <TASK> in?",
    "Please describe the section of the picture defined by the bbox <TASK>.",
    "Tell me more about the scenery or objects within the rectangular region <TASK> in.",
    "Would you kindly describe the content of the area enclosed by <TASK> in the image?",
    "Help me understand the objects or scenery within the bounding box <TASK> in the image.",
    "I would like to know about the section of the image enclosed by the rectangle <TASK>. Can you describe it?",
    "Describe the selected rectangular area <TASK> in the photo.",
    "Tell me about the region <TASK> of the image.",
    "I request a description of the area <TASK> in the picture.",
    "Can you elaborate on the content of the bounding box <TASK> in?",
    "Please share details about the rectangular region <TASK> within the image.",
    "What can I find in the bbox <TASK> of the provided image?",
    "In the image, could you provide a description for the coordinates <TASK>?",
    "Could you tell me more about the area <TASK> in the snapshot?",
    "Fill me in on the details of the rectangular box <TASK> within the image.",
    "What's going on in the section of contained within the bounding box <TASK>?",
    "I would like a description of the content within the bbox <TASK> in.",
    "Please enlighten me about the area <TASK> in the photograph.",
    "Can you give me a visual rundown of the area <TASK> in?",
    "Describe the visual elements within the selected area <TASK> of the image.",
    "Tell me what you see in the area <TASK> within the context of the image.",
    "Explain the content within the rectangular region <TASK> of the image.",
    "I'd like some information about the bounding box <TASK> in the photo.",
    "What is happening within the rectangle defined by coordinates <TASK> in the image?",
    "Please describe the content within the area <TASK> displayed in the image.",
    "What can be seen in the bounding box <TASK> in the context of the provided image?",
    "Share some details about the objects or environment within the bounding box <TASK> in.",
    "Please describe the area <TASK> in the image for me.",
    "Can you generate a description of the contents within the selected region <TASK> in?",
    "What objects or scenery can be found in the area <TASK> in the image?",
    "Please tell me more about the rectangular section <TASK> in the photo.",
    "Could you describe the content of the bbox <TASK> in the image?",
    "What does the selected region <TASK> in the image encompass?",
    "I am interested in the region <TASK> of the image; please describe it.",
    "Can you provide some context for the area <TASK> within the picture?",
    "Please give me some details about the rectangle <TASK> in the image.",
    "In the photo, what can you see within the region defined by the bounding box <TASK>?",
    "I would like a detailed description of the portion of enclosed by the bbox <TASK>.",
    "Please help me understand the content present within the rectangle <TASK> in.",
    "Would you mind describing the rectangular area <TASK> in the provided image?",
]


================================================
FILE: dataset.md
================================================
# CogVLM-SFT-311K: Bilingual Visual Instruction Data in CogVLM SFT

CogVLM-SFT-311K is the primary aligned corpus used in the initial training of CogVLM v1.0. The process of constructing this dataset is as follows:
1. Approximately 3500 high-quality data samples were selected from the open source [MiniGPT-4](https://huggingface.co/datasets/Vision-CAIR/cc_sbu_align), known as minigpt4-3500.
2. Minigpt4-3500 was integrated with [Llava-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) and translated into Chinese through a language model.
3. We discovered significant noise in the detailed description part of minigpt4-3500 and Llava-instruct. Thus, we corrected these Chinese corpora and retranslated them into English.

## License

+ Due to non-commercial agreements, we did not use these data in the bilingual version of CogVLM or any other models involving commercialization.
+ The dataset license adheres to: <br> Attribution-NonCommercial 4.0 International. It should abide by the policy of OpenAI: https://openai.com/policies/terms-of-use
This means you may not use these data for any **commercial activities**.

## Dataset Address

+ [CogVLM-SFT-311K](https://huggingface.co/datasets/THUDM/CogVLM-SFT-311K)

## Dataset Information

The dataset contains three folders corresponding to the mixed part of minigpt4-3500 and llava, the llava solo conversation, and the multi-turn conversation datasets. Their layout is as follows:
```
.CogVLM-SFT-311K
├── llava_details-minigpt4_3500_formate
├── llava_instruction_multi_conversations_formate
└── llava_instruction_single_conversation_formate
```
In our open-source data, the datasets are distributed as follows:
```
.llava_details-minigpt4_3500_formate
├── images
│   └── 00000001.jpg
└── labels
    └── 00000001.json
```
Images are stored in the images folder, while labels containing the corresponding image description or dialogue are stored in the labels folder.

## Dataset Quantity

+ llava_details-minigpt4_3500_formate: 22,464 images and descriptions
+ llava_instruction_multi_conversations_formate: 56,673 images and multi-turn conversations
+ llava_instruction_single_conversation_formate: 76,634 images and single-turn conversations

## Dataset Format

Caption format for image description
```
{
  "captions": [
    {
      "role": "caption",
      "content": "The photograph features a beach scene with a group of people surfing in the ocean. There are ten individuals spaced out in the water, sitting or lying on various sizes of body boards. The surfers line up in a row, close to the sandy shoreline. Some of the body boards are visible floating on the surface of the water, with the surfers riding on them or nearby."
    }
  ]
}
```
Conversation format for image dialogue
```
{
  "conversations": [
    {
      "role": "user",
      "content": "What can be inferred about the zebras' behavior and surroundings?"
    },
    {
      "role": "assistant",
      "content": "Based on the image, we can infer that the two zebras are likely seeking relief from the sun's heat, as they are standing side by side under the branches of a thorny tree. This shade-providing tree offers some respite from the sun, possibly during the hottest part of the day. The zebras are in a green field with grass, providing them with an ideal environment to graze and eat while staying near their source of shelter. This shows that the zebras' behavior is influenced by the conditions and available resources in their surroundings. It also highlights that these animals adopt strategies to adapt to the fluctuating conditions of their environment, such as cooperation and seeking shelter, to survive and thrive in their natural habitat."
    }
  ]
}
```

## References
This project utilizes data and concepts based on the following research papers:
- Zhu, D., Chen, J., Shen, X., Li, X., & Elhoseiny, M. (2023). MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592.
- Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. arXiv:2304.08485. 

================================================
FILE: dataset_zh.md
================================================
# CogVLM-SFT-311K：CogVLM SFT 中的双语视觉指令数据集

CogVLM-SFT-311K 是我们在训练 **CogVLM v1.0** 最初版本时使用的主要对齐语料库。此数据集的构建过程如下：
1. 从开源的 [MiniGPT-4](https://huggingface.co/datasets/Vision-CAIR/cc_sbu_align) 中选取了大约3500个高质量数据样本，称为 minigpt4-3500。
2. 将 minigpt4-3500 与 [Llava-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) 整合，并通过语言模型翻译获得中文部分。
3. 我们发现在 minigpt4-3500 和 Llava-instruct 的详细描述部分存在许多噪声。因此，我们纠正了这两部分的中文语料，并将纠正后的语料重新翻译成英语。

## 许可证
+ 由于非商业协议限制，我们没有在 CogVLM的双语版本 和其他任何 涉及商业化的模型 中使用这些数据。 
+ 数据集许可证遵守：<br> Attribution-NonCommercial 4.0 International. It should abide by the policy of OpenAI: https://openai.com/policies/terms-of-use
这将不允许你使用这些数据进行任何 **商业化行为**。

## 数据集地址

+ [CogVLM-SFT-311K](https://huggingface.co/datasets/THUDM/CogVLM-SFT-311K)

## 数据集信息
数据集共有三个文件夹，分别对应 minigpt4-3500 与 llava 混合的一部分数据集、llava 单轮对话数据集和多轮对话数据集。其布局如下：
```
.CogVLM-SFT-311K
├── llava_details-minigpt4_3500_formate
├── llava_instruction_multi_conversations_formate
└── llava_instruction_single_conversation_formate
```
在我们开源的数据中，数据集按照以下格式分布
```
.llava_details-minigpt4_3500_formate
├── images
│   └── 00000001.jpg
└── labels
    └──  00000001.json
```
其中，images存储图像数据，而labels存储这张图像对应的描述或对话。

## 数据集数量
+ llava_details-minigpt4_3500_formate 22464 张图片和描述
+ llava_instruction_multi_conversations_formate 56673 张图片和多轮连续对话
+ llava_instruction_single_conversation_formate 76634 张图片和单轮对话
## 数据集格式

图像描述 Caption 格式
```
{
  "captions": [
    {
      "role": "caption",
      "content": "这张照片展示了一男一女，他们都穿着红嘿色衣服，坐在桌子旁，在参加活动时面带微笑。这对情侣正在一起摆姿势，背景中还有其他人。在整个场景中可以看到另外几个人，一些人坐在附近的桌子旁，另一些人坐得离这对夫妇更近。各种餐桌设置，如酒杯和杯子，被放置在桌子上。排在最前面的男子打着领带。"
    }
  ]
}
```
图像对话 Json 格式
``` 
{
    "conversations": [
      {
        "role": "user",
        "content": "在尝试保持这个浴室有序时可能会遇到哪些挑战？"
      },
      {
        "role": "assistant",
        "content": "在尝试保持这空间和各种物品的存在，例如杯子和手机占用了可用的空间。在图像中，浴室柜台上只有一个水槽可用，这导致多个用户可能必须共享空间放置自己的物品。这可能导致杂乱和混乱外的储物解决方案，如架子、橱柜或墙壁挂架，以保持浴室有序并最小化柜台上的杂乱。"
      }
    ]
}
```

## References
This project utilizes data and concepts based on the following research papers:
- Zhu, D., Chen, J., Shen, X., Li, X., & Elhoseiny, M. (2023). MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592.
- Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. arXiv:2304.08485.

================================================
FILE: finetune_demo/evaluate_cogagent.sh
================================================
#! /bin/bash
# Launch evaluate_cogagent_demo.py through the deepspeed launcher, using the
# options assembled below.
# export PATH=/usr/local/cuda/bin:$PATH
# export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# NUM_GPUS_PER_WORKER is handed to the run as LOCAL_WORLD_SIZE;
# MP_SIZE is passed as --model-parallel-size.
NUM_GPUS_PER_WORKER=8
MP_SIZE=1

# Resolve where this script lives.
# NOTE(review): script_dir and main_dir are not referenced again in this
# script, so the relative paths used below (./checkpoints, ./archive_split,
# hostfile, test_config_bf16.json, evaluate_cogagent_demo.py) resolve against
# the caller's working directory.
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
main_dir=$(dirname $script_dir)
MODEL_TYPE="cogagent-chat"
VERSION="chat"
# Tips: max_length should be longer than 256, to accommodate low-resolution image tokens
MODEL_ARGS="--from_pretrained ./checkpoints/ft_cogagent_model \
    --max_length 400 \
    --local_tokenizer lmsys/vicuna-7b-v1.5 \
    --version $VERSION"

# Environment assignments that are prepended to the deepspeed command line.
OPTIONS_SAT="SAT_HOME=~/.sat_models"
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2 LOCAL_WORLD_SIZE=$NUM_GPUS_PER_WORKER"
HOST_FILE_PATH="hostfile"

train_data="./archive_split/train"
test_data="./archive_split/test"

# Options forwarded to evaluate_cogagent_demo.py.
# --train-iters 0 is passed, presumably so that no training steps run and only
# evaluation happens (confirm against the sat training loop).
gpt_options=" \
       --experiment-name finetune-$MODEL_TYPE \
       --model-parallel-size ${MP_SIZE} \
       --mode finetune \
       --train-iters 0 \
       --resume-dataloader \
       $MODEL_ARGS \
       --train-data ${train_data} \
       --test-data ${test_data} \
       --distributed-backend nccl \
       --lr-decay-style cosine \
       --warmup .02 \
       --checkpoint-activations \
       --save-interval 200 \
       --eval-interval 200 \
       --save "./checkpoints" \
       --strict-eval \
       --eval-batch-size 1 \
       --split 1. \
       --deepspeed_config test_config_bf16.json \
       --skip-init \
       --seed 2023
"

              
# The command line starts with VAR=value words. It is run through eval so the
# shell treats those words as environment assignments for deepspeed.
run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --hostfile ${HOST_FILE_PATH} evaluate_cogagent_demo.py ${gpt_options}"
echo ${run_cmd}
eval ${run_cmd}

# Turn command tracing off (no matching `set -x` appears in this script).
set +x

================================================
FILE: finetune_demo/evaluate_cogagent_demo.py
================================================
import os
import torch
import argparse
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from sat import mpu, get_args, get_tokenizer
from sat.training.deepspeed_training import training_main
from sat.helpers import print_rank0
from collections import defaultdict
from functools import partial

from utils.models import FineTuneTestCogAgentModel
from utils.utils import llama2_text_processor, llama2_text_processor_inference, get_image_processor


def data_collator(examples, cross_image_processor=None):
    """
    Merge a list of per-sample dicts into a single dict of batched model arguments.

    The samples are modified in place: top-level list / numpy values are turned
    into tensors and the 'vision' / 'cross' sub-dicts are removed.

    Args:
        examples: List of sample dicts. Each may hold 'vision' and 'cross'
            sub-dicts next to its top-level fields. The last sample decides
            which keys are collected.
        cross_image_processor: When None, the 'cross' sub-dict is dropped
            without being collected.

    Returns:
        A dict in which tensor fields are concatenated with ``torch.cat`` across
        samples and non-tensor fields are taken from the last sample. Entries of
        the 'vision' / 'cross' sub-dicts appear under '<group>_<key>' names.
    """
    def _as_tensor(item):
        # Only lists and numpy arrays are converted; anything else passes through.
        if isinstance(item, list):
            return torch.tensor(item)
        if isinstance(item, np.ndarray):
            return torch.from_numpy(item)
        return item

    def _cat_tensors(values):
        # Entries that are not tensors are left out of the concatenation.
        return torch.cat([v for v in values if isinstance(v, torch.Tensor)])

    # Step 1: tensorise the top-level values of every sample (in place).
    for sample in examples:
        for name, item in sample.items():
            sample[name] = _as_tensor(item)

    reference = examples[-1]  # the last sample defines the key set

    # Step 2: gather the nested image arguments under flattened names.
    image_kwargs = {}
    for group in ('vision', 'cross'):
        if group == 'cross' and cross_image_processor is None:
            continue
        if group not in reference:
            continue
        for name in reference[group]:
            column = [sample[group][name] for sample in examples]
            flat_name = f"{group}_{name}"
            if any(isinstance(v, torch.Tensor) for v in column):
                image_kwargs[flat_name] = _cat_tensors(column)
            else:
                image_kwargs[flat_name] = reference[group][name]

    # Step 3: the nested dicts must not show up among the top-level fields.
    for sample in examples:
        sample.pop('vision', None)
        sample.pop('cross', None)

    # Step 4: batch the remaining top-level fields.
    model_args = {}
    for name, ref_item in reference.items():
        if isinstance(ref_item, torch.Tensor):
            model_args[name] = _cat_tensors([sample[name] for sample in examples])
        else:
            model_args[name] = ref_item

    # Step 5: image arguments go last and win on a name clash.
    model_args.update(image_kwargs)
    return model_args

def broadcast_auto(data_dict):
    """Broadcast every tensor entry of ``data_dict`` via ``mpu.broadcast_data``.

    Tensor keys are grouped by dtype and each group is broadcast with one
    ``mpu.broadcast_data(keys, data_dict, dtype)`` call. Entries that are not
    tensors are copied into the result unchanged, after the tensor entries.

    Args:
        data_dict: Mapping from key to tensor or arbitrary Python value.

    Returns:
        A new dict holding the broadcast tensors plus the untouched values.
    """
    keys_per_dtype = {}
    passthrough = []
    for name in data_dict:
        item = data_dict[name]
        if not isinstance(item, torch.Tensor):
            passthrough.append(name)
            continue
        keys_per_dtype.setdefault(item.dtype, []).append(name)

    # One broadcast call per dtype group.
    result = {}
    for tensor_dtype in keys_per_dtype:
        result.update(mpu.broadcast_data(keys_per_dtype[tensor_dtype], data_dict, tensor_dtype))

    # Non-tensor values are passed through as-is.
    result.update((name, data_dict[name]) for name in passthrough)
    return result

def get_batch(data_iterator, args, timers):
    """Fetch the next batch, broadcast it and cast non-integer tensors.

    Args:
        data_iterator: Iterator yielding collated batches, or ``None``
            (in which case ``None`` is handed to ``broadcast_auto``).
        args: Namespace with boolean ``fp16`` / ``bf16`` attributes.
        timers: Callable returning a timer object with ``start()``/``stop()``.

    Returns:
        The broadcast batch dict. Tensors whose dtype is neither ``int32`` nor
        ``long`` are converted to half (``args.fp16``) or bfloat16
        (``args.bf16``); with neither flag set they are left unchanged.
    """
    # Only the iterator read is timed under 'data loader'.
    timers('data loader').start()
    data = next(data_iterator) if data_iterator is not None else None
    timers('data loader').stop()

    data_b = broadcast_auto(data)
    for name in list(data_b):
        entry = data_b[name]
        if type(entry) is not torch.Tensor:
            continue
        if entry.dtype is torch.int32 or entry.dtype is torch.long:
            # Integer tensors keep their dtype.
            continue
        if args.fp16:
            data_b[name] = entry.half()
        elif args.bf16:
            data_b[name] = entry.bfloat16()
    return data_b

from torch.nn import CrossEntropyLoss
import numpy as np

from sat.model.mixins import CachedAutoregressiveMixin
from sat.generation.autoregressive_sampling import filling_sequence
from sat.generation.sampling_strategies import BaseStrategy, BeamSearchStrategy


def chat(model, tokenizer, tokens,
         max_length: int = 1800, num_beams=5, top_p=0.95, top_k=0, temperature=0.8, **kwargs):
    """Generate a continuation for the first sequence in ``tokens``.

    The prompt is moved to the device of the model's first parameter, padded
    with ``-1`` up to ``max_length`` and passed to ``filling_sequence``.

    Note: ``num_beams``, ``top_p`` and ``top_k`` are accepted but not used;
    the sampling strategy is built with fixed ``top_p=0.4`` and ``top_k=1``.

    Args:
        model: Model to sample from.
        tokenizer: Provides ``eos_token_id`` as the end token.
        tokens: Prompt token tensor; only index 0 is used.
        max_length: Length of the padded sequence buffer.
        temperature: Forwarded to ``BaseStrategy``.
        **kwargs: Forwarded to ``filling_sequence``; must contain
            ``image_rope_mask``.

    Returns:
        Element 0 of the ``filling_sequence`` result.
    """
    model_device = next(model.parameters()).device
    prompt = tokens.to(model_device)[0]
    # Positions still to be generated are marked with -1.
    padding = torch.tensor([-1] * (max_length - len(prompt)), device=prompt.device)
    seq = torch.cat([prompt, padding], dim=0)

    strategy = BaseStrategy(
        temperature=temperature,
        top_p=0.4,
        top_k=1,
        end_tokens=[tokenizer.eos_token_id],
    )
    # A BeamSearchStrategy(num_beams=num_beams, consider_end=True, ...) variant
    # was left disabled in the original code.
    get_func = llama2_text_processor_inference.get_func(
        None, None, image_rope_mask=kwargs['image_rope_mask'])
    generated = filling_sequence(
        model, seq,
        batch_size=1,
        strategy=strategy,
        get_masks_and_position_ids=get_func,
        **kwargs
    )
    return generated[0]  # drop memory


def forward_step_eval(data_iterator, model, args, timers):
    """Generate an answer for one evaluation batch and score it by exact match.

    The first ``context_length`` tokens of ``input_ids`` are used as the prompt
    for ``chat``; the generated continuation is decoded and compared with the
    decoded labels. Uses the module-level ``tokenizer``.

    Args:
        data_iterator: Iterator forwarded to ``get_batch``.
        model: Model exposing ``add_mixin`` / ``del_mixin``.
        args: Namespace providing ``ignore_pad_token_for_loss`` and ``rank``
            (plus whatever ``get_batch`` reads).
        timers: Callable returning a timer with ``start()`` / ``stop()``.

    Returns:
        ``(placeholder_loss, metrics)``: a zero scalar tensor and a dict with
        ``"acc"`` and ``"acc_w/o_case"`` tensors on the output device.
    """
    def compute_metrics(eval_preds):
        """Decode predictions and labels and return mean exact-match scores."""
        preds, labels, _ = eval_preds
        # Unwrap a tuple before calling tensor methods; checking after
        # unsqueeze (as before) could never succeed.
        if isinstance(preds, tuple):
            preds = preds[0]
        preds = preds.unsqueeze(0)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        if args.ignore_pad_token_for_loss:
            # Replace -100 in the labels as we can't decode them.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        score_dict = {
            "acc": [],
            "acc_w/o_case": [],
        }
        for pred, label in zip(decoded_preds, decoded_labels):
            if args.rank == 0:
                print('pred', pred, 'label', label, flush=True)
            score_dict['acc'].append(1. if pred == label else 0.)
            score_dict['acc_w/o_case'].append(1. if pred.lower() == label.lower() else 0.)

        return {k: float(np.mean(v)) for k, v in score_dict.items()}

    # Get the batch.
    timers('batch generator').start()
    data_b = get_batch(
        data_iterator, args, timers)
    timers('batch generator').stop()

    # Keep only the prompt part of the token sequence and of the masks.
    context_len = int(data_b['context_length'][0])
    tokens = data_b['input_ids'][:, :context_len]
    for mask_key in ('vision_expert_mask', 'image_embed_mask', 'image_rope_mask'):
        data_b[mask_key] = data_b[mask_key][:, :context_len]

    # Everything left in data_b is forwarded to chat() as keyword arguments,
    # so the entries below are removed first.
    data_b.pop('input_ids')
    data_b.pop('attention_mask')
    data_b.pop('position_ids')
    labels = data_b.pop('labels')
    data_b.pop('question_id')

    model.add_mixin('auto-regressive', CachedAutoregressiveMixin())
    try:
        outputs = chat(model, tokenizer, tokens, **data_b)[0][context_len:]
    finally:
        # Always detach the mixin, even if generation raises, so the model is
        # not left with it attached.
        model.del_mixin('auto-regressive')

    metrics = compute_metrics((outputs.cpu(), labels.cpu(), outputs.device))
    return torch.tensor(0, device=outputs.device), {k: torch.tensor(v, device=outputs.device)
                                                    for k, v in metrics.items()}


from torch.nn import CrossEntropyLoss
def forward_step(data_iterator, model, args, timers):
    """Forward step: run the model on one batch and return the LM loss.

    Args:
        data_iterator: Iterator forwarded to ``get_batch``.
        model: Callable model; element 0 of its output is used as the logits.
        args: Namespace forwarded to ``get_batch``.
        timers: Callable returning a timer with ``start()`` / ``stop()``.

    Returns:
        ``(loss, {'loss': loss})`` with the loss as a float32 tensor.
    """
    # Get the batch.
    timers('batch generator').start()
    data_b = get_batch(data_iterator, args, timers)
    labels = data_b.pop('labels')
    timers('batch generator').stop()

    lm_logits = model(**data_b)[0].to(torch.float32)

    # Shift so that tokens < n predict n: drop the first label and take the
    # equally long window of logits that ends one step before the last one.
    targets = labels[..., 1:].contiguous()
    window = targets.size(-1)
    predictions = lm_logits[..., -1 - window:-1, :].contiguous()

    # Flatten the tokens; label -100 is ignored by the criterion.
    criterion = CrossEntropyLoss(ignore_index=-100)
    flat_predictions = predictions.view(-1, predictions.size(-1))
    loss = criterion(flat_predictions, targets.view(-1)).to(torch.float32)

    return loss, {'loss': loss}

from utils.utils import ItemDataset
def create_dataset_function(image_processor, text_processor, cross_image_processor, path, args):
    """Build an ``ItemDataset`` for ``path`` with the given processors."""
    return ItemDataset(
        image_processor,
        text_processor,
        args,
        path,
        cross_image_processor=cross_image_processor,
    )

if __name__ == '__main__':
    # Script-level options are parsed here; everything this parser does not
    # recognise is forwarded to SAT's get_args().
    py_parser = argparse.ArgumentParser(add_help=False)
    py_parser.add_argument('--max_length', type=int)
    # NOTE: store_false means the attribute defaults to True and passing the
    # flag on the command line sets it to False.
    py_parser.add_argument('--ignore_pad_token_for_loss', action='store_false')
    py_parser.add_argument("--version", type=str, default="chat", help='version to interact with')
    py_parser.add_argument("--from_pretrained", type=str, default="cogagent-chat", help='pretrained ckpt')
    py_parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    py_parser.add_argument("--vit_checkpoint_activations", action='store_true')
    py_parser = FineTuneTestCogAgentModel.add_model_specific_args(py_parser)
    known, args_list = py_parser.parse_known_args()
    args = get_args(args_list)
    # Merge both namespaces into one; an option name present in both would
    # raise TypeError (duplicate keyword argument).
    args = argparse.Namespace(**vars(args), **vars(known))
    # With QLoRA the model is loaded on CPU and moved to CUDA after loading.
    if args.use_qlora:
        args.device = 'cpu'

    # model_parallel_size is only overridden when it differs from 1.
    model, args = FineTuneTestCogAgentModel.from_pretrained(args.from_pretrained, args, overwrite_args={'model_parallel_size': args.model_parallel_size} if args.model_parallel_size != 1 else {})
    if args.use_qlora and torch.cuda.is_available():
        model = model.to('cuda')
    from utils.utils import llama2_tokenizer
    # `tokenizer` is a module-level name that forward_step_eval reads directly.
    tokenizer = llama2_tokenizer(args.local_tokenizer, signal_type=args.version)
    image_processor = get_image_processor(args.eva_args["image_size"][0])
    cross_image_processor = get_image_processor(args.cross_image_pix)
    text_processor = llama2_text_processor(tokenizer, args.max_length, args.image_length)

    # The already-built model instance is passed as model_cls; the processors
    # are bound into the dataset factory and the collate function.
    training_main(args, model_cls=model, forward_step_function=forward_step, create_dataset_function=partial(create_dataset_function, image_processor, text_processor, cross_image_processor), collate_fn=partial(data_collator, cross_image_processor=cross_image_processor), forward_step_eval=forward_step_eval)

================================================
FILE: finetune_demo/evaluate_cogvlm.sh
================================================
#! /bin/bash
# Evaluation launcher: assembles the command-line options below and runs
# evaluate_cogvlm_demo.py through deepspeed.
# The script, config and data paths are relative, so this is presumably meant
# to be started from the finetune_demo/ directory (TODO confirm).
# export PATH=/usr/local/cuda/bin:$PATH
# export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# GPUs per node (passed to the job as LOCAL_WORLD_SIZE) and model-parallel size.
NUM_GPUS_PER_WORKER=8
MP_SIZE=1

# NOTE: script_dir and main_dir are computed but not referenced again below.
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
main_dir=$(dirname $script_dir)
# MODEL_TYPE is only used for the experiment name; the checkpoint that is
# actually loaded is the --from_pretrained path in MODEL_ARGS.
MODEL_TYPE="cogvlm-base-490"
VERSION="base"
MODEL_ARGS="--from_pretrained ./checkpoints/merged_lora_490 \
    --max_length 1288 \
    --lora_rank 10 \
    --use_lora \
    --local_tokenizer lmsys/vicuna-7b-v1.5 \
    --version $VERSION"
# Tips: If training models of resolution 244, you can set --max_length smaller 


# Environment variables that are prefixed to the deepspeed command line.
OPTIONS_SAT="SAT_HOME=~/.sat_models"
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2 LOCAL_WORLD_SIZE=$NUM_GPUS_PER_WORKER"
HOST_FILE_PATH="hostfile"

train_data="./archive_split/train"
test_data="./archive_split/test"

# --train-iters is 0 and a --test-data set is supplied; presumably this makes
# the run evaluation-only (confirm against SAT's training_main).
gpt_options=" \
       --experiment-name finetune-$MODEL_TYPE \
       --model-parallel-size ${MP_SIZE} \
       --mode finetune \
       --train-iters 0 \
       --resume-dataloader \
       $MODEL_ARGS \
       --train-data ${train_data} \
       --test-data ${test_data} \
       --distributed-backend nccl \
       --lr-decay-style cosine \
       --warmup .02 \
       --checkpoint-activations \
       --save-interval 200 \
       --eval-interval 200 \
       --save "./checkpoints" \
       --strict-eval \
       --eval-batch-size 1 \
       --split 1. \
       --deepspeed_config test_config_bf16.json \
       --skip-init \
       --seed 2023
"

              
# Build the full command as a string, print it, then execute it with eval so
# the leading VAR=value assignments apply to the deepspeed process.
run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --hostfile ${HOST_FILE_PATH} evaluate_cogvlm_demo.py ${gpt_options}"
echo ${run_cmd}
eval ${run_cmd}

# Disables xtrace; note that this script never enables it.
set +x

================================================
FILE: finetune_demo/evaluate_cogvlm_demo.py
================================================
import os
import torch
import argparse
from functools import partial
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from sat import mpu, get_args, get_tokenizer
from sat.training.deepspeed_training import training_main
from sat.helpers import print_rank0
from utils.models import FineTuneTestCogVLMModel
from utils.utils import llama2_text_processor, llama2_text_processor_inference, get_image_processor


def data_collator(examples):
    """Collate a list of per-sample dicts into a single batch dict.

    Empty samples (``{}``) are dropped. Top-level list / ``numpy.ndarray``
    values are converted to tensors in place. The first remaining sample is
    the reference: for each of its keys, tensor values are concatenated across
    all samples with ``torch.cat`` and any other value is taken from the
    reference sample. Entries of the nested ``'vision'`` dict are emitted under
    ``'vision_<key>'``; a ``'cross'`` entry, if present, is discarded.

    Note: the sample dicts are modified in place (values converted,
    ``'vision'`` / ``'cross'`` removed).

    Args:
        examples: List of sample dicts; each non-empty one must contain a
            ``'vision'`` dict.

    Returns:
        Dict with the batched top-level entries and the ``vision_*`` entries.

    Raises:
        IndexError: If no non-empty sample remains.
    """
    examples = [ex for ex in examples if len(ex) > 0]  # drop {}
    for example in examples:
        for k in example:
            if isinstance(example[k], list):
                example[k] = torch.tensor(example[k])
            elif isinstance(example[k], np.ndarray):
                example[k] = torch.from_numpy(example[k])

    # The first sample decides which keys exist and which of them are tensors.
    reference = examples[0]

    img_args = {}
    for k, ref_value in reference['vision'].items():
        if type(ref_value) is torch.Tensor:
            img_args['vision_' + k] = torch.cat([example['vision'][k] for example in examples])
        else:
            # Fix: take the value from the reference sample. The previous code
            # read it from `example`, a variable leaked from the conversion
            # loop above (i.e. the last sample), which was inconsistent with
            # the type check and with the top-level handling below.
            img_args['vision_' + k] = ref_value

    for example in examples:
        example.pop('vision')
        if 'cross' in example:
            example.pop('cross')

    model_args = {}
    for k, ref_value in reference.items():
        if type(ref_value) is torch.Tensor:
            model_args[k] = torch.cat([example[k] for example in examples])
        else:
            model_args[k] = ref_value
    model_args.update(img_args)
    return model_args

from collections import defaultdict

def broadcast_auto(data_dict):
    """Broadcast every tensor entry of ``data_dict`` via ``mpu.broadcast_data``.

    Keys whose value is exactly a ``torch.Tensor`` are grouped by dtype and
    each group is broadcast with one ``mpu.broadcast_data`` call. All other
    entries are copied into the result unchanged, after the tensor entries.

    Args:
        data_dict: Dict mapping key to tensor or arbitrary Python value.

    Returns:
        A new dict with the broadcast tensors and the untouched values.
    """
    keys_by_dtype = defaultdict(list)
    passthrough_keys = []
    for name, item in data_dict.items():
        if type(item) is torch.Tensor:
            keys_by_dtype[item.dtype].append(name)
        else:
            passthrough_keys.append(name)

    merged = {}
    for tensor_dtype, names in keys_by_dtype.items():
        merged.update(mpu.broadcast_data(names, data_dict, tensor_dtype))
    merged.update({name: data_dict[name] for name in passthrough_keys})
    return merged

def get_batch(data_iterator, args, timers):
    """Fetch the next batch, broadcast it and cast non-integer tensors.

    Args:
        data_iterator: Iterator yielding collated batches, or ``None``
            (in which case ``None`` is handed to ``broadcast_auto``).
        args: Namespace with boolean ``fp16`` / ``bf16`` attributes.
        timers: Callable returning a timer object with ``start()``/``stop()``.

    Returns:
        The broadcast batch dict. Tensors whose dtype is neither ``int32`` nor
        ``long`` are converted to half (``args.fp16``) or bfloat16
        (``args.bf16``); with neither flag set they are left unchanged.
    """
    # Only the iterator read is timed under 'data loader'.
    timers('data loader').start()
    data = next(data_iterator) if data_iterator is not None else None
    timers('data loader').stop()

    data_b = broadcast_auto(data)
    for name in list(data_b):
        entry = data_b[name]
        if type(entry) is not torch.Tensor:
            continue
        if entry.dtype is torch.int32 or entry.dtype is torch.long:
            # Integer tensors keep their dtype.
            continue
        if args.fp16:
            data_b[name] = entry.half()
        elif args.bf16:
            data_b[name] = entry.bfloat16()
    return data_b

from torch.nn import CrossEntropyLoss
import numpy as np

from sat.model.mixins import CachedAutoregressiveMixin
from sat.generation.autoregressive_sampling import filling_sequence
from sat.generation.sampling_strategies import BaseStrategy, BeamSearchStrategy


def chat(model, tokenizer, tokens,
         max_length: int = 1800, num_beams=5, top_p=0.95, top_k=0, temperature=0.8, **kwargs):
    """Generate a continuation for the first sequence in ``tokens``.

    The prompt is moved to the device of the model's first parameter, padded
    with ``-1`` up to ``max_length`` and passed to ``filling_sequence``.

    Note: ``num_beams``, ``top_p`` and ``top_k`` are accepted but not used;
    the sampling strategy is built with fixed ``top_p=0.4`` and ``top_k=1``.

    Args:
        model: Model to sample from.
        tokenizer: Provides ``eos_token_id`` as the end token.
        tokens: Prompt token tensor; only index 0 is used.
        max_length: Length of the padded sequence buffer.
        temperature: Forwarded to ``BaseStrategy``.
        **kwargs: Forwarded to ``filling_sequence``; must contain
            ``image_rope_mask``.

    Returns:
        Element 0 of the ``filling_sequence`` result.
    """
    model_device = next(model.parameters()).device
    prompt = tokens.to(model_device)[0]
    # Positions still to be generated are marked with -1.
    padding = torch.tensor([-1] * (max_length - len(prompt)), device=prompt.device)
    seq = torch.cat([prompt, padding], dim=0)

    strategy = BaseStrategy(
        temperature=temperature,
        top_p=0.4,
        top_k=1,
        end_tokens=[tokenizer.eos_token_id],
    )
    # A BeamSearchStrategy(num_beams=num_beams, consider_end=True, ...) variant
    # was left disabled in the original code.
    get_func = llama2_text_processor_inference.get_func(
        None, None, image_rope_mask=kwargs['image_rope_mask'])
    generated = filling_sequence(
        model, seq,
        batch_size=1,
        strategy=strategy,
        get_masks_and_position_ids=get_func,
        **kwargs
    )
    return generated[0]  # drop memory


def forward_step_eval(data_iterator, model, args, timers):
    """Generate an answer for one evaluation batch and score it by exact match.

    The first ``context_length`` tokens of ``input_ids`` are used as the prompt
    for ``chat``; the generated continuation is decoded and compared with the
    decoded labels. Uses the module-level ``tokenizer``.

    Args:
        data_iterator: Iterator forwarded to ``get_batch``.
        model: Model exposing ``add_mixin`` / ``del_mixin``.
        args: Namespace providing ``ignore_pad_token_for_loss`` and ``rank``
            (plus whatever ``get_batch`` reads).
        timers: Callable returning a timer with ``start()`` / ``stop()``.

    Returns:
        ``(placeholder_loss, metrics)``: a zero scalar tensor and a dict with
        ``"acc"`` and ``"acc_w/o_case"`` tensors on the output device.
    """
    def compute_metrics(eval_preds):
        """Decode predictions and labels and return mean exact-match scores."""
        preds, labels, _ = eval_preds
        # Unwrap a tuple before calling tensor methods; checking after
        # unsqueeze (as before) could never succeed.
        if isinstance(preds, tuple):
            preds = preds[0]
        preds = preds.unsqueeze(0)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        if args.ignore_pad_token_for_loss:
            # Replace -100 in the labels as we can't decode them.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        score_dict = {
            "acc": [],
            "acc_w/o_case": [],
        }
        for pred, label in zip(decoded_preds, decoded_labels):
            if args.rank == 0:
                print('pred', pred, 'label', label, flush=True)
            score_dict['acc'].append(1. if pred == label else 0.)
            score_dict['acc_w/o_case'].append(1. if pred.lower() == label.lower() else 0.)

        return {k: float(np.mean(v)) for k, v in score_dict.items()}

    # Get the batch.
    timers('batch generator').start()
    data_b = get_batch(
        data_iterator, args, timers)
    timers('batch generator').stop()

    # Keep only the prompt part of the token sequence and of the masks.
    context_len = int(data_b['context_length'][0])
    tokens = data_b['input_ids'][:, :context_len]
    for mask_key in ('vision_expert_mask', 'image_embed_mask', 'image_rope_mask'):
        data_b[mask_key] = data_b[mask_key][:, :context_len]

    # Everything left in data_b is forwarded to chat() as keyword arguments,
    # so the entries below are removed first.
    data_b.pop('input_ids')
    data_b.pop('attention_mask')
    data_b.pop('position_ids')
    labels = data_b.pop('labels')
    data_b.pop('question_id')

    model.add_mixin('auto-regressive', CachedAutoregressiveMixin())
    try:
        outputs = chat(model, tokenizer, tokens, **data_b)[0][context_len:]
    finally:
        # Always detach the mixin, even if generation raises, so the model is
        # not left with it attached.
        model.del_mixin('auto-regressive')

    metrics = compute_metrics((outputs.cpu(), labels.cpu(), outputs.device))
    return torch.tensor(0, device=outputs.device), {k: torch.tensor(v, device=outputs.device)
                                                    for k, v in metrics.items()}


from torch.nn import CrossEntropyLoss
def forward_step(data_iterator, model, args, timers):
    """Forward step: run the model on one batch and return the LM loss.

    Args:
        data_iterator: Iterator forwarded to ``get_batch``.
        model: Callable model; element 0 of its output is used as the logits.
        args: Namespace forwarded to ``get_batch``.
        timers: Callable returning a timer with ``start()`` / ``stop()``.

    Returns:
        ``(loss, {'loss': loss})`` with the loss as a float32 tensor.
    """
    # Get the batch.
    timers('batch generator').start()
    data_b = get_batch(data_iterator, args, timers)
    labels = data_b.pop('labels')
    timers('batch generator').stop()

    lm_logits = model(**data_b)[0].to(torch.float32)

    # Shift so that tokens < n predict n: drop the first label and take the
    # equally long window of logits that ends one step before the last one.
    targets = labels[..., 1:].contiguous()
    window = targets.size(-1)
    predictions = lm_logits[..., -1 - window:-1, :].contiguous()

    # Flatten the tokens; label -100 is ignored by the criterion.
    criterion = CrossEntropyLoss(ignore_index=-100)
    flat_predictions = predictions.view(-1, predictions.size(-1))
    loss = criterion(flat_predictions, targets.view(-1)).to(torch.float32)

    return loss, {'loss': loss}

from utils.utils import ItemDataset
def create_dataset_function(image_processor, text_processor, path, args):
    """Build an ``ItemDataset`` for ``path`` with the given processors."""
    return ItemDataset(
        image_processor,
        text_processor,
        args,
        path,
    )

if __name__ == '__main__':
    # Script-level options are parsed here; everything this parser does not
    # recognise is forwarded to SAT's get_args().
    py_parser = argparse.ArgumentParser(add_help=False)
    py_parser.add_argument('--max_length', type=int)
    # NOTE: store_false means the attribute defaults to True and passing the
    # flag on the command line sets it to False.
    py_parser.add_argument('--ignore_pad_token_for_loss', action='store_false')
    py_parser.add_argument("--version", type=str, default="chat", help='version to interact with')
    py_parser.add_argument("--from_pretrained", type=str, default="cogvlm-chat", help='pretrained ckpt')
    py_parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    py_parser.add_argument("--vit_checkpoint_activations", action='store_true')
    py_parser = FineTuneTestCogVLMModel.add_model_specific_args(py_parser)
    known, args_list = py_parser.parse_known_args()
    args = get_args(args_list)
    # Merge both namespaces into one; an option name present in both would
    # raise TypeError (duplicate keyword argument).
    args = argparse.Namespace(**vars(args), **vars(known))
    # With QLoRA the model is loaded on CPU and moved to CUDA after loading.
    if args.use_qlora:
        args.device = 'cpu'

    # model_parallel_size is only overridden when it differs from 1.
    model, args = FineTuneTestCogVLMModel.from_pretrained(args.from_pretrained, args, overwrite_args={'model_parallel_size': args.model_parallel_size} if args.model_parallel_size != 1 else {})
    if args.use_qlora and torch.cuda.is_available():
        model = model.to('cuda')
    from utils.utils import llama2_tokenizer
    # `tokenizer` is a module-level name that forward_step_eval reads directly.
    tokenizer = llama2_tokenizer(args.local_tokenizer, signal_type=args.version)
    image_processor = get_image_processor(args.eva_args["image_size"][0])
    text_processor = llama2_text_processor(tokenizer, args.max_length, args.image_length)

    # The already-built model instance is passed as model_cls; the processors
    # are bound into the dataset factory.
    training_main(args, model_cls=model, forward_step_function=forward_step, create_dataset_function=partial(create_dataset_function, image_processor, text_processor), collate_fn=data_collator, forward_step_eval=forward_step_eval)

================================================
FILE: finetune_demo/finetune_cogagent_demo.py
================================================
import os
import torch
import argparse
from functools import partial
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from sat import mpu, get_args, get_tokenizer
from sat.training.deepspeed_training import training_main
from sat.helpers import print_rank0
from utils.models import FineTuneTrainCogAgentModel
from utils.utils import llama2_text_processor, llama2_text_processor_inference, get_image_processor

def disable_untrainable_params(self):
    """Freeze every parameter whose name matches none of the enabled patterns.

    A string pattern matches when it occurs (case-insensitively) in the
    parameter name. A tuple pattern matches when its first two items both
    occur in the name and the integer found just before ``'.mlp'`` lies
    strictly between 45 and 55 (no tuple pattern is in the list built here).

    Matching parameters stay trainable, are logged with ``print_rank0`` and
    counted; those whose name contains ``'encoder'`` or ``'vit'``
    (case-sensitive) additionally get ``lr_scale = 0.1``. All other
    parameters get ``requires_grad_(False)``.
    """
    # enable = ['vit']
    trainable_patterns = ["encoder", "cross_attention", "linear_proj", 'mlp.vision', 'rotary.vision', 'eoi', 'boi', 'vit']
    if self.args.use_ptuning:
        trainable_patterns.append('ptuning')
    if self.args.use_lora or self.args.use_qlora:
        trainable_patterns += ['matrix_A', 'matrix_B']

    def _is_enabled(param_name):
        """Return True if any pattern selects ``param_name``."""
        lowered = param_name.lower()
        for pattern in trainable_patterns:
            if type(pattern) is tuple:
                if pattern[0].lower() not in lowered or pattern[1].lower() not in lowered:
                    continue
                layer_id = int(param_name[:param_name.find('.mlp')].split('.')[-1])
                if 45 < layer_id < 55:
                    return True
            elif pattern.lower() in lowered:
                return True
        return False

    trainable_count = 0
    for param_name, param in self.named_parameters():
        if not _is_enabled(param_name):
            param.requires_grad_(False)
            continue
        trainable_count += param.numel()
        if 'encoder' in param_name or 'vit' in param_name:
            param.lr_scale = 0.1
        print_rank0(param_name)
    print_rank0("***** Total trainable parameters: "+str(trainable_count)+" *****")

# Install the function defined above as the model class's
# disable_untrainable_params method.
FineTuneTrainCogAgentModel.disable_untrainable_params = disable_untrainable_params

def data_collator(examples, cross_image_processor=None):
    """Collate a list of per-sample dicts into a single batch dict.

    Empty samples (``{}``) are dropped first, as the CogVLM collators do;
    previously a single empty sample in the batch raised ``KeyError``.
    Top-level list / ``numpy.ndarray`` values are converted to tensors in
    place. The last remaining sample is the reference for which keys exist.

    For every key of the nested ``'vision'`` dict (and of ``'cross'`` when
    ``cross_image_processor`` is given) the result contains
    ``'<attribute>_<key>'``: the concatenation of all tensor values across the
    samples, or the reference sample's value if no sample holds a tensor.
    Top-level tensor values are concatenated; other values are taken from the
    reference sample.

    Note: the sample dicts are modified in place (values converted,
    ``'vision'`` / ``'cross'`` removed).

    Args:
        examples: List of sample dicts.
        cross_image_processor: When ``None`` the ``'cross'`` entries are not
            collated (they are still removed from the samples).

    Returns:
        The batch dict, or ``{}`` when no non-empty sample is left.
    """
    def to_tensor(value):
        """Converts lists or numpy arrays to tensors."""
        if isinstance(value, list):
            return torch.tensor(value)
        elif isinstance(value, np.ndarray):
            return torch.from_numpy(value)
        return value

    def concatenate_tensors(attribute, key):
        """Concatenates tensors for a specific attribute and key."""
        if attribute is None:
            return torch.cat([ex[key] for ex in examples if isinstance(ex[key], torch.Tensor)])
        else:
            return torch.cat([ex[attribute][key] for ex in examples if isinstance(ex[attribute][key], torch.Tensor)])

    # Drop empty samples so one {} cannot break the whole batch.
    examples = [ex for ex in examples if len(ex) > 0]
    if not examples:
        # Same result the reference-sample logic gave for an all-empty batch.
        return {}

    # Convert all lists and numpy arrays in examples to tensors
    for example in examples:
        for key, value in example.items():
            example[key] = to_tensor(value)

    reference = examples[-1]  # Using the last example as reference

    # Extract and concatenate attributes from examples
    img_args = {}
    for attribute in ['vision', 'cross']:
        if attribute == 'cross' and cross_image_processor is None:
            continue
        if attribute not in reference:
            continue
        for key in reference[attribute]:
            tensor_key = f"{attribute}_{key}"
            tensors_to_concatenate = [ex[attribute][key] for ex in examples
                                      if isinstance(ex[attribute][key], torch.Tensor)]
            if tensors_to_concatenate:
                # Reuse the list built above instead of collecting it again.
                img_args[tensor_key] = torch.cat(tensors_to_concatenate)
            else:
                img_args[tensor_key] = reference[attribute][key]

    # Remove 'vision' and 'cross' keys from examples
    for example in examples:
        example.pop('vision', None)
        example.pop('cross', None)

    # Create model_args by concatenating tensors and copying other attributes
    model_args = {key: concatenate_tensors(None, key)
                  if isinstance(reference[key], torch.Tensor) else reference[key]
                  for key in reference
                  }

    # Merge img_args into model_args
    model_args.update(img_args)
    return model_args


from collections import defaultdict

def broadcast_auto(data_dict):
    """Broadcast every tensor entry of ``data_dict`` via ``mpu.broadcast_data``.

    Keys whose value is exactly a ``torch.Tensor`` are grouped by dtype and
    each group is broadcast with one ``mpu.broadcast_data`` call. All other
    entries are copied into the result unchanged, after the tensor entries.

    Args:
        data_dict: Dict mapping key to tensor or arbitrary Python value.

    Returns:
        A new dict with the broadcast tensors and the untouched values.
    """
    keys_by_dtype = defaultdict(list)
    passthrough_keys = []
    for name, item in data_dict.items():
        if type(item) is torch.Tensor:
            keys_by_dtype[item.dtype].append(name)
        else:
            passthrough_keys.append(name)

    merged = {}
    for tensor_dtype, names in keys_by_dtype.items():
        merged.update(mpu.broadcast_data(names, data_dict, tensor_dtype))
    merged.update({name: data_dict[name] for name in passthrough_keys})
    return merged

def get_batch(data_iterator, args, timers):
    """Fetch the next batch, broadcast it and cast non-integer tensors.

    Args:
        data_iterator: Iterator yielding collated batches, or ``None``
            (in which case ``None`` is handed to ``broadcast_auto``).
        args: Namespace with boolean ``fp16`` / ``bf16`` attributes.
        timers: Callable returning a timer object with ``start()``/``stop()``.

    Returns:
        The broadcast batch dict. Tensors whose dtype is neither ``int32`` nor
        ``long`` are converted to half (``args.fp16``) or bfloat16
        (``args.bf16``); with neither flag set they are left unchanged.
    """
    # Only the iterator read is timed under 'data loader'.
    timers('data loader').start()
    data = next(data_iterator) if data_iterator is not None else None
    timers('data loader').stop()

    data_b = broadcast_auto(data)
    for name in list(data_b):
        entry = data_b[name]
        if type(entry) is not torch.Tensor:
            continue
        if entry.dtype is torch.int32 or entry.dtype is torch.long:
            # Integer tensors keep their dtype.
            continue
        if args.fp16:
            data_b[name] = entry.half()
        elif args.bf16:
            data_b[name] = entry.bfloat16()
    return data_b

from torch.nn import CrossEntropyLoss
import numpy as np

from sat.model.mixins import CachedAutoregressiveMixin
from sat.generation.autoregressive_sampling import filling_sequence
from sat.generation.sampling_strategies import BaseStrategy, BeamSearchStrategy


def chat(model, tokenizer, tokens,
         max_length: int = 1800, num_beams=5, top_p=0.95, top_k=0, temperature=0.8, **kwargs):
    """Generate a continuation for the first sequence in ``tokens``.

    The prompt is moved to the device of the model's first parameter, padded
    with ``-1`` up to ``max_length`` and passed to ``filling_sequence``.

    Note: ``num_beams``, ``top_p`` and ``top_k`` are accepted but not used;
    the sampling strategy is built with fixed ``top_p=0.4`` and ``top_k=1``.

    Args:
        model: Model to sample from.
        tokenizer: Provides ``eos_token_id`` as the end token.
        tokens: Prompt token tensor; only index 0 is used.
        max_length: Length of the padded sequence buffer.
        temperature: Forwarded to ``BaseStrategy``.
        **kwargs: Forwarded to ``filling_sequence``; must contain
            ``image_rope_mask``.

    Returns:
        Element 0 of the ``filling_sequence`` result.
    """
    model_device = next(model.parameters()).device
    prompt = tokens.to(model_device)[0]
    # Positions still to be generated are marked with -1.
    padding = torch.tensor([-1] * (max_length - len(prompt)), device=prompt.device)
    seq = torch.cat([prompt, padding], dim=0)

    strategy = BaseStrategy(
        temperature=temperature,
        top_p=0.4,
        top_k=1,
        end_tokens=[tokenizer.eos_token_id],
    )
    # A BeamSearchStrategy(num_beams=num_beams, consider_end=True, ...) variant
    # was left disabled in the original code.
    get_func = llama2_text_processor_inference.get_func(
        None, None, image_rope_mask=kwargs['image_rope_mask'])
    generated = filling_sequence(
        model, seq,
        batch_size=1,
        strategy=strategy,
        get_masks_and_position_ids=get_func,
        **kwargs
    )
    return generated[0]  # drop memory


def forward_step_eval(data_iterator, model, args, timers):
    """Generate an answer for one evaluation batch and score it by exact match.

    The first ``context_length`` tokens of ``input_ids`` are used as the prompt
    for ``chat``; the generated continuation is decoded and compared with the
    decoded labels. Uses the module-level ``tokenizer``.

    Args:
        data_iterator: Iterator forwarded to ``get_batch``.
        model: Model exposing ``add_mixin`` / ``del_mixin``.
        args: Namespace providing ``ignore_pad_token_for_loss`` and ``rank``
            (plus whatever ``get_batch`` reads).
        timers: Callable returning a timer with ``start()`` / ``stop()``.

    Returns:
        ``(placeholder_loss, metrics)``: a zero scalar tensor and a dict with
        ``"acc"`` and ``"acc_w/o_case"`` tensors on the output device.
    """
    def compute_metrics(eval_preds):
        """Decode predictions and labels and return mean exact-match scores."""
        preds, labels, _ = eval_preds
        # Unwrap a tuple before calling tensor methods; checking after
        # unsqueeze (as before) could never succeed.
        if isinstance(preds, tuple):
            preds = preds[0]
        preds = preds.unsqueeze(0)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        if args.ignore_pad_token_for_loss:
            # Replace -100 in the labels as we can't decode them.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        score_dict = {
            "acc": [],
            "acc_w/o_case": [],
        }
        for pred, label in zip(decoded_preds, decoded_labels):
            if args.rank == 0:
                print('pred', pred, 'label', label, flush=True)
            score_dict['acc'].append(1. if pred == label else 0.)
            score_dict['acc_w/o_case'].append(1. if pred.lower() == label.lower() else 0.)

        return {k: float(np.mean(v)) for k, v in score_dict.items()}

    # Get the batch.
    timers('batch generator').start()
    data_b = get_batch(
        data_iterator, args, timers)
    timers('batch generator').stop()

    # Keep only the prompt part of the token sequence and of the masks.
    context_len = int(data_b['context_length'][0])
    tokens = data_b['input_ids'][:, :context_len]
    for mask_key in ('vision_expert_mask', 'image_embed_mask', 'image_rope_mask'):
        data_b[mask_key] = data_b[mask_key][:, :context_len]

    # Everything left in data_b is forwarded to chat() as keyword arguments,
    # so the entries below are removed first.
    data_b.pop('input_ids')
    data_b.pop('attention_mask')
    data_b.pop('position_ids')
    labels = data_b.pop('labels')
    data_b.pop('question_id')

    model.add_mixin('auto-regressive', CachedAutoregressiveMixin())
    try:
        outputs = chat(model, tokenizer, tokens, **data_b)[0][context_len:]
    finally:
        # Always detach the mixin, even if generation raises, so the model is
        # not left with it attached.
        model.del_mixin('auto-regressive')

    metrics = compute_metrics((outputs.cpu(), labels.cpu(), outputs.device))
    return torch.tensor(0, device=outputs.device), {k: torch.tensor(v, device=outputs.device)
                                                    for k, v in metrics.items()}


from torch.nn import CrossEntropyLoss
def forward_step(data_iterator, model, args, timers):
    """Forward step: run the model on one batch and return the LM loss.

    Args:
        data_iterator: Iterator forwarded to ``get_batch``.
        model: Callable model; element 0 of its output is used as the logits.
        args: Namespace forwarded to ``get_batch``.
        timers: Callable returning a timer with ``start()`` / ``stop()``.

    Returns:
        ``(loss, {'loss': loss})`` with the loss as a float32 tensor.
    """
    # Get the batch.
    timers('batch generator').start()
    data_b = get_batch(data_iterator, args, timers)
    labels = data_b.pop('labels')
    timers('batch generator').stop()

    lm_logits = model(**data_b)[0].to(torch.float32)

    # Shift so that tokens < n predict n: drop the first label and take the
    # equally long window of logits that ends one step before the last one.
    targets = labels[..., 1:].contiguous()
    window = targets.size(-1)
    predictions = lm_logits[..., -1 - window:-1, :].contiguous()

    # Flatten the tokens; label -100 is ignored by the criterion.
    criterion = CrossEntropyLoss(ignore_index=-100)
    flat_predictions = predictions.view(-1, predictions.size(-1))
    loss = criterion(flat_predictions, targets.view(-1)).to(torch.float32)

    return loss, {'loss': loss}

from utils.utils import ItemDataset
def create_dataset_function(image_processor, text_processor, cross_image_processor, path, args):
    """Build an ``ItemDataset`` for ``path`` with the given processors."""
    return ItemDataset(
        image_processor,
        text_processor,
        args,
        path,
        cross_image_processor=cross_image_processor,
    )

from sat.model.finetune.lora2 import LoraMixin
from sat.model.finetune.prompt_tuning import PTuningV2Mixin

if __name__ == '__main__':
    # Script-level options are parsed here; everything this parser does not
    # recognise is forwarded to SAT's get_args().
    py_parser = argparse.ArgumentParser(add_help=False)
    py_parser.add_argument('--max_length', type=int)
    # NOTE: store_false means the attribute defaults to True and passing the
    # flag on the command line sets it to False.
    py_parser.add_argument('--ignore_pad_token_for_loss', action='store_false')
    py_parser.add_argument("--version", type=str, default="chat", choices=["chat", "vqa"], help='version to interact with')
    py_parser.add_argument("--from_pretrained", type=str, default="cogagent-chat", help='pretrained ckpt')
    py_parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    py_parser.add_argument("--vit_checkpoint_activations", action='store_true')
    py_parser = FineTuneTrainCogAgentModel.add_model_specific_args(py_parser)
    known, args_list = py_parser.parse_known_args()
    args = get_args(args_list)
    # Merge both namespaces into one; an option name present in both would
    # raise TypeError (duplicate keyword argument).
    args = argparse.Namespace(**vars(args), **vars(known))
    # With QLoRA the model is loaded on CPU and moved to CUDA further below.
    if args.use_qlora:
        args.device = 'cpu'

    # model_parallel_size is only overridden when it differs from 1.
    model, args = FineTuneTrainCogAgentModel.from_pretrained(args.from_pretrained, args, overwrite_args={'model_parallel_size': args.model_parallel_size} if args.model_parallel_size != 1 else {})
    if args.use_ptuning: # TODO: wait for SAT updating
        model.add_mixin("ptuning", PTuningV2Mixin(args.num_layers, args.hidden_size // args.num_attention_heads, args.num_attention_heads, args.pre_seq_len))

    # LoRA adds adapters to both the language model and the "eva" mixin's
    # vit_model; QLoRA adds them to the language model only.
    if args.use_lora:
        model.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range), reinit=True)
        model.get_mixin("eva").vit_model.add_mixin("lora", LoraMixin(args.eva_args['num_layers'], args.lora_rank, layer_range=args.layer_range), reinit=True)
    elif args.use_qlora:
        model.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range, qlora=True), reinit=True)
        
    if args.use_qlora and torch.cuda.is_available():
        model = model.to('cuda')
    from utils.utils import llama2_tokenizer
    # `tokenizer` is a module-level name that forward_step_eval reads directly.
    tokenizer = llama2_tokenizer(args.local_tokenizer, signal_type=args.version)
    image_processor = get_image_processor(args.eva_args["image_size"][0])
    cross_image_processor = get_image_processor(args.cross_image_pix)
    text_processor = llama2_text_processor(tokenizer, args.max_length, args.image_length)

    # The already-built model instance is passed as model_cls; the processors
    # are bound into the dataset factory and the collate function.
    model = training_main(args, model_cls=model, forward_step_function=forward_step, create_dataset_function=partial(create_dataset_function, image_processor, text_processor, cross_image_processor), collate_fn=partial(data_collator, cross_image_processor=cross_image_processor), forward_step_eval=forward_step_eval)
    # Only the plain-LoRA path merges the adapters and writes an extra merged
    # checkpoint; use_lora is cleared and args.save redirected before saving.
    if args.use_lora:
        model.get_mixin("lora").merge_lora()
        model.get_mixin("eva").vit_model.get_mixin("lora").merge_lora()
        args.use_lora = False
        args.save = "checkpoints/merged_lora_cogagent"
        from sat.training.model_io import save_checkpoint
        save_checkpoint(1, model, None, None, args)

================================================
FILE: finetune_demo/finetune_cogagent_lora.sh
================================================
#! /bin/bash
# Fine-tuning launcher: assembles the command-line options below and runs
# finetune_cogagent_demo.py through deepspeed.
# The script, config and data paths are relative, so this is presumably meant
# to be started from the finetune_demo/ directory (TODO confirm).
# export PATH=/usr/local/cuda/bin:$PATH
# export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# GPUs per node (passed to the job as LOCAL_WORLD_SIZE) and model-parallel size.
NUM_GPUS_PER_WORKER=8
MP_SIZE=1

# NOTE: script_dir and main_dir are computed but not referenced again below.
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
main_dir=$(dirname $script_dir)
# MODEL_TYPE is used both as the --from_pretrained value and in the
# experiment name.
MODEL_TYPE="cogagent-chat"
VERSION="chat"
MODEL_ARGS="--from_pretrained $MODEL_TYPE \
    --max_length 400 \
    --lora_rank 50 \
    --use_lora \
    --local_tokenizer lmsys/vicuna-7b-v1.5 \
    --version $VERSION"
# TIPS: max_length includes the low-resolution image sequence (which has 256 tokens)

# Environment variables that are prefixed to the deepspeed command line.
OPTIONS_SAT="SAT_HOME=~/.sat_models"
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2 LOCAL_WORLD_SIZE=$NUM_GPUS_PER_WORKER"
HOST_FILE_PATH="hostfile"

train_data="./archive_split/train"
valid_data="./archive_split/valid"

# Training options: 2000 iterations, with the save and eval intervals both
# set to 200.
gpt_options=" \
       --experiment-name finetune-$MODEL_TYPE \
       --model-parallel-size ${MP_SIZE} \
       --mode finetune \
       --train-iters 2000 \
       --resume-dataloader \
       $MODEL_ARGS \
       --train-data ${train_data} \
       --valid-data ${valid_data} \
       --distributed-backend nccl \
       --lr-decay-style cosine \
       --warmup .02 \
       --checkpoint-activations \
       --vit_checkpoint_activations \
       --save-interval 200 \
       --eval-interval 200 \
       --save "./checkpoints" \
       --eval-iters 10 \
       --eval-batch-size 1 \
       --split 1. \
       --deepspeed_config test_config_bf16.json \
       --skip-init \
       --seed 2023
"

              
# Build the full command as a string, print it, then execute it with eval so
# the leading VAR=value assignments apply to the deepspeed process.
run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --hostfile ${HOST_FILE_PATH} finetune_cogagent_demo.py ${gpt_options}"
echo ${run_cmd}
eval ${run_cmd}

# Disables xtrace; note that this script never enables it.
set +x

================================================
FILE: finetune_demo/finetune_cogvlm_demo.py
================================================
import os
import torch
import argparse
from functools import partial
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from sat import mpu, get_args, get_tokenizer
from sat.training.deepspeed_training import training_main
from sat.helpers import print_rank0
from utils.models import FineTuneTrainCogVLMModel
from utils.utils import llama2_text_processor, llama2_text_processor_inference, get_image_processor

def disable_untrainable_params(self):
    """Freeze every parameter that is not selected for fine-tuning.

    A parameter stays trainable when its name matches one of the enabled patterns:

    * names containing both ``mlp`` and ``vit`` (case-insensitive) whose path
      component right before ``.mlp`` is an integer strictly between 45 and 55;
    * names containing ``ptuning`` when ``self.args.use_ptuning`` is set;
    * names containing ``matrix_A`` or ``matrix_B`` when ``self.args.use_lora``
      or ``self.args.use_qlora`` is set.

    All other parameters get ``requires_grad_(False)``. The names of the
    trainable parameters and their total element count are printed on rank 0.
    """
    total_trainable = 0
    enable = [('mlp', 'vit')]
    if self.args.use_ptuning:
        enable.append('ptuning')
    if self.args.use_lora or self.args.use_qlora:
        enable.extend(['matrix_A', 'matrix_B'])
    for n, p in self.named_parameters():
        lowered = n.lower()
        flag = False
        for e in enable:
            if type(e) is tuple:
                if e[0].lower() in lowered and e[1].lower() in lowered:
                    # The layer index is the path component immediately before ".mlp".
                    # A name without ".mlp" or without a numeric component there is
                    # treated as "no match" instead of raising ValueError.
                    mlp_pos = n.find('.mlp')
                    layer_token = n[:mlp_pos].split('.')[-1] if mlp_pos != -1 else ''
                    try:
                        layer_idx = int(layer_token)
                    except ValueError:
                        layer_idx = None
                    if layer_idx is not None and 55 > layer_idx > 45:
                        flag = True
                        break
            elif e.lower() in lowered:
                flag = True
                break
        if not flag:
            p.requires_grad_(False)
        else:
            total_trainable += p.numel()
            print_rank0(n)
    print_rank0("***** Total trainable parameters: "+str(total_trainable)+" *****")

# Monkey-patch: install the function defined above as the model class's
# disable_untrainable_params method.
FineTuneTrainCogVLMModel.disable_untrainable_params = disable_untrainable_params

def data_collator(examples):
    """Merge a list of per-sample dicts into one flat batch dict.

    Empty dicts are dropped. Top-level list and ``numpy.ndarray`` values are
    converted to tensors in place. Tensor values are concatenated across
    samples with ``torch.cat``; every non-tensor value is taken from the first
    sample. Entries of the nested ``'vision'`` dict are emitted with a
    ``'vision_'`` key prefix, and the ``'vision'`` / ``'cross'`` entries are
    removed from the samples before the remaining keys are merged.

    Args:
        examples: list of dicts, each containing a ``'vision'`` dict.

    Returns:
        dict with the merged top-level keys plus the ``vision_*`` keys.

    Raises:
        IndexError: if no non-empty example remains.
    """
    examples = [ex for ex in examples if len(ex) > 0] # drop {}
    for example in examples:
        for k in example:
            if isinstance(example[k], list):
                example[k] = torch.tensor(example[k])
            elif isinstance(example[k], np.ndarray):
                example[k] = torch.from_numpy(example[k])
    img_args = {}
    tmp_example = examples[0]
    for k in tmp_example['vision']:
        if type(tmp_example['vision'][k]) is torch.Tensor:
            img_args['vision_'+k] = torch.cat([example['vision'][k] for example in examples])
        else:
            # Take the value from the first sample, exactly like the non-vision keys
            # below, rather than from a loop variable left over from an earlier loop.
            img_args['vision_'+k] = tmp_example['vision'][k]
    for example in examples:
        example.pop('vision')
        if 'cross' in example:
            example.pop('cross')

    model_args = {}
    tmp_example = examples[0]
    for k in tmp_example:
        if type(tmp_example[k]) is torch.Tensor:
            model_args[k] = torch.cat([example[k] for example in examples])
        else:
            model_args[k] = tmp_example[k]
    model_args.update(img_args)
    return model_args

from collections import defaultdict

def broadcast_auto(data_dict):
    """Broadcast the tensor entries of ``data_dict`` and pass the rest through.

    Tensor keys are grouped by dtype and each group is sent through
    ``mpu.broadcast_data`` in one call; non-tensor entries are copied into the
    result unchanged, after the broadcast tensors.
    """
    keys_by_dtype = defaultdict(list)
    passthrough_keys = []
    for name in data_dict:
        value = data_dict[name]
        if type(value) is torch.Tensor:
            keys_by_dtype[value.dtype].append(name)
        else:
            passthrough_keys.append(name)

    result = {}
    for dtype, names in keys_by_dtype.items():
        result.update(mpu.broadcast_data(names, data_dict, dtype))
    for name in passthrough_keys:
        result[name] = data_dict[name]
    return result

def get_batch(data_iterator, args, timers):
    """Fetch the next batch, broadcast it, and cast non-integer tensors.

    The batch is read from ``data_iterator`` (or is ``None`` when the iterator
    is ``None``) and handed to ``broadcast_auto``. Tensors whose dtype is
    neither ``torch.int32`` nor ``torch.long`` are converted to half precision
    when ``args.fp16`` is set, otherwise to bfloat16 when ``args.bf16`` is set.
    """
    timers('data loader').start()
    data = next(data_iterator) if data_iterator is not None else None
    timers('data loader').stop()

    data_b = broadcast_auto(data)
    integer_dtypes = (torch.int32, torch.long)
    for key in data_b:
        value = data_b[key]
        if type(value) is not torch.Tensor or value.dtype in integer_dtypes:
            continue
        if args.fp16:
            data_b[key] = value.half()
        elif args.bf16:
            data_b[key] = value.bfloat16()
    return data_b

from torch.nn import CrossEntropyLoss
import numpy as np

from sat.model.mixins import CachedAutoregressiveMixin
from sat.generation.autoregressive_sampling import filling_sequence
from sat.generation.sampling_strategies import BaseStrategy, BeamSearchStrategy


def chat(model, tokenizer, tokens,
         max_length: int = 1800, num_beams=5, top_p=0.95, top_k=0, temperature=0.8, **kwargs):
    """Generate a continuation for the first sequence in ``tokens``.

    The prompt is moved to the model's device, padded with ``-1`` up to
    ``max_length`` and passed to ``filling_sequence`` with a ``BaseStrategy``.
    Only ``temperature`` is taken from the arguments: the strategy is built
    with the literals ``top_p=0.4`` and ``top_k=1``, and ``num_beams`` is not
    used. ``kwargs`` must contain ``image_rope_mask`` and is forwarded to
    ``filling_sequence``.

    Returns:
        The first element of the ``filling_sequence`` result.
    """
    device = next(model.parameters()).device
    prompt = tokens.to(device)[0]
    padding = torch.tensor([-1] * (max_length - len(prompt)), device=prompt.device)
    seq = torch.cat([prompt, padding], dim=0)

    strategy = BaseStrategy(temperature=temperature, top_p=0.4, top_k=1, end_tokens=[tokenizer.eos_token_id])
    get_func = llama2_text_processor_inference.get_func(None, None, image_rope_mask=kwargs['image_rope_mask'])
    result = filling_sequence(
        model, seq,
        batch_size=1,
        strategy=strategy,
        get_masks_and_position_ids=get_func,
        **kwargs
    )
    # Keep only the first returned item (the original comment calls the rest "memory").
    return result[0]


def forward_step_eval(data_iterator, model, args, timers):
    """Evaluation step: generate an answer for one batch and score it.

    The prompt part of the batch (first ``context_length`` tokens) is fed to
    ``chat``; the generated tokens after the prompt are decoded and compared
    with the decoded labels. Relies on the module-level ``tokenizer``.

    Returns:
        A zero loss tensor and a dict with ``acc`` (exact match) and
        ``acc_w/o_case`` (case-insensitive match) as tensors.
    """
    def compute_metrics(eval_preds):
        # eval_preds is (predicted token ids, label ids, device).
        preds, labels, device = eval_preds
        preds = preds.unsqueeze(0)
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        if args.ignore_pad_token_for_loss:
            # -100 cannot be decoded, so swap it for the pad token id.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        exact_hits = []
        caseless_hits = []
        for pred, label in zip(decoded_preds, decoded_labels):
            if args.rank == 0:
                print('pred', pred, 'label', label, flush=True)
            exact_hits.append(1. if pred == label else 0.)
            caseless_hits.append(1. if pred.lower() == label.lower() else 0.)

        return {
            "acc": float(np.mean(exact_hits)),
            "acc_w/o_case": float(np.mean(caseless_hits)),
        }

    # Get the batch.
    timers('batch generator').start()
    data_b = get_batch(data_iterator, args, timers)
    timers('batch generator').stop()

    # Truncate the prompt and its per-token masks to the context length.
    context_len = int(data_b['context_length'][0])
    tokens = data_b['input_ids'][:, :context_len]
    for mask_key in ('vision_expert_mask', 'image_embed_mask', 'image_rope_mask'):
        data_b[mask_key] = data_b[mask_key][:, :context_len]

    # Remove the entries that must not be forwarded to chat() as keyword arguments.
    for dropped_key in ('input_ids', 'attention_mask', 'position_ids'):
        data_b.pop(dropped_key)
    labels = data_b.pop('labels')
    data_b.pop('question_id')

    # The caching mixin is attached only for the duration of generation.
    model.add_mixin('auto-regressive', CachedAutoregressiveMixin())
    outputs = chat(model, tokenizer, tokens, **data_b)[0][context_len:]
    model.del_mixin('auto-regressive')

    device = outputs.device
    metrics = compute_metrics((outputs.cpu(), labels.cpu(), device))
    return torch.tensor(0, device=device), {k: torch.tensor(v, device=device) for k, v in metrics.items()}


from torch.nn import CrossEntropyLoss
def forward_step(data_iterator, model, args, timers):
    """Training step: run the model on one batch and return the LM loss.

    Labels are shifted by one position so that each position predicts the
    next token; the logits are sliced from the end of the sequence to line up
    with the shifted labels. Label value ``-100`` is ignored by the loss.

    Returns:
        ``(loss, {'loss': loss})`` with the loss in float32.
    """
    # Get the batch.
    timers('batch generator').start()
    data_b = get_batch(data_iterator, args, timers)
    labels = data_b.pop('labels')
    timers('batch generator').stop()

    lm_logits = model(**data_b)[0].to(torch.float32)

    # Shift so that tokens < n predict n
    shift_labels = labels[..., 1:].contiguous()
    num_targets = shift_labels.size(-1)
    shift_logits = lm_logits[..., -1-num_targets:-1, :].contiguous()

    # Flatten the tokens
    vocab_size = shift_logits.size(-1)
    criterion = CrossEntropyLoss(ignore_index=-100)
    loss = criterion(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
    loss = loss.to(torch.float32)

    return loss, {'loss': loss}

from utils.utils import ItemDataset
def create_dataset_function(image_processor, text_processor, path, args):
    """Build an ``ItemDataset`` for ``path`` using the given processors and args."""
    return ItemDataset(image_processor, text_processor, args, path)

from sat.model.finetune.lora2 import LoraMixin
from sat.model.finetune.prompt_tuning import PTuningV2Mixin

if __name__ == '__main__':
    # Script-specific options are parsed here; arguments this parser does not
    # recognise are handed on to SAT's get_args().
    py_parser = argparse.ArgumentParser(add_help=False)
    py_parser.add_argument('--max_length', type=int)
    # store_false: the value is True unless the flag is given on the command line.
    py_parser.add_argument('--ignore_pad_token_for_loss', action='store_false')
    py_parser.add_argument("--version", type=str, default="chat_old", help='version to interact with')
    py_parser.add_argument("--from_pretrained", type=str, default="cogvlm-chat", help='pretrained ckpt')
    py_parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
    py_parser.add_argument("--vit_checkpoint_activations", action='store_true')
    py_parser = FineTuneTrainCogVLMModel.add_model_specific_args(py_parser)
    known, args_list = py_parser.parse_known_args()
    args = get_args(args_list)
    # Merge both namespaces into one; a key present in both would raise TypeError here.
    args = argparse.Namespace(**vars(args), **vars(known))
    if args.use_qlora:
        # With QLoRA the model is loaded on CPU first and moved to CUDA further below.
        args.device = 'cpu'

    # model_parallel_size is only overridden when it differs from 1.
    model, args = FineTuneTrainCogVLMModel.from_pretrained(args.from_pretrained, args, overwrite_args={'model_parallel_size': args.model_parallel_size} if args.model_parallel_size != 1 else {})
    if args.use_ptuning:
        model.add_mixin("ptuning", PTuningV2Mixin(args.num_layers, args.hidden_size // args.num_attention_heads, args.num_attention_heads, args.pre_seq_len))
    if args.use_lora:
        # LoRA is added to the language model and to the ViT held by the "eva" mixin.
        model.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range), reinit=True)
        model.get_mixin("eva").vit_model.add_mixin("lora", LoraMixin(args.eva_args['num_layers'], args.lora_rank, layer_range=args.layer_range), reinit=True)
    elif args.use_qlora:
        # In the QLoRA branch only the language model receives a LoRA mixin.
        model.add_mixin("lora", LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range, qlora=True), reinit=True)
        
    if args.use_qlora and torch.cuda.is_available():
        model = model.to('cuda')
    from utils.utils import llama2_tokenizer
    # `tokenizer` is a module-level name; forward_step_eval reads it as a global.
    tokenizer = llama2_tokenizer(args.local_tokenizer, signal_type=args.version)
    image_processor = get_image_processor(args.eva_args["image_size"][0])
    text_processor = llama2_text_processor(tokenizer, args.max_length, args.image_length)

    model = training_main(args, model_cls=model, forward_step_function=forward_step, create_dataset_function=partial(create_dataset_function, image_processor, text_processor), collate_fn=data_collator, forward_step_eval=forward_step_eval)
    if args.use_lora:
        # After training, merge the LoRA weights (language model and ViT) and save the
        # merged model to a separate directory named after the image size.
        model.get_mixin("lora").merge_lora()
        model.get_mixin("eva").vit_model.get_mixin("lora").merge_lora()
        args.use_lora = False
        args.save = "checkpoints/merged_lora_cogvlm{}".format(args.eva_args["image_size"][0])
        from sat.training.model_io import save_checkpoint
        save_checkpoint(1, model, None, None, args)

================================================
FILE: finetune_demo/finetune_cogvlm_lora.sh
================================================
#! /bin/bash
# Launches finetune_cogvlm_demo.py through the deepspeed launcher with LoRA enabled.
# The file names used below (hostfile, test_config_bf16.json, finetune_cogvlm_demo.py,
# ./archive_split, ./checkpoints) are relative, so they resolve against the current
# working directory.
# export PATH=/usr/local/cuda/bin:$PATH
# export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Exported to the run as LOCAL_WORLD_SIZE (see OPTIONS_NCCL).
NUM_GPUS_PER_WORKER=8
# Passed as --model-parallel-size.
MP_SIZE=1

# NOTE(review): script_path, script_dir and main_dir are computed here but never
# referenced later in this script.
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
main_dir=$(dirname $script_dir)
MODEL_TYPE="cogvlm-base-490"
VERSION="base"
MODEL_ARGS="--from_pretrained $MODEL_TYPE \
    --max_length 1288 \
    --lora_rank 10 \
    --use_lora \
    --local_tokenizer lmsys/vicuna-7b-v1.5 \
    --version $VERSION"
# Tip: when training models of resolution 224, you can set --max_length to a smaller value.

# Environment assignments prepended to the deepspeed command line.
OPTIONS_SAT="SAT_HOME=~/.sat_models"
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2 LOCAL_WORLD_SIZE=$NUM_GPUS_PER_WORKER"
HOST_FILE_PATH="hostfile"

train_data="./archive_split/train"
valid_data="./archive_split/valid"

# Training options forwarded to finetune_cogvlm_demo.py; MODEL_ARGS is spliced in.
gpt_options=" \
       --experiment-name finetune-$MODEL_TYPE \
       --model-parallel-size ${MP_SIZE} \
       --mode finetune \
       --train-iters 800 \
       --resume-dataloader \
       $MODEL_ARGS \
       --train-data ${train_data} \
       --valid-data ${valid_data} \
       --distributed-backend nccl \
       --lr-decay-style cosine \
       --warmup .02 \
       --checkpoint-activations \
       --vit_checkpoint_activations \
       --save-interval 200 \
       --eval-interval 200 \
       --save "./checkpoints" \
       --eval-iters 10 \
       --eval-batch-size 1 \
       --split 1. \
       --deepspeed_config test_config_bf16.json \
       --skip-init \
       --seed 2023
"

              
# The command is assembled as one string, echoed for logging, then executed with eval so
# the leading VAR=value pairs are interpreted as environment assignments.
run_cmd="${OPTIONS_NCCL} ${OPTIONS_SAT} deepspeed --master_port 16666 --hostfile ${HOST_FILE_PATH} finetune_cogvlm_demo.py ${gpt_options}"
echo ${run_cmd}
eval ${run_cmd}

# Turns shell tracing off; NOTE(review): no matching `set -x` appears earlier in this script.
set +x

================================================
FILE: finetune_demo/test_config_bf16.json
================================================
{
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
    "gradient_clipping": 0.1,
    "zero_optimization": {
      "stage": 2,
      "contiguous_gradients": false,
      "overlap_comm": true,
      "reduce_scatter": true,
      "reduce_bucket_size": 4e7,
      "allgather_bucket_size": 1e8,
      "load_from_fp32_weights": false
    },
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "zero_allow_untested_optimizer": true,
    "bf16": {
      "enabled": true
    },
    "optimizer": {
      "type": "Adam",
      "params": {
        "lr": 0.00001,
        "betas": [
          0.9,
          0.95
        ],
        "eps": 1e-8,
        "weight_decay": 5e-2
      }
    },
    "activation_checkpointing": {
      "partition_activations": false,
      "contiguous_memory_optimization": false,
      "cpu_checkpointing": false
    },
    "wall_clock_breakdown": false
  }
  

================================================
FILE: openai_demo/openai_api.py
================================================
import os
import gc
import time
import base64

from contextlib import asynccontextmanager
from typing import List, Literal, Union, Tuple, Optional
import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoModelForCausalLM, LlamaTokenizer, PreTrainedModel, PreTrainedTokenizer, \
    TextIteratorStreamer
from PIL import Image
from io import BytesIO

# Runtime configuration, overridable through environment variables.
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/cogvlm-chat-hf')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", 'lmsys/vicuna-7b-v1.5')
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# QUANT_ENABLED selects 4-bit loading on CUDA:
#   * any non-empty QUANT_ENABLED environment variable forces it on;
#   * otherwise, on CUDA, it is enabled when the device has less than 40 GiB in total;
#   * without CUDA the memory query is skipped (it would raise) and it stays off.
if os.environ.get('QUANT_ENABLED'):
    QUANT_ENABLED = True
elif DEVICE == 'cuda':
    with torch.cuda.device(DEVICE):
        __, total_bytes = torch.cuda.mem_get_info()
        total_gb = total_bytes / (1 << 30)
        QUANT_ENABLED = total_gb < 40
else:
    QUANT_ENABLED = False

@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Lifespan handler for the FastAPI app.
    Nothing is done on startup; once the app has finished, cached CUDA memory is released
    when CUDA is available.
    """
    yield
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


# Application instance; `lifespan` releases cached CUDA memory on shutdown.
app = FastAPI(lifespan=lifespan)

# CORS is fully open: every origin, method and header is allowed, with credentials.
# NOTE(review): acceptable for a local demo; restrict allow_origins before exposing publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    """
    A Pydantic model representing a model card, which provides metadata about a machine learning model.
    It includes fields like model ID, owner, and creation time.
    """
    # Model identifier; the only required field.
    id: str
    object: str = "model"
    # Filled with the current Unix time (seconds) when the card is created.
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    """Response body of ``GET /v1/models``: a list of model cards."""
    object: str = "list"
    data: List[ModelCard] = []


class ImageUrl(BaseModel):
    """Wrapper for the URL of an image attached to a message."""
    # process_history_and_images only decodes base64 JPEG data URLs from this field.
    url: str


class TextContent(BaseModel):
    """Text part of a multi-part message content list."""
    type: Literal["text"]
    text: str


class ImageUrlContent(BaseModel):
    """Image part of a multi-part message content list."""
    type: Literal["image_url"]
    image_url: ImageUrl


# One element of a multi-part message: either a text part or an image part.
ContentItem = Union[TextContent, ImageUrlContent]


class ChatMessageInput(BaseModel):
    """A single incoming chat message."""
    # NOTE(review): "system" validates here, but process_history_and_images rejects any
    # role other than "user" and "assistant".
    role: Literal["user", "assistant", "system"]
    # Either plain text or a list of text / image parts.
    content: Union[str, List[ContentItem]]
    name: Optional[str] = None


class ChatMessageResponse(BaseModel):
    """Assistant message returned in a non-streaming completion."""
    role: Literal["assistant"]
    # NOTE(review): annotated as `str` but defaults to None; Optional[str] may be intended.
    content: str = None
    name: Optional[str] = None


class DeltaMessage(BaseModel):
    """Incremental message payload carried by one streaming chunk; both fields are optional."""
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    """Request body of ``POST /v1/chat/completions``."""
    model: str
    messages: List[ChatMessageInput]
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.8
    # When None, the endpoint falls back to 1024.
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
    # Additional parameters
    # NOTE(review): the endpoint does not copy this field into the generation parameters.
    repetition_penalty: Optional[float] = 1.0


class ChatCompletionResponseChoice(BaseModel):
    """One choice of a non-streaming completion response."""
    index: int
    message: ChatMessageResponse


class ChatCompletionResponseStreamChoice(BaseModel):
    """One choice of a streaming completion chunk."""
    index: int
    delta: DeltaMessage
    # `predict` passes finish_reason when building the first chunk; declaring the field
    # keeps that value instead of silently discarding it as an unknown keyword.
    finish_reason: Optional[Literal["stop", "length"]] = None


class UsageInfo(BaseModel):
    """Token accounting attached to a completion response; all counters default to 0."""
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0


class ChatCompletionResponse(BaseModel):
    """Completion response, used both for full responses and for streaming chunks."""
    model: str
    # "chat.completion" for a full response, "chat.completion.chunk" for a streamed chunk.
    object: Literal["chat.completion", "chat.completion.chunk"]
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    # Filled with the current Unix time (seconds) when not provided.
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    usage: Optional[UsageInfo] = None


@app.get("/v1/models", response_model=ModelList)
async def list_models():
    """
    List the models served by this API.
    The result always contains exactly one hard-coded model card.
    """
    # The id can be replaced by your own model id, e.g. cogagent-chat-18b.
    return ModelList(data=[ModelCard(id="cogvlm-chat-17b")])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    """
    Chat completion endpoint.

    Rejects requests with no messages or whose last message comes from the assistant
    (HTTP 400). With ``stream`` set, returns a server-sent-event response fed by
    ``predict``; otherwise runs generation to completion and returns one response
    with the final text and usage counters.
    """
    global model, tokenizer

    has_no_messages = len(request.messages) < 1
    if has_no_messages or request.messages[-1].role == "assistant":
        raise HTTPException(status_code=400, detail="Invalid request")

    gen_params = {
        "messages": request.messages,
        "temperature": request.temperature,
        "top_p": request.top_p,
        "max_tokens": request.max_tokens or 1024,
        "echo": False,
        "stream": request.stream,
    }

    if request.stream:
        return EventSourceResponse(predict(request.model, gen_params), media_type="text/event-stream")

    response = generate_cogvlm(model, tokenizer, gen_params)

    message = ChatMessageResponse(role="assistant", content=response["text"])
    logger.debug(f"==== message ====\n{message}")
    choice_data = ChatCompletionResponseChoice(index=0, message=message)

    # Add the counters reported by the generator onto a zero-initialised UsageInfo.
    usage = UsageInfo()
    task_usage = UsageInfo.model_validate(response["usage"])
    for field_name, field_value in task_usage.model_dump().items():
        setattr(usage, field_name, getattr(usage, field_name) + field_value)

    return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion", usage=usage)


async def predict(model_id: str, params: dict):
    """
    Async generator producing the JSON payloads of a streaming completion.

    Yields, in order: one chunk announcing the assistant role, one chunk per
    generation step carrying only the newly generated text, and one final chunk
    with an empty delta.
    """

    global model, tokenizer

    def _chunk_json(choice):
        # Wrap a stream choice in a chunk response and serialise only explicitly set fields.
        chunk = ChatCompletionResponse(model=model_id, choices=[choice], object="chat.completion.chunk")
        return chunk.model_dump_json(exclude_unset=True)

    yield _chunk_json(ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    ))

    emitted_text = ""
    for new_response in generate_stream_cogvlm(model, tokenizer, params):
        full_text = new_response["text"]
        # Each response carries the whole text so far; send only the new suffix.
        delta_text = full_text[len(emitted_text):]
        emitted_text = full_text
        yield _chunk_json(ChatCompletionResponseStreamChoice(
            index=0,
            delta=DeltaMessage(content=delta_text, role="assistant"),
        ))

    yield _chunk_json(ChatCompletionResponseStreamChoice(index=0, delta=DeltaMessage()))


def generate_cogvlm(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
    """
    Non-streaming generation: exhaust ``generate_stream_cogvlm`` and return the last
    response it produced.
    """

    for response in generate_stream_cogvlm(model, tokenizer, params):
        continue  # intermediate responses are discarded; only the last one is kept
    return response


def process_history_and_images(messages: List[ChatMessageInput]) -> Tuple[
    Optional[str], Optional[List[Tuple[str, str]]], Optional[List[Image.Image]]]:
    """
    Process history messages to extract text, identify the last user query,
    and convert base64 encoded image data URLs to PIL images.

    Any ``data:image/<subtype>;base64,<payload>`` URL is decoded (JPEG, PNG, ...);
    image URLs of any other form are skipped.

    Args:
        messages(List[ChatMessageInput]): List of ChatMessageInput objects.
    return: A tuple of three elements:
             - The last user query as a string ('' when the final message is not from the user).
             - Text history formatted as a list of (query, answer) tuples for the model.
             - List of PIL Image objects (RGB) extracted from the messages.
    Raises:
        AssertionError: on an assistant message without a preceding unanswered user
            message, or on a role other than "user" / "assistant".
    """
    formatted_history = []
    image_list = []
    last_user_query = ''
    base64_marker = ";base64,"

    for i, message in enumerate(messages):
        role = message.role
        content = message.content

        if isinstance(content, list):
            # Multi-part content: join the text parts and decode the image parts.
            text_content = ' '.join(item.text for item in content if isinstance(item, TextContent))
            for item in content:
                if not isinstance(item, ImageUrlContent):
                    continue
                image_url = item.image_url.url
                if image_url.startswith("data:image/") and base64_marker in image_url:
                    base64_encoded_image = image_url.split(base64_marker, 1)[1]
                    image_data = base64.b64decode(base64_encoded_image)
                    image = Image.open(BytesIO(image_data)).convert('RGB')
                    image_list.append(image)
        else:
            text_content = content

        if role == 'user':
            if i == len(messages) - 1:  # the final message: this is the query to answer
                last_user_query = text_content
            else:
                # Answer is filled in by the assistant message that follows.
                formatted_history.append((text_content, ''))
        elif role == 'assistant':
            if formatted_history:
                if formatted_history[-1][1] != '':
                    assert False, f"the last query is answered. answer again. {formatted_history[-1][0]}, {formatted_history[-1][1]}, {text_content}"
                formatted_history[-1] = (formatted_history[-1][0], text_content)
            else:
                assert False, f"assistant reply before user"
        else:
            assert False, f"unrecognized role: {role}"

    return last_user_query, formatted_history, image_list


@torch.inference_mode()
def generate_stream_cogvlm(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
    """
    Run generation for one request and yield the accumulated text with usage counters.

    Each yielded dict has ``"text"`` (all text produced so far) and ``"usage"``
    (``prompt_tokens``, ``completion_tokens``, ``total_tokens``). A final dict with
    the complete text is always yielded last.

    ``model.generate`` is called synchronously before the streamer is read, so
    generation has already finished when the first piece of text is yielded.

    Args:
        model: the loaded model; must provide ``build_conversation_input_ids``.
        tokenizer: tokenizer used for input building and for the streamer.
        params: dict with ``messages`` and optional ``temperature``,
            ``repetition_penalty``, ``top_p``, ``max_tokens``.

    Raises:
        IndexError: if the messages contain no decodable image.
    """
    messages = params["messages"]
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    max_new_tokens = int(params.get("max_tokens", 256))
    query, history, image_list = process_history_and_images(messages)

    logger.debug(f"==== request ====\n{query}")

    # Only the most recent image is passed to the model.
    input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history,
                                                        images=[image_list[-1]])
    inputs = {
        'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
        'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
        'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
        'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]],
    }
    if 'cross_images' in input_by_model and input_by_model['cross_images']:
        inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]

    input_echo_len = len(inputs["input_ids"][0])
    streamer = TextIteratorStreamer(
        tokenizer=tokenizer,
        timeout=60.0,
        skip_prompt=True,
        skip_special_tokens=True
    )
    # Sampling is used only for a non-zero temperature; otherwise decoding is greedy.
    gen_kwargs = {
        "repetition_penalty": repetition_penalty,
        "max_new_tokens": max_new_tokens,
        "do_sample": True if temperature > 1e-5 else False,
        "top_p": top_p if temperature > 1e-5 else 0,
        'streamer': streamer,
    }
    if temperature > 1e-5:
        gen_kwargs["temperature"] = temperature

    generated_text = ""
    with torch.no_grad():
        output = model.generate(**inputs, **gen_kwargs)
        # The returned sequence holds prompt plus completion, so its length is the
        # total token count (previously this stayed 0, giving negative completion counts).
        output_ids = getattr(output, "sequences", output)
        total_len = output_ids.shape[1]
        for next_text in streamer:
            generated_text += next_text
            yield {
                "text": generated_text,
                "usage": {
                    "prompt_tokens": input_echo_len,
                    "completion_tokens": total_len - input_echo_len,
                    "total_tokens": total_len,
                },
            }
    ret = {
        "text": generated_text,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": total_len - input_echo_len,
            "total_tokens": total_len,
        },
    }
    yield ret


# Executed once at import time, before any model is loaded: run the garbage collector
# and release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

if __name__ == "__main__":
    # `tokenizer`, `torch_type` and `model` are assigned at module level here; the
    # request handlers and generate_stream_cogvlm read them as globals.
    tokenizer = LlamaTokenizer.from_pretrained(
        TOKENIZER_PATH,
        trust_remote_code=True)

    # bfloat16 when the CUDA compute capability major version is >= 8, else float16.
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
        torch_type = torch.bfloat16
    else:
        torch_type = torch.float16

    print("========Use torch type as:{} with device:{}========\n\n".format(torch_type, DEVICE))

    if 'cuda' in DEVICE:
        if QUANT_ENABLED:
            # 4-bit loading; no explicit .to(DEVICE) is applied in this branch.
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_PATH,
                load_in_4bit=True,
                trust_remote_code=True,
                torch_dtype=torch_type,
                low_cpu_mem_usage=True
            ).eval()
        else:
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_PATH,
                load_in_4bit=False,
                trust_remote_code=True,
                torch_dtype=torch_type,
                low_cpu_mem_usage=True
            ).to(DEVICE).eval()
            
    else:
        # CPU fallback: the model is converted to float32.
        model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
    # Serve on all interfaces, port 8000, with a single worker.
    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)


================================================
FILE: openai_demo/openai_api_request.py
================================================
"""
This script is designed to mimic the OpenAI API interface with CogVLM & CogAgent Chat
It demonstrates how to integrate image and text-based input to generate a response.
Currently, the model can only handle a single image.
Therefore, do not use this script to process multiple images in one conversation. (includes images from history)
And it only works on the chat model, not the base model.
"""
import requests
import json
import base64

# Address of the API server that create_chat_completion sends requests to.
base_url = "http://127.0.0.1:8000"


def create_chat_completion(model, messages, temperature=0.8, max_tokens=2048, top_p=0.8, use_stream=False):
    """
    This function sends a request to the chat API to generate a response based on the given messages.

    Args:
        model (str): The name of the model to use for generating the response.
        messages (list): A list of message dictionaries representing the conversation history.
        temperature (float): Controls randomness in response generation. Higher values lead to more random responses.
        max_tokens (int): The maximum length of the generated response.
        top_p (float): Controls diversity of response by filtering less likely options.
        use_stream (bool): Determines whether to use a streaming response or a single response.

    The function constructs a JSON payload with the specified parameters and sends a POST request to the API.
    It then prints the response, either chunk by chunk (streaming) or as a single message.
    On a non-200 status code the code is printed. The function always returns None.
    """

    data = {
        "model": model,
        "messages": messages,
        "stream": use_stream,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }

    response = requests.post(f"{base_url}/v1/chat/completions", json=data, stream=use_stream)
    if response.status_code != 200:
        print("Error:", response.status_code)
        return None

    if use_stream:
        # Handle the streaming response.
        for line in response.iter_lines():
            if not line:
                continue
            # Drop the first 6 characters (the "data: " prefix of an event line).
            decoded_line = line.decode('utf-8')[6:]
            try:
                response_json = json.loads(decoded_line)
                content = response_json.get("choices", [{}])[0].get("delta", {}).get("content", "")
                print(content)
            except (ValueError, IndexError, AttributeError, TypeError):
                # Not a JSON chunk of the expected shape; show the raw payload instead.
                # Only parsing/shape errors are caught so Ctrl-C still interrupts.
                print("Special Token:", decoded_line)
    else:
        # Handle the non-streaming response.
        decoded_line = response.json()
        # Default to {} so a missing "message" yields "" instead of failing on str.get.
        content = decoded_line.get("choices", [{}])[0].get("message", {}).get("content", "")
        print(content)


def encode_image(image_path):
    """
    Read a file and return its content as a base64 string.

    Args:
        image_path (str): The path to the image file.

    Returns:
        str: The base64 encoding of the file's bytes, decoded as UTF-8 text.
    """

    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")


def simple_image_chat(use_stream=True, img_path=None):
    """
    Run a fixed three-turn demo conversation about one image.

    Args:
        use_stream (bool): Forwarded to `create_chat_completion`; selects a
            streaming or a single response.
        img_path (str): Path of the image file that is attached to the first
            user turn.

    The image is base64-encoded with `encode_image` and wrapped in a
    `data:image/jpeg;base64,...` URL. The conversation consists of a user
    question about the image, a canned assistant answer, and a follow-up user
    question; it is sent through `create_chat_completion` with the model name
    "cogvlm-chat-17b". Nothing is returned.
    """

    img_url = "data:image/jpeg;base64," + encode_image(img_path)

    # First turn: a text question plus the image itself.
    first_question = {
        "role": "user",
        "content": [
            {"type": "text", "text": "What’s in this image?"},
            {"type": "image_url", "image_url": {"url": img_url}},
        ],
    }
    # Second turn: the pre-written assistant description of the demo picture.
    canned_answer = {
        "role": "assistant",
        "content": "The image displays a wooden boardwalk extending through a vibrant green grassy wetland. The sky is partly cloudy with soft, wispy clouds, indicating nice weather. Vegetation is seen on either side of the boardwalk, and trees are present in the background, suggesting that this area might be a natural reserve or park designed for ecological preservation and outdoor recreation. The boardwalk allows visitors to explore the area without disturbing the natural habitat.",
    }
    # Third turn: the question the model is actually asked to answer.
    follow_up = {
        "role": "user",
        "content": "Do you think this is a spring or winter photo?",
    }

    messages = [first_question, canned_answer, follow_up]
    create_chat_completion("cogvlm-chat-17b", messages=messages, use_stream=use_stream)


if __name__ == "__main__":
    # Script entry point: run the demo conversation once, without streaming,
    # using the image at the relative path "demo.jpg".
    simple_image_chat(use_stream=False, img_path="demo.jpg")


================================================
FILE: requirements.txt
================================================
SwissArmyTransformer>=0.4.9
transformers>=4.36.2
xformers>=0.0.22
torch>=2.1.0
torchvision>=0.16.2
spacy>=3.6.0
pillow>=10.2.0
deepspeed>=0.13.1
seaborn>=0.13.2
loguru~=0.7.2
streamlit>=1.31.0
timm>=0.9.12
accelerate>=0.26.1
pydantic>=2.6.0

# for openai demo
openai>=1.16.0
sse-starlette>=1.8.2
fastapi>=0.110.1
httpx>=0.27.0
uvicorn>=0.29.0
jsonlines>=4.0.0


================================================
FILE: utils/__init__.py
================================================


================================================
FILE: utils/merge_model.py
================================================
# -*- encoding: utf-8 -*-
import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import torch
import argparse
from models.cogvlm_model import FineTuneTestCogVLMModel
from sat.training.model_io import save_checkpoint

def main():
    """
    Load a fine-tuned CogVLM checkpoint and write it back out as a new checkpoint.

    Command-line options:
        --version: free-form version string (default "base").
        --from_pretrained: checkpoint directory to load
            (default "checkpoints/merged_lora").
        --fp16 / --bf16: precision flags forwarded to the model arguments.
        plus every option registered by
        `FineTuneTestCogVLMModel.add_model_specific_args`.

    The model is loaded through `FineTuneTestCogVLMModel.from_pretrained` in
    inference mode and saved with `save_checkpoint` under
    `./checkpoints/merged_model_<image_size>`, where `<image_size>` is the
    first entry of `model_args.eva_args["image_size"]`.

    `RANK` and `WORLD_SIZE` are read from the environment (defaults 0 and 1).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--version", type=str, default="base", help='version to interact with')
    parser.add_argument("--from_pretrained", type=str, default="checkpoints/merged_lora", help='pretrained ckpt')
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--bf16", action="store_true")
    # Register the model-specific options BEFORE parsing. Parsing earlier (as
    # the previous version did) makes argparse abort with "unrecognized
    # arguments" whenever a model-specific flag is present on the command line,
    # and the result of that early parse was discarded anyway.
    parser = FineTuneTestCogVLMModel.add_model_specific_args(parser)
    args = parser.parse_args()

    rank = int(os.environ.get('RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))

    # load model
    # NOTE(review): `device` is hard-coded to 'cuda' even when CUDA is not
    # available (only `use_gpu_initialization` follows availability) — confirm
    # whether CPU-only merging is meant to be supported.
    load_args = argparse.Namespace(
        deepspeed=None,
        local_rank=rank,
        rank=rank,
        world_size=world_size,
        model_parallel_size=world_size,
        mode='inference',
        skip_init=True,
        use_gpu_initialization=torch.cuda.is_available(),
        device='cuda',
        **vars(args)
    )
    model, model_args = FineTuneTestCogVLMModel.from_pretrained(
        args.from_pretrained,
        args=load_args,
        url='local',
        overwrite_args={'model_parallel_size': 1},
    )
    model = model.eval()
    # The output directory name carries the image resolution of the vision tower.
    model_args.save = './checkpoints/merged_model_{}'.format(model_args.eva_args["image_size"][0])
    save_checkpoint(1, model, None, None, model_args)

if __name__ == "__main__":
    # Script entry point: parse the command line and run the load-and-save routine.
    main()


================================================
FILE: utils/models/__init__.py
================================================
from .cogagent_model import CogAgentModel, FineTuneTrainCogAgentModel, FineTuneTestCogAgentModel
from .cogvlm_model import CogVLMModel, FineTuneTrainCogVLMModel, FineTuneTestCogVLMModel

================================================
FILE: utils/models/cogagent_model.py
================================================
from sat.model.official.llama_model import LLaMAModel
import json
import torch
from functools import partial
from sat.model.base_model import BaseMixin
import torch.nn as nn
import numpy as np
from sat.resources.urls import MODEL_URLS

from .eva_clip_L_hf import Eva2LargeEncoder
from .mixin import LlamaVisionExpertFCMixin, LlamaVisionExpertAttnMixin


# Register the CogAgent checkpoint archives in sat's MODEL_URLS table at import time.
# NOTE(review): presumably sat resolves these names when they are passed to
# `from_pretrained` — confirm against sat.resources.
MODEL_URLS["cogagent-chat"] = "r2://cogagent-chat.zip"
MODEL_URLS["cogagent-vqa"] = "r2://cogagent-vqa.zip"


class GLU(nn.Module):
    """
    Gated projection from vision features into the language model width.

    The input is first projected from `in_features` to `args.hidden_size`,
    normalised with LayerNorm and passed through GELU. A SiLU-gated
    feed-forward (width `args.inner_hidden_size`) then maps it back to
    `args.hidden_size`. None of the linear layers has a bias.
    """

    def __init__(self, args, in_features):
        super().__init__()
        hidden = args.hidden_size
        inner = args.inner_hidden_size
        self.linear_proj = nn.Linear(in_features, hidden, bias=False)
        self.norm1 = nn.LayerNorm(hidden)
        self.act1 = nn.GELU()
        self.act2 = nn.functional.silu
        self.dense_h_to_4h = nn.Linear(hidden, inner, bias=False)
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.dense_4h_to_h = nn.Linear(inner, hidden, bias=False)

    def forward(self, x):
        """Apply projection -> LayerNorm -> GELU -> gated feed-forward to `x`."""
        projected = self.act1(self.norm1(self.linear_proj(x)))
        gate = self.act2(self.gate_proj(projected))
        value = self.dense_h_to_4h(projected)
        return self.dense_4h_to_h(gate * value)

from .eva_clip_model import EVA2CLIPModel
import argparse
from copy import deepcopy
def override_dist_dtype_device_args(args, b=None):
    """
    Build the argument namespace for a sub-model from the parent model's args.

    Only the distributed / dtype / device related fields are copied from
    `args`; everything else comes from `b`.

    Args:
        args: Namespace of the parent model. `mode`, `world_size`, `rank`,
            `local_rank`, `skip_init`, `use_gpu_initialization`, `deepspeed`,
            `bf16`, `fp16` and `device` are always read. When `mode` is not
            'inference', `checkpoint_num_layers` and either
            `vit_checkpoint_activations` (preferred when present) or
            `checkpoint_activations` are read as well.
        b: Optional dict of extra settings for the sub-model. If `args` has a
            `model_parallel_size` attribute it is written into this dict in
            place (the caller's dict is updated, as before). When omitted, a
            fresh dict is used for every call.

    Returns:
        argparse.Namespace: a deep copy of `b` merged with the fields copied
        from `args`.

    Raises:
        TypeError: if `b` contains a key that is also one of the copied fields.
    """
    if b is None:
        # A new dict per call: a shared `{}` default would keep the
        # `model_parallel_size` written below and leak it into later calls.
        b = {}

    if args.mode == 'inference':
        minimal_args = argparse.Namespace(
            world_size=args.world_size,
            rank=args.rank,
            local_rank=args.local_rank,
            skip_init=args.skip_init,
            use_gpu_initialization=args.use_gpu_initialization,
            deepspeed=args.deepspeed,
            bf16=args.bf16,
            fp16=args.fp16,
            mode=args.mode,
            device=args.device
        )
    else:
        # A dedicated ViT setting takes precedence over the global one.
        if hasattr(args, 'vit_checkpoint_activations'):
            checkpoint_activations = args.vit_checkpoint_activations
        else:
            checkpoint_activations = args.checkpoint_activations
        minimal_args = argparse.Namespace(
            world_size=args.world_size,
            rank=args.rank,
            local_rank=args.local_rank,
            skip_init=args.skip_init,
            use_gpu_initialization=args.use_gpu_initialization,
            deepspeed=args.deepspeed,
            bf16=args.bf16,
            fp16=args.fp16,
            mode=args.mode,
            checkpoint_activations=checkpoint_activations,
            checkpoint_num_layers=args.checkpoint_num_layers,
            device=args.device,
            # Dropout is forced off for the sub-model outside inference mode.
            hidden_dropout=0.,
            attention_dropout=0.,
        )
    if hasattr(args, 'model_parallel_size'):
        b['model_parallel_size'] = args.model_parallel_size
    return argparse.Namespace(**deepcopy(b), **vars(minimal_args))


class ExternalVisionModel(BaseMixin):
    '''A vision encoder whose output gets a learned positional embedding added.'''
    def __init__(self, args, vitclass):
        '''
            args: namespace providing `cross_image_pix`, the side length in
                pixels of the image fed to the encoder
            vitclass: zero-argument callable that builds the vision encoder
        '''
        super().__init__()
        self.vit = vitclass()
        # One embedding row per position of a (cross_image_pix // 14)-per-side grid.
        # NOTE(review): 14 is presumably the encoder's patch size and 1024 its
        # output width — confirm against Eva2LargeEncoder.
        side = args.cross_image_pix // 14
        cross_image_length = side ** 2
        self.pos_embed = nn.Parameter(torch.zeros(cross_image_length, 1024))

    def forward(self, *args, **kw_args):
        '''Run the encoder on the given inputs and add the positional table (broadcast over the leading dimension).'''
        encoded = self.vit(*args, **kw_args)
        positional = self.pos_embed.unsqueeze(0)
        return encoded + positional

class ImageMixin(BaseMixin):
    """
    Mixin that replaces image placeholder positions in the word embeddings
    with projected features from an EVA2-CLIP vision model.

    Attributes created here:
        vit_model: EVA2CLIPModel built from `args.eva_args` plus the
            distributed/dtype/device settings of `args`.
        linear_proj: GLU projecting the 1792-wide vision features to
            `args.hidden_size`.
        boi / eoi: learned embeddings placed before / after each image.
        pos_embed: learned positional table of shape (image_length, 1792)
            added to the vision features before projection.
    """
    def __init__(self, args):
        super().__init__()
        eva_config = override_dist_dtype_device_args(args, args.eva_args)
        self.vit_model = EVA2CLIPModel(EVA2CLIPModel.get_args(**vars(eva_config)))
        self.in_features = 1792
        self.linear_proj = GLU(args, self.in_features)
        self.image_length = args.image_length
        self.boi = nn.Parameter(torch.zeros(1, 1, args.hidden_size))
        self.eoi = nn.Parameter(torch.zeros(1, 1, args.hidden_size))
        self.pos_embed = nn.Parameter(torch.zeros(self.image_length, 1792))

    def word_embedding_forward(self, input_ids, output_cross_layer, **kw_args):
        """
        Embed `input_ids`, writing image features into the masked positions.

        Keyword arguments whose name starts with `vision_` (except
        `vision_expert_mask`) are forwarded to the vision model with the
        prefix stripped. When the sequence has a single token or no vision
        input is given, the plain word embeddings are returned.
        """
        prefix = 'vision_'
        vision_inputs = {
            key[len(prefix):]: value
            for key, value in kw_args.items()
            if key.startswith(prefix) and key != 'vision_expert_mask'
        }
        single_token = input_ids.shape[1] == 1
        if single_token or not vision_inputs:
            return self.transformer.word_embeddings(input_ids)

        # Vision features -> add positions -> project to the LM hidden size.
        image_emb = self.vit_model(**vision_inputs)[0]
        image_emb = image_emb + self.pos_embed.unsqueeze(0)
        image_emb = self.linear_proj(image_emb)

        image_embed_mask = kw_args['image_embed_mask']
        word_embedding = self.transformer.word_embeddings(input_ids).clone()

        # Surround every image with the begin/end-of-image embeddings, then
        # flatten so the rows line up with the True entries of the mask.
        num_images = len(image_emb)
        framed = torch.cat(
            [
                self.boi.repeat(num_images, 1, 1),
                image_emb,
                self.eoi.repeat(num_images, 1, 1),
            ],
            dim=1,
        )
        word_embedding[image_embed_mask.bool()] = framed.reshape(-1, image_emb.shape[-1])

        return word_embedding.contiguous()
    
class CogAgentModel(LLaMAModel):
    """
    LLaMA model extended with an EVA image mixin, vision-expert MLP and
    attention mixins, and an additional vision encoder (mixin "encoder")
    whose output is passed on as `encoder_outputs`.
    """
    def __init__(self, args, transformer=None, **kwargs):
        super().__init__(args, transformer=transformer, **kwargs)
        self.image_length = args.image_length
        self.cross_image_pix = args.cross_image_pix
        self.add_mixin("eva", ImageMixin(args))

        # Swap the stock "mlp" and "rotary" mixins for the vision-expert versions.
        self.del_mixin("mlp")
        expert_mlp = LlamaVisionExpertFCMixin(args.hidden_size, args.inner_hidden_size, args.num_layers, 32)
        self.add_mixin("mlp", expert_mlp)
        self.del_mixin("rotary")
        expert_attn = LlamaVisionExpertAttnMixin(args.hidden_size, args.num_attention_heads, args.num_layers, 32)
        self.add_mixin("rotary", expert_attn)

        # Second vision tower, built lazily for images of side `cross_image_pix`.
        build_encoder = partial(Eva2LargeEncoder, image_size=self.cross_image_pix)
        cross_model = ExternalVisionModel(args, vitclass=build_encoder)
        self.add_mixin("encoder", cross_model)

    @classmethod
    def add_model_specific_args(cls, parser):
        """Register the CogAgent options on `parser`, then defer to the parent class."""
        group = parser.add_argument_group('CogAgent', 'CogAgent Configurations')
        group.add_argument('--image_length', type=int, default=256)
        # The standard CogAgent uses 1120; fine-tune the model first if you want to change this value.
        group.add_argument('--cross_image_pix', type=int, default=1120)
        group.add_argument('--eva_args', type=json.loads, default={})
        return super().add_model_specific_args(parser)

    def forward(self, input_ids, vision_expert_mask, image_embed_mask, **kwargs):
        """
        Compute (or reuse) the encoder output, then run the LLaMA forward.

        Keyword arguments starting with `cross_` are passed, prefix removed,
        to the "encoder" mixin unless `mems_cross` already holds its output.
        `cross_attention_mask` must be present among the keyword arguments.
        The two mask arguments are only forwarded when the sequence is longer
        than one token.
        """
        cross_inputs = {
            name[6:]: value
            for name, value in kwargs.items()
            if name.startswith('cross_')
        }
        if kwargs.get("mems_cross") is not None:
            # Reuse the cached encoder output.
            kwargs['encoder_outputs'] = kwargs["mems_cross"][0]
        else:
            kwargs['encoder_outputs'] = self.get_mixin('encoder')(**cross_inputs)
        kwargs['cross_attention_mask'] = cross_inputs['attention_mask']

        mask_kwargs = {}
        if input_ids.shape[1] > 1:
            mask_kwargs = dict(vision_expert_mask=vision_expert_mask, image_embed_mask=image_embed_mask)
        return super().forward(input_ids=input_ids, **mask_kwargs, **kwargs)


class FineTuneTrainCogAgentModel(CogAgentModel):
    """CogAgent model for fine-tuning: keeps `args` and registers the fine-tuning options."""
    def __init__(self, args, transformer=None, **kw_args):
        super().__init__(args, transformer=transformer, **kw_args)
        # No fine-tuning mixin is attached here: to combine model parallelism
        # on an mp_size=1 checkpoint with lora, the mixin has to be added
        # after the checkpoint has been loaded.
        self.args = args

    @classmethod
    def add_model_specific_args(cls, parser):
        """Register the fine-tuning options on `parser`, then defer to the parent class."""
        group = parser.add_argument_group('CogAgent-finetune', 'CogAgent finetune Configurations')
        group.add_argument('--pre_seq_len', type=int, default=8)
        group.add_argument('--lora_rank', type=int, default=10)
        for switch in ('--use_ptuning', '--use_lora', '--use_qlora'):
            group.add_argument(switch, action="store_true")
        group.add_argument('--layer_range', nargs='+', type=int, default=None)
        return super().add_model_specific_args(parser)


from sat.model.finetune import PTuningV2Mixin
from sat.model.finetune.lora2 import LoraMixin
class FineTuneTestCogAgentModel(CogAgentModel):
    """CogAgent model that attaches the p-tuning / LoRA / QLoRA mixins selected in `args` at construction time."""
    def __init__(self, args, transformer=None, **kw_args):
        super().__init__(args, transformer=transformer, **kw_args)
        if args.use_ptuning:
            head_dim = args.hidden_size // args.num_attention_heads
            ptuning = PTuningV2Mixin(args.num_layers, head_dim, args.num_attention_heads, args.pre_seq_len)
            self.add_mixin("ptuning", ptuning)
        if args.use_lora:
            # LoRA on the language model and on the EVA vision model.
            llm_lora = LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range)
            self.add_mixin("lora", llm_lora, reinit=True)
            vit_model = self.get_mixin("eva").vit_model
            vit_model.add_mixin("lora", LoraMixin(args.eva_args['num_layers'], args.lora_rank, layer_range=args.layer_range), reinit=True)
        elif args.use_qlora:
            # QLoRA is applied to the language model only.
            qlora = LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range, qlora=True)
            self.add_mixin("lora", qlora, reinit=True)
        self.args = args

    @classmethod
    def add_model_specific_args(cls, parser):
        """Register the fine-tuning options on `parser`, then defer to the parent class."""
        group = parser.add_argument_group('CogAgent-finetune', 'CogAgent finetune Configurations')
        group.add_argument('--pre_seq_len', type=int, default=8)
        group.add_argument('--lora_rank', type=int, default=10)
        for switch in ('--use_ptuning', '--use_lora', '--use_qlora'):
            group.add_argument(switch, action="store_true")
        group.add_argument('--layer_range', nargs='+', type=int, default=None)
        return super().add_model_specific_args(parser)


================================================
FILE: utils/models/cogvlm_model.py
================================================
from sat.model.official.llama_model import LLaMAModel
import json
import torch
from sat.model.base_model import BaseMixin
import torch.nn as nn
from .mixin import LlamaVisionExpertFCMixin, LlamaVisionExpertAttnMixin

from sat.resources.urls import MODEL_URLS

# Register the CogVLM checkpoint archives in sat's MODEL_URLS table at import time.
# NOTE(review): presumably sat resolves these names when they are passed to
# `from_pretrained` — confirm against sat.resources.
MODEL_URLS["cogvlm-base-224"] = "r2://cogvlm-base-224.zip"
MODEL_URLS["cogvlm-base-490"] = "r2://cogvlm-base-490.zip"
MODEL_URLS["cogvlm-chat-v1.1"] = "r2://cogvlm-chat-v1.1.zip"
MODEL_URLS["cogvlm-grounding-base"] = "r2://cogvlm-grounding-base.zip"
MODEL_URLS["cogvlm-grounding-generalist-v1.1"] = "r2://cogvlm-grounding-generalist-v1.1.zip"


class GLU(nn.Module):
    """
    Gated projection from vision features into the language model width.

    Steps: linear projection `in_features -> args.hidden_size`, LayerNorm,
    GELU, then a SiLU-gated feed-forward of width `args.inner_hidden_size`
    that maps back to `args.hidden_size`. All linear layers are bias-free.
    """

    def __init__(self, args, in_features):
        super().__init__()
        width = args.hidden_size
        inner_width = args.inner_hidden_size
        self.linear_proj = nn.Linear(in_features, width, bias=False)
        self.norm1 = nn.LayerNorm(width)
        self.act1 = nn.GELU()
        self.act2 = nn.functional.silu
        self.dense_h_to_4h = nn.Linear(width, inner_width, bias=False)
        self.gate_proj = nn.Linear(width, inner_width, bias=False)
        self.dense_4h_to_h = nn.Linear(inner_width, width, bias=False)

    def forward(self, x):
        """Return the gated feed-forward output for `x`."""
        normed = self.act1(self.norm1(self.linear_proj(x)))
        gated = self.act2(self.gate_proj(normed)) * self.dense_h_to_4h(normed)
        return self.dense_4h_to_h(gated)

from .eva_clip_model import EVA2CLIPModel
import argparse
from copy import deepcopy
def override_dist_dtype_device_args(args, b=None):
    """
    Build the argument namespace for a sub-model from the parent model's args.

    Only the distributed / dtype / device related fields are copied from
    `args`; everything else comes from `b`.

    Args:
        args: Namespace of the parent model. `mode`, `world_size`, `rank`,
            `local_rank`, `skip_init`, `use_gpu_initialization`, `deepspeed`,
            `bf16`, `fp16` and `device` are always read. When `mode` is not
            'inference', `checkpoint_num_layers` and either
            `vit_checkpoint_activations` (preferred when present) or
            `checkpoint_activations` are read as well.
        b: Optional dict of extra settings for the sub-model. If `args` has a
            `model_parallel_size` attribute it is written into this dict in
            place (the caller's dict is updated, as before). When omitted, a
            fresh dict is used for every call.

    Returns:
        argparse.Namespace: a deep copy of `b` merged with the fields copied
        from `args`.

    Raises:
        TypeError: if `b` contains a key that is also one of the copied fields.
    """
    if b is None:
        # A new dict per call: a shared `{}` default would keep the
        # `model_parallel_size` written below and leak it into later calls.
        b = {}

    if args.mode == 'inference':
        minimal_args = argparse.Namespace(
            world_size=args.world_size,
            rank=args.rank,
            local_rank=args.local_rank,
            skip_init=args.skip_init,
            use_gpu_initialization=args.use_gpu_initialization,
            deepspeed=args.deepspeed,
            bf16=args.bf16,
            fp16=args.fp16,
            mode=args.mode,
            device=args.device
        )
    else:
        # A dedicated ViT setting takes precedence over the global one.
        if hasattr(args, 'vit_checkpoint_activations'):
            checkpoint_activations = args.vit_checkpoint_activations
        else:
            checkpoint_activations = args.checkpoint_activations
        minimal_args = argparse.Namespace(
            world_size=args.world_size,
            rank=args.rank,
            local_rank=args.local_rank,
            skip_init=args.skip_init,
            use_gpu_initialization=args.use_gpu_initialization,
            deepspeed=args.deepspeed,
            bf16=args.bf16,
            fp16=args.fp16,
            mode=args.mode,
            checkpoint_activations=checkpoint_activations,
            checkpoint_num_layers=args.checkpoint_num_layers,
            device=args.device,
            # Dropout is forced off for the sub-model outside inference mode.
            hidden_dropout=0.,
            attention_dropout=0.,
        )
    if hasattr(args, 'model_parallel_size'):
        b['model_parallel_size'] = args.model_parallel_size
    return argparse.Namespace(**deepcopy(b), **vars(minimal_args))

class ImageMixin(BaseMixin):
    """
    Mixin that replaces image placeholder positions in the word embeddings
    with projected features from an EVA2-CLIP vision model.

    Attributes created here:
        vit_model: EVA2CLIPModel built from `args.eva_args` plus the
            distributed/dtype/device settings of `args`.
        linear_proj: GLU projecting the 1792-wide vision features to
            `args.hidden_size`.
        boi / eoi: learned embeddings placed before / after each image.
    """
    def __init__(self, args):
        super().__init__()
        eva_config = override_dist_dtype_device_args(args, args.eva_args)
        self.vit_model = EVA2CLIPModel(EVA2CLIPModel.get_args(**vars(eva_config)))
        self.in_features = 1792
        self.linear_proj = GLU(args, self.in_features)
        self.image_length = args.image_length
        self.boi = nn.Parameter(torch.zeros(1, 1, args.hidden_size))
        self.eoi = nn.Parameter(torch.zeros(1, 1, args.hidden_size))

    def word_embedding_forward(self, input_ids, output_cross_layer, **kw_args):
        """
        Embed `input_ids`, writing image features into the masked positions.

        Keyword arguments whose name starts with `vision_` (except
        `vision_expert_mask`) are forwarded to the vision model with the
        prefix stripped. When the sequence has a single token or no vision
        input is given, the plain word embeddings are returned.
        """
        prefix = 'vision_'
        vision_inputs = {
            key[len(prefix):]: value
            for key, value in kw_args.items()
            if key.startswith(prefix) and key != 'vision_expert_mask'
        }
        single_token = input_ids.shape[1] == 1
        if single_token or not vision_inputs:
            return self.transformer.word_embeddings(input_ids)

        # Vision features projected to the LM hidden size.
        image_emb = self.linear_proj(self.vit_model(**vision_inputs)[0])

        image_embed_mask = kw_args['image_embed_mask']
        word_embedding = self.transformer.word_embeddings(input_ids).clone()

        # Surround every image with the begin/end-of-image embeddings, then
        # flatten so the rows line up with the True entries of the mask.
        num_images = len(image_emb)
        framed = torch.cat(
            [
                self.boi.repeat(num_images, 1, 1),
                image_emb,
                self.eoi.repeat(num_images, 1, 1),
            ],
            dim=1,
        )
        word_embedding[image_embed_mask.bool()] = framed.reshape(-1, image_emb.shape[-1])
        return word_embedding.contiguous()


class CogVLMModel(LLaMAModel):
    """LLaMA model extended with an EVA image mixin and vision-expert MLP / attention mixins."""
    def __init__(self, args, transformer=None, **kwargs):
        super().__init__(args, transformer=transformer, **kwargs)
        self.image_length = args.image_length
        self.add_mixin("eva", ImageMixin(args))

        # Swap the stock "mlp" and "rotary" mixins for the vision-expert versions.
        self.del_mixin("mlp")
        expert_mlp = LlamaVisionExpertFCMixin(args.hidden_size, args.inner_hidden_size, args.num_layers, 32)
        self.add_mixin("mlp", expert_mlp)
        self.del_mixin("rotary")
        expert_attn = LlamaVisionExpertAttnMixin(args.hidden_size, args.num_attention_heads, args.num_layers, 32)
        self.add_mixin("rotary", expert_attn)

    @classmethod
    def add_model_specific_args(cls, parser):
        """Register the CogVLM options on `parser`, then defer to the parent class."""
        group = parser.add_argument_group('CogVLM', 'CogVLM Configurations')
        group.add_argument('--image_length', type=int, default=256)
        group.add_argument('--eva_args', type=json.loads, default={})
        return super().add_model_specific_args(parser)

    def forward(self, input_ids, vision_expert_mask, image_embed_mask, **kwargs):
        """Run the LLaMA forward; the two masks are only forwarded when the sequence is longer than one token."""
        mask_kwargs = {}
        if input_ids.shape[1] > 1:
            mask_kwargs = dict(vision_expert_mask=vision_expert_mask, image_embed_mask=image_embed_mask)
        return super().forward(input_ids=input_ids, **mask_kwargs, **kwargs)


class FineTuneTrainCogVLMModel(CogVLMModel):
    """CogVLM model for fine-tuning: keeps `args` and registers the fine-tuning options."""
    def __init__(self, args, transformer=None, **kw_args):
        super().__init__(args, transformer=transformer, **kw_args)
        # No fine-tuning mixin is attached here: to combine model parallelism
        # on an mp_size=1 checkpoint with lora, the mixin has to be added
        # after the checkpoint has been loaded.
        self.args = args

    @classmethod
    def add_model_specific_args(cls, parser):
        """Register the fine-tuning options on `parser`, then defer to the parent class."""
        group = parser.add_argument_group('CogVLM-finetune', 'CogVLM finetune Configurations')
        group.add_argument('--pre_seq_len', type=int, default=8)
        group.add_argument('--lora_rank', type=int, default=10)
        for switch in ('--use_ptuning', '--use_lora', '--use_qlora'):
            group.add_argument(switch, action="store_true")
        group.add_argument('--layer_range', nargs='+', type=int, default=None)
        return super().add_model_specific_args(parser)


from sat.model.finetune import PTuningV2Mixin
from sat.model.finetune.lora2 import LoraMixin
class FineTuneTestCogVLMModel(CogVLMModel):
    """CogVLM model that attaches the p-tuning / LoRA / QLoRA mixins selected in `args` at construction time."""
    def __init__(self, args, transformer=None, **kw_args):
        super().__init__(args, transformer=transformer, **kw_args)
        if args.use_ptuning:
            head_dim = args.hidden_size // args.num_attention_heads
            ptuning = PTuningV2Mixin(args.num_layers, head_dim, args.num_attention_heads, args.pre_seq_len)
            self.add_mixin("ptuning", ptuning)
        if args.use_lora:
            # LoRA on the language model and on the EVA vision model.
            llm_lora = LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range)
            self.add_mixin("lora", llm_lora, reinit=True)
            vit_model = self.get_mixin("eva").vit_model
            vit_model.add_mixin("lora", LoraMixin(args.eva_args['num_layers'], args.lora_rank, layer_range=args.layer_range), reinit=True)
        elif args.use_qlora:
            # QLoRA is applied to the language model only.
            qlora = LoraMixin(args.num_layers, args.lora_rank, layer_range=args.layer_range, qlora=True)
            self.add_mixin("lora", qlora, reinit=True)
        self.args = args

    @classmethod
    def add_model_specific_args(cls, parser):
        """Register the fine-tuning options on `parser`, then defer to the parent class."""
        group = parser.add_argument_group('CogVLM-finetune', 'CogVLM finetune Configurations')
        group.add_argument('--pre_seq_len', type=int, default=8)
        group.add_argument('--lora_rank', type=int, default=10)
        for switch in ('--use_ptuning', '--use_lora', '--use_qlora'):
            group.add_argument(switch, action="store_true")
        group.add_argument('--layer_range', nargs='+', type=int, default=None)
        return super().add_model_specific_args(parser)


================================================
FILE: utils/models/eva_clip_L_hf.py
================================================
from math import pi
import torch
from torch import nn
from einops import rearrange, repeat
import logging

def broadcat(tensors, dim = -1):
    """
    Concatenate tensors along `dim` after broadcasting every other axis.

    All tensors must have the same number of dimensions. On each axis other
    than `dim`, the sizes across the tensors may take at most two distinct
    values; every tensor is expanded to the largest size on that axis and the
    results are concatenated along `dim`.

    Args:
        tensors: sequence of tensors.
        dim: axis to concatenate along; negative values count from the end.

    Returns:
        The concatenated tensor.
    """
    ranks = {len(t.shape) for t in tensors}
    assert len(ranks) == 1, 'tensors must all have the same number of dimensions'
    rank = next(iter(ranks))
    if dim < 0:
        dim = dim + rank

    # sizes_per_axis[a] holds the size of axis `a` for every input tensor.
    sizes_per_axis = list(zip(*(list(t.shape) for t in tensors)))
    broadcast_ok = all(
        len(set(sizes)) <= 2
        for axis, sizes in enumerate(sizes_per_axis)
        if axis != dim
    )
    assert broadcast_ok, 'invalid dimensions for broadcastable concatentation'

    concat_sizes = sizes_per_axis[dim]
    expanded = []
    for index, tensor in enumerate(tensors):
        # Largest size on every axis, except the concat axis which keeps its own.
        target = [max(sizes) for sizes in sizes_per_axis]
        target[dim] = concat_sizes[index]
        expanded.append(tensor.expand(*target))
    return torch.cat(expanded, dim = dim)

def rotate_half(x):
    """
    Rotate consecutive pairs of the last dimension: (a, b) becomes (-b, a).

    The last dimension of `x` is viewed as pairs, each pair is rotated, and
    the result is flattened back to the original shape.
    """
    pairs = rearrange(x, '... (d r) -> ... d r', r = 2)
    first, second = pairs.unbind(dim = -1)
    rotated = torch.stack((-second, first), dim = -1)
    return rearrange(rotated, '... d r -> ... (d r)')

class VisionRotaryEmbeddingFast(nn.Module):
    """
    Precomputed 2D rotary position embedding for a square grid of positions.

    The constructor builds cosine / sine tables for an
    ``ft_seq_len x ft_seq_len`` grid and registers them as the buffers
    ``freqs_cos`` and ``freqs_sin`` (one row per grid position). ``forward``
    applies them as ``t * cos + rotate_half(t) * sin``.

    Args:
        dim: Size used to derive the frequencies ('lang' and 'pixel' produce
            ``dim // 2`` of them).
        pt_seq_len: Range the grid coordinates are rescaled to.
        ft_seq_len: Grid side length; defaults to ``pt_seq_len``.
        custom_freqs: Optional tensor of frequencies. When it is not None it
            is used as-is and ``freqs_for`` is ignored.
        freqs_for: 'lang', 'pixel' or 'constant'; selects how frequencies are
            derived when ``custom_freqs`` is None.
        theta: Base of the geometric progression used for 'lang'.
        max_freq: Upper bound parameter used for 'pixel'.
        num_freqs: Number of (all-ones) frequencies used for 'constant'.
        patch_dropout: Stored on the module; not read by the code in this class.

    Raises:
        ValueError: if ``freqs_for`` is not one of the supported values and no
            ``custom_freqs`` is given.
    """
    def __init__(
        self,
        dim,
        pt_seq_len,
        ft_seq_len=None,
        custom_freqs = None,
        freqs_for = 'lang',
        theta = 10000,
        max_freq = 10,
        num_freqs = 1,
        patch_dropout = 0.
    ):
        super().__init__()
        # Explicit None check: the truth value of a tensor with more than one
        # element is ambiguous (raises), and an all-zero single-element tensor
        # would be silently ignored.
        if custom_freqs is not None:
            freqs = custom_freqs
        elif freqs_for == 'lang':
            freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
        elif freqs_for == 'pixel':
            freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
        elif freqs_for == 'constant':
            freqs = torch.ones(num_freqs).float()
        else:
            raise ValueError(f'unknown modality {freqs_for}')

        if ft_seq_len is None: ft_seq_len = pt_seq_len
        # Coordinates 0..ft_seq_len-1 rescaled into the range [0, pt_seq_len).
        t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len

        # Outer product position x frequency, each frequency duplicated for
        # the (pair-wise) rotation, then combined over both grid axes.
        freqs = torch.einsum('..., f -> ... f', t, freqs)
        freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
        freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1)

        # Flatten the two grid axes into one row index.
        freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
        freqs_sin = freqs.sin().view(-1, freqs.shape[-1])

        self.patch_dropout = patch_dropout

        self.register_buffer("freqs_cos", freqs_cos)
        self.register_buffer("freqs_sin", freqs_sin)

        logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')

    def forward(self, t, patch_indices_keep=None):
        """
        Apply the rotary embedding to `t`.

        Args:
            t: Tensor to rotate.
                # assumes t is (batch, heads, positions, features) — TODO confirm
            patch_indices_keep: Optional per-sample indices of the positions
                that were kept; when given, only the matching rows of the
                tables are used for each sample.

        Returns:
            Tensor of the same shape as `t`.
        """
        if patch_indices_keep is not None:
            batch = t.size()[0]
            batch_indices = torch.arange(batch)
            batch_indices = batch_indices[..., None]

            # Expand the tables to one copy per sample (n) and per axis-1 entry of t (m).
            freqs_cos = repeat(self.freqs_cos, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])
            freqs_sin = repeat(self.freqs_sin, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])

            # Select the kept positions per sample, then move the position
            # axis behind the m axis.
            freqs_cos = freqs_cos[batch_indices, patch_indices_keep]
            freqs_cos = rearrange(freqs_cos, 'n i m j -> n m i j')
            freqs_sin = freqs_sin[batch_indices, patch_indices_keep]
            freqs_sin = rearrange(freqs_sin, 'n i m j -> n m i j')

            return  t * freqs_cos + rotate_half(t) * freqs_sin

        return  t * self.freqs_cos + rotate_half(t) * self.freqs_sin

import torch.nn as nn
import os
from dataclasses import dataclass
from typing import Optional, Tuple, Union
from functools import partial

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

# --------------------------------------------------------
# Adapted from  https://github.com/microsoft/unilm/tree/master/beit
# --------------------------------------------------------
import math
import os
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
import logging
try:
    from timm.models.layers import drop_path, to_2tuple, trunc_normal_
except:
    from timm.layers import drop_path, to_2tuple, trunc_normal_
    
class PatchDropout(nn.Module):
    """
    Randomly keeps a subset of tokens per sample during training.

    https://arxiv.org/abs/2212.00794
    """

    def __init__(self, prob, exclude_first_token=True):
        super().__init__()
        assert 0 <= prob < 1.
        self.prob = prob
        # When True, the first token (CLS) is never dropped.
        self.exclude_first_token = exclude_first_token
        logging.info(f"os.getenv('RoPE')={os.getenv('RoPE')}")

    def forward(self, x):
        """
        Return `x` unchanged in eval mode or when `prob` is 0. Otherwise keep
        `max(1, int(num_tokens * (1 - prob)))` randomly chosen tokens per
        sample. When the environment variable `RoPE` equals '1', the kept
        indices are returned as a second value.
        """
        if self.prob == 0. or not self.training:
            return x

        if self.exclude_first_token:
            cls_tokens, x = x[:, :1], x[:, 1:]
        else:
            cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])

        n_batch = x.size()[0]
        n_tokens = x.size()[1]

        # Column vector of sample indices, paired with the kept token indices below.
        row_index = torch.arange(n_batch)[..., None]

        n_keep = max(1, int(n_tokens * (1 - self.prob)))

        # Random scores per token; the top-scoring ones are kept.
        scores = torch.randn(n_batch, n_tokens)
        patch_indices_keep = scores.topk(n_keep, dim=-1).indices

        x = x[row_index, patch_indices_keep]

        if self.exclude_first_token:
            x = torch.cat((cls_tokens, x), dim=1)

        rope_enabled = os.getenv('RoPE') == '1'
        if self.training and rope_enabled:
            return x, patch_indices_keep

        return x

if os.getenv('ENV_TYPE') == 'deepspeed':
    try:
        from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
    except:
        from torch.utils.checkpoint import checkpoint
else:
    from torch.utils.checkpoint import checkpoint

import xformers.ops as xops

class DropPath(nn.Module):
    """Stochastic depth: drops whole paths per sample (for use in the main path of residual blocks).

    Thin module wrapper that calls `drop_path` with the stored probability
    and the module's training flag.
    """
    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        probability, is_training = self.drop_prob, self.training
        return drop_path(x, probability, is_training)

    def extra_repr(self) -> str:
        # Shown inside the module's repr, e.g. "p=0.1".
        return f'p={self.drop_prob}'


class Mlp(nn.Module):
    """
    Two-layer feed-forward block: fc1 -> activation -> optional norm -> fc2 -> dropout.

    `hidden_features` and `out_features` default to `in_features`. With
    `subln=True` a `norm_layer` is applied to the hidden activations;
    otherwise that step is an identity.
    """
    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        drop=0.,
        subln=False,
    ):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        if subln:
            self.ffn_ln = norm_layer(hidden_features)
        else:
            self.ffn_ln = nn.Identity()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.act(self.fc1(x))
        # Dropout is deliberately not applied after the activation here
        # (it was commented out to follow the original BERT implementation).
        hidden = self.ffn_ln(hidden)
        return self.drop(self.fc2(hidden))

class SwiGLU(nn.Module):
    """
    Gated feed-forward block: `w3(norm(act(w1(x)) * w2(x)))` followed by dropout.

    `hidden_features` and `out_features` default to `in_features`. With
    `subln=True` a `norm_layer` is applied to the gated hidden activations;
    otherwise that step is an identity.
    """
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
                norm_layer=nn.LayerNorm, subln=False):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        # w1 feeds the activation (gate), w2 is the linear branch.
        self.w1 = nn.Linear(in_features, hidden_features)
        self.w2 = nn.Linear(in_features, hidden_features)

        self.act = act_layer()
        if subln:
            self.ffn_ln = norm_layer(hidden_features)
        else:
            self.ffn_ln = nn.Identity()
        self.w3 = nn.Linear(hidden_features, out_features)

        self.drop = nn.Dropout(drop)

    def forward(self, x):
        gate = self.w1(x)
        linear = self.w2(x)
        gated = self.ffn_ln(self.act(gate) * linear)
        return self.drop(self.w3(gated))

class Attention(nn.Module):
    """Multi-head self-attention used by the EVA vision blocks.

    Features selected at construction time:

    * ``subln``: separate bias-free q/k/v projections plus a normalisation of
      the concatenated heads before the output projection; otherwise a single
      fused bias-free ``qkv`` projection is used.
    * ``qkv_bias``: learned biases for q and v only (k never has a bias).
    * ``window_size``: a learned relative position bias table for a
      ``window_size[0] x window_size[1]`` grid plus one leading extra token.
    * ``rope``: callable applied to q and k for every token except the first.
    * ``xattn``: use ``xops.memory_efficient_attention`` instead of the
      explicit softmax path.  In this path ``rel_pos_bias``, ``attn_mask`` and
      the relative position bias table are not applied.

    Args:
        dim: Input/output embedding size.
        num_heads: Number of attention heads.
        qkv_bias: Whether to create learned q and v biases.
        qk_scale: Attention scale; defaults to ``head_dim ** -0.5`` when falsy.
        attn_drop: Dropout probability on the attention weights.
        proj_drop: Dropout probability after the output projection.
        window_size: ``(height, width)`` of the token grid, or ``None``.
        attn_head_dim: Optional per-head width overriding ``dim // num_heads``.
        xattn: Use the xformers memory-efficient kernel.
        rope: Optional rotary embedding callable.
        subln: Enable separate projections and the inner normalisation.
        norm_layer: Callable building the inner normalisation.
    """

    def __init__(
            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
            proj_drop=0., window_size=None, attn_head_dim=None, xattn=False, rope=None, subln=False, norm_layer=nn.LayerNorm):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.subln = subln
        if self.subln:
            self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
            self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
            self.v_proj = nn.Linear(dim, all_head_dim, bias=False)
        else:
            self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)

        # Biases live outside the Linear modules because only q and v get one.
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None

        if window_size:
            self.window_size = window_size
            # Three extra entries: cls to token & token 2 cls & cls to cls
            self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = nn.Parameter(
                torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH

            # get pair-wise relative position index for each token inside the window
            coords_h = torch.arange(window_size[0])
            coords_w = torch.arange(window_size[1])
            # "ij" is the historical default; stating it avoids the deprecation warning.
            coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"))  # 2, Wh, Ww
            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
            relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
            relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += window_size[1] - 1
            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
            relative_position_index = \
                torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
            relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
            # Row 0 / column 0 belong to the extra leading token.
            relative_position_index[0, 0:] = self.num_relative_distance - 3
            relative_position_index[0:, 0] = self.num_relative_distance - 2
            relative_position_index[0, 0] = self.num_relative_distance - 1

            self.register_buffer("relative_position_index", relative_position_index)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity()
        # self.proj = nn.Linear(all_head_dim, all_head_dim)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.xattn = xattn
        # Raw probability handed to the xformers kernel (it has no nn.Dropout module).
        self.xattn_drop = attn_drop

        self.rope = rope

    def forward(self, x, rel_pos_bias=None, attn_mask=None):
        """Attend over the token dimension of ``x``.

        Args:
            x: Tensor unpacked as ``(B, N, C)``.
            rel_pos_bias: Optional bias added to the attention logits
                (explicit softmax path only).
            attn_mask: Optional mask indexed as ``attn_mask[:, None, None, :]``;
                positions that are false after ``.bool()`` are excluded
                (explicit softmax path only).

        Returns:
            Tensor of shape ``(B, N, dim)``.
        """
        B, N, C = x.shape
        if self.subln:
            q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
            k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
            v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)

            q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)     # B, num_heads, N, C
            k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
            v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
        else:
            qkv_bias = None
            if self.q_bias is not None:
                # k has no learned bias, so its slot is filled with zeros.
                qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))

            qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
            qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)   # 3, B, num_heads, N, C
            q, k, v = qkv[0], qkv[1], qkv[2]

        if self.rope:
            # The first token is left untouched; only the remaining tokens are rotated.
            q_t = q[:, :, 1:, :]
            ro_q_t = self.rope(q_t)
            q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v)

            k_t = k[:, :, 1:, :]
            ro_k_t = self.rope(k_t)
            k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v)

        if self.xattn:
            q = q.permute(0, 2, 1, 3)   # B, num_heads, N, C -> B, N, num_heads, C
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)

            # The kernel applies dropout whenever p > 0, regardless of module
            # mode, so it must be disabled explicitly outside of training.
            x = xops.memory_efficient_attention(
                q, k, v,
                p=self.xattn_drop if self.training else 0.,
                scale=self.scale,
                )
            x = x.reshape(B, N, -1)
            x = self.inner_attn_ln(x)
            x = self.proj(x)
            x = self.proj_drop(x)
        else:
            q = q * self.scale
            attn = (q @ k.transpose(-2, -1))

            if self.relative_position_bias_table is not None:
                relative_position_bias = \
                    self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
                        self.window_size[0] * self.window_size[1] + 1,
                        self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
                relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
                attn = attn + relative_position_bias.unsqueeze(0).type_as(attn)

            if rel_pos_bias is not None:
                attn = attn + rel_pos_bias.type_as(attn)

            if attn_mask is not None:
                attn_mask = attn_mask.bool()
                attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf"))

            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)

            x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
            x = self.inner_attn_ln(x)
            x = self.proj(x)
            x = self.proj_drop(x)
        return x


class Block(nn.Module):
    """Transformer block: attention and feed-forward branches with residual connections.

    ``postnorm`` selects whether each normalisation is applied to a branch's
    output (true) or to its input (false).  When ``init_values`` is a positive
    number, each branch output is additionally multiplied by a learned
    per-channel scale (``gamma_1`` / ``gamma_2``) initialised to that value.
    ``naiveswiglu`` selects ``SwiGLU`` instead of ``Mlp`` for the feed-forward
    branch.  The remaining arguments are forwarded to ``Attention`` and to the
    feed-forward module.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 window_size=None, attn_head_dim=None, xattn=False, rope=None, postnorm=False,
                 subln=False, naiveswiglu=False):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            window_size=window_size,
            attn_head_dim=attn_head_dim,
            xattn=xattn,
            rope=rope,
            subln=subln,
            norm_layer=norm_layer,
        )
        # Stochastic depth on both residual branches; identity when disabled.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)

        ffn_hidden = int(dim * mlp_ratio)
        if naiveswiglu:
            self.mlp = SwiGLU(in_features=dim, hidden_features=ffn_hidden, subln=subln, norm_layer=norm_layer)
        else:
            # NOTE(review): unlike the SwiGLU branch, `norm_layer` is not passed
            # here, so Mlp falls back to its own default when subln is set.
            self.mlp = Mlp(in_features=dim, hidden_features=ffn_hidden, act_layer=act_layer, subln=subln, drop=drop)

        use_layer_scale = init_values is not None and init_values > 0
        if use_layer_scale:
            self.gamma_1 = nn.Parameter(init_values * torch.ones(dim), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones(dim), requires_grad=True)
        else:
            self.gamma_1 = None
            self.gamma_2 = None

        self.postnorm = postnorm

    def forward(self, x, rel_pos_bias=None, attn_mask=None):
        """Apply the attention branch and then the feed-forward branch to ``x``."""
        # Layer scale for both branches is keyed off gamma_1 alone.
        scaled = self.gamma_1 is not None

        if self.postnorm:
            branch = self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))
        else:
            branch = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)
        if scaled:
            branch = self.gamma_1 * branch
        x = x + self.drop_path(branch)

        if self.postnorm:
            branch = self.norm2(self.mlp(x))
        else:
            branch = self.mlp(self.norm2(x))
        if scaled:
            branch = self.gamma_2 * branch
        x = x + self.drop_path(branch)
        return x


class PatchEmbed(nn.Module):
    """Image to patch embedding.

    A strided convolution whose kernel and stride both equal the patch size
    turns an image into a sequence of patch embeddings.  ``img_size`` and
    ``patch_size`` are normalised with ``to_2tuple`` before use.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        rows = img_size[0] // patch_size[0]
        cols = img_size[1] // patch_size[1]

        self.patch_shape = (rows, cols)
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = cols * rows

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x, **kwargs):
        """Embed ``x`` (unpacked as ``B, C, H, W``); extra keyword arguments are ignored."""
        B, C, H, W = x.shape
        # FIXME look at relaxing size constraints
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        patches = self.proj(x)
        # Flatten the spatial grid and move it in front of the channel axis.
        return patches.flatten(2).transpose(1, 2)


class RelativePositionBias(nn.Module):
    """Learned relative position bias for a token grid plus one leading extra token.

    Args:
        window_size: ``(height, width)`` of the token grid.
        num_heads: Number of attention heads (one bias value per head).

    Calling the module returns a tensor of shape
    ``(num_heads, height*width + 1, height*width + 1)``.
    """

    def __init__(self, window_size, num_heads):
        super().__init__()
        win_h, win_w = window_size[0], window_size[1]
        self.window_size = window_size
        # One entry per relative offset, plus three for the extra token:
        # cls to token & token 2 cls & cls to cls
        self.num_relative_distance = (2 * win_h - 1) * (2 * win_w - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # Pair-wise relative offsets between all grid positions.
        grid = torch.stack(torch.meshgrid([torch.arange(win_h), torch.arange(win_w)]))  # 2, Wh, Ww
        flat = torch.flatten(grid, 1)  # 2, Wh*Ww
        offsets = (flat[:, :, None] - flat[:, None, :]).permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        offsets[:, :, 0] += win_h - 1  # shift to start from 0
        offsets[:, :, 1] += win_w - 1
        offsets[:, :, 0] *= 2 * win_w - 1  # row-major flattening of (dh, dw)

        side = win_h * win_w + 1
        index = torch.zeros(size=(side, side), dtype=offsets.dtype)
        index[1:, 1:] = offsets.sum(-1)  # Wh*Ww, Wh*Ww
        # Row 0 / column 0 belong to the extra leading token.
        index[0, 0:] = self.num_relative_distance - 3
        index[0:, 0] = self.num_relative_distance - 2
        index[0, 0] = self.num_relative_distance - 1

        self.register_buffer("relative_position_index", index)

    def forward(self):
        """Gather the bias table into a ``(num_heads, L, L)`` tensor."""
        side = self.window_size[0] * self.window_size[1] + 1
        gathered = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        bias = gathered.view(side, side, -1)  # Wh*Ww,Wh*Ww,nH
        return bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww


class EVAVisionTransformer(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage

    The model embeds an image into patches, prepends a learned class token,
    optionally adds absolute position embeddings, and runs a stack of
    ``Block`` modules.  Note that ``forward_features`` deliberately skips the
    final block, so the returned features come from the second-to-last block.

    Args (beyond the usual ViT hyper-parameters):
        use_abs_pos_emb: Create a learned absolute position embedding.
        use_rel_pos_bias: Give every block its own relative position bias table.
        use_shared_rel_pos_bias: Create one ``RelativePositionBias`` shared by all blocks.
        rope: Create a ``VisionRotaryEmbeddingFast`` shared by all blocks.
        use_mean_pooling: Pool by averaging tokens (through ``fc_norm``) instead
            of taking the first token after ``norm``.
        init_scale: Factor applied to the classifier head's initial weights and bias.
        grad_checkpointing: Run blocks through ``checkpoint``.
        patch_dropout: Probability handed to ``PatchDropout``; 0 disables it.
        pt_hw_seq_len / intp_freq: Forwarded to the rotary embedding.
        xattn / postnorm / naiveswiglu / subln: Forwarded to every ``Block``.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, patch_dropout=0.,
                 use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, rope=False,
                 use_mean_pooling=True, init_scale=0.001, grad_checkpointing=False, xattn=False, postnorm=False,
                 pt_hw_seq_len=16, intp_freq=False, naiveswiglu=False, subln=False):
        super().__init__()
        self.image_size = img_size
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        if use_abs_pos_emb:
            # One position per patch plus one for the class token.
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        else:
            self.pos_embed = None
        self.pos_drop = nn.Dropout(p=drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
        else:
            self.rel_pos_bias = None

        if rope:
            half_head_dim = embed_dim // num_heads // 2
            hw_seq_len = img_size // patch_size
            self.rope = VisionRotaryEmbeddingFast(
                dim=half_head_dim,
                pt_seq_len=pt_hw_seq_len,
                ft_seq_len=hw_seq_len if intp_freq else None,
                # patch_dropout=patch_dropout
            )
        else:
            self.rope = None

        self.naiveswiglu = naiveswiglu

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.use_rel_pos_bias = use_rel_pos_bias
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None,
                xattn=xattn, rope=self.rope, postnorm=postnorm, subln=subln, naiveswiglu=naiveswiglu)
            for i in range(depth)])
        self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
        self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)

        trunc_normal_(self.cls_token, std=.02)
        # trunc_normal_(self.mask_token, std=.02)

        self.apply(self._init_weights)
        self.fix_init_weight()

        if isinstance(self.head, nn.Linear):
            trunc_normal_(self.head.weight, std=.02)
            self.head.weight.data.mul_(init_scale)
            self.head.bias.data.mul_(init_scale)

        # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
        self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()

        self.grad_checkpointing = grad_checkpointing

    def fix_init_weight(self):
        """Divide each block's output projections by ``sqrt(2 * layer_number)`` (1-based)."""
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            if self.naiveswiglu:
                rescale(layer.mlp.w3.weight.data, layer_id + 1)
            else:
                rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def get_cast_dtype(self) -> torch.dtype:
        """Return the dtype of the first block's feed-forward output projection."""
        mlp = self.blocks[0].mlp
        # SwiGLU names its output projection `w3`; Mlp names it `fc2`.
        out_proj = mlp.w3 if self.naiveswiglu else mlp.fc2
        return out_proj.weight.dtype

    def _init_weights(self, m):
        """Per-module initialiser used with ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        """Return the number of transformer blocks."""
        return len(self.blocks)

    def lock(self, unlocked_groups=0, freeze_bn_stats=False):
        """Freeze every parameter; partial unlocking is not supported."""
        assert unlocked_groups == 0, 'partial locking not currently supported for this model'
        for param in self.parameters():
            param.requires_grad = False

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        """Replace the head with a fresh linear layer (identity when ``num_classes`` <= 0)."""
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x, return_all_features=False):
        """Encode ``x`` with every block except the last one.

        Args:
            x: Image batch accepted by ``self.patch_embed``.
            return_all_features: When true, return the full token sequence
                (class token first) without the final norm or pooling.

        Returns:
            The token sequence, or the pooled feature vector when
            ``return_all_features`` is false.
        """
        x = self.patch_embed(x)
        batch_size, seq_len, _ = x.size()

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
        if os.getenv('RoPE') == '1':
            # The kept-patch indices are bound into the rotary embedding's
            # forward so it can select the matching frequencies.
            if self.training and not isinstance(self.patch_dropout, nn.Identity):
                x, patch_indices_keep = self.patch_dropout(x)
                self.rope.forward = partial(self.rope.forward, patch_indices_keep=patch_indices_keep)
            else:
                self.rope.forward = partial(self.rope.forward, patch_indices_keep=None)
                x = self.patch_dropout(x)
        else:
            x = self.patch_dropout(x)

        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for i, blk in enumerate(self.blocks):
            # The last block is intentionally not executed.
            if i == len(self.blocks)-1:
                continue
            if self.grad_checkpointing:
                # Pass the bias itself: wrapping it in a tuple would hand the
                # block a tuple where it expects a tensor or None.
                x = checkpoint(blk, x, rel_pos_bias)
            else:
                x = blk(x, rel_pos_bias=rel_pos_bias)

        if not return_all_features:
            x = self.norm(x)
            if self.fc_norm is not None:
                return self.fc_norm(x.mean(1))
            else:
                return x[:, 0]
        return x

    def forward(self, x, return_all_features=False):
        """Return token features when ``return_all_features`` is set, else head outputs."""
        if return_all_features:
            return self.forward_features(x, return_all_features)
        x = self.forward_features(x)
        x = self.head(x)
        return x

class LayerNorm(nn.LayerNorm):
    """``nn.LayerNorm`` variant whose output is cast back to the input's dtype."""

    def forward(self, x: torch.Tensor):
        normalized = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        # Return in the caller's dtype regardless of what layer_norm produced.
        return normalized.to(x.dtype)

# Prefer apex's fused LayerNorm; fall back to the local LayerNorm when apex
# cannot be imported for any ordinary reason (missing package, broken build).
try:
    from apex.normalization import FusedLayerNorm
except Exception:
    # `Exception` rather than a bare `except`, so KeyboardInterrupt and
    # SystemExit raised during import are not swallowed.
    FusedLayerNorm = LayerNorm
    print("Please build and install Nvidia apex package with option '--cuda_ext' according to https://github.com/NVIDIA/apex#from-source .")


@dataclass
class CLIPVisionCfg:
    """Configuration for the vision tower built by ``_build_vision_tower``.

    ``_build_vision_tower`` reads every field below except the ``timm_*``
    ones, which are not used anywhere in the code visible in this section.
    """
    # Passed as the transformer depth.
    layers: Union[Tuple[int, int, int, int], int] = 12
    # Passed as the transformer embedding size.
    width: int = 768
    # The number of attention heads is derived as `width // head_width`.
    head_width: int = 64
    mlp_ratio: float = 4.0
    patch_size: int = 16
    image_size: Union[Tuple[int, int], int] = 224
    ls_init_value: Optional[float] = None  # layer scale initial value
    patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
    global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
    drop_path_rate: Optional[float] = None  # drop path rate
    timm_model_name: str = None  # a valid model name overrides layers, width, patch_size
    timm_model_pretrained: bool = False  # use (imagenet) pretrained weights for named model
    timm_pool: str = 'avg'  # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
    timm_proj: str = 'linear'  # linear projection for timm model output ('linear', 'mlp', '')
    timm_proj_bias: bool = False  # enable bias final projection
    # The EVA transformer is only built when this is truthy.
    eva_model_name: str = None # a valid eva model name overrides layers, width, patch_size
    qkv_bias: bool = True
    # Selects FusedLayerNorm instead of the local LayerNorm as the norm layer.
    fusedLN: bool = False
    # The remaining flags are forwarded unchanged to EVAVisionTransformer.
    xattn: bool = False
    postnorm: bool = False
    rope: bool = False
    pt_hw_seq_len: int = 16   # 224/14
    intp_freq: bool = False
    naiveswiglu: bool = False
    subln: bool = False


def _build_vision_tower(
        embed_dim: int,
        vision_cfg: CLIPVisionCfg
):
    """Build an ``EVAVisionTransformer`` from a vision configuration.

    Args:
        embed_dim: Output size of the transformer's head (passed as ``num_classes``).
        vision_cfg: A ``CLIPVisionCfg`` or a dict of its fields.

    Returns:
        The constructed ``EVAVisionTransformer``.

    Raises:
        ValueError: If ``vision_cfg.eva_model_name`` is empty; only EVA models
            can be built by this function.
    """
    if isinstance(vision_cfg, dict):
        vision_cfg = CLIPVisionCfg(**vision_cfg)

    if not vision_cfg.eva_model_name:
        # Previously this fell through to `return visual` with `visual` unbound.
        raise ValueError(
            "_build_vision_tower only supports EVA models; "
            "`vision_cfg.eva_model_name` must be set.")

    vision_heads = vision_cfg.width // vision_cfg.head_width
    norm_cls = FusedLayerNorm if vision_cfg.fusedLN else LayerNorm
    # The config default is None, which the transformer cannot turn into a
    # stochastic-depth schedule; treat it as "disabled".
    drop_path_rate = vision_cfg.drop_path_rate if vision_cfg.drop_path_rate is not None else 0.

    visual = EVAVisionTransformer(
        img_size=vision_cfg.image_size,
        patch_size=vision_cfg.patch_size,
        num_classes=embed_dim,
        use_mean_pooling=vision_cfg.global_average_pool, #False
        init_values=vision_cfg.ls_init_value,
        patch_dropout=vision_cfg.patch_dropout,
        embed_dim=vision_cfg.width,
        depth=vision_cfg.layers,
        num_heads=vision_heads,
        mlp_ratio=vision_cfg.mlp_ratio,
        qkv_bias=vision_cfg.qkv_bias,
        drop_path_rate=drop_path_rate,
        norm_layer=partial(norm_cls, eps=1e-6),
        xattn=vision_cfg.xattn,
        rope=vision_cfg.rope,
        postnorm=vision_cfg.postnorm,
        pt_hw_seq_len=vision_cfg.pt_hw_seq_len,   # 224/14
        intp_freq=vision_cfg.intp_freq,
        naiveswiglu=vision_cfg.naiveswiglu,
        subln=vision_cfg.subln
    )

    return visual

class Eva2LargeEncoder(nn.Module):
    """EVA vision encoder with a fixed large configuration and a configurable image size.

    The configuration is kept on ``self.config`` and the built transformer on
    ``self.model``.
    """

    def __init__(self, image_size=224):
        super(Eva2LargeEncoder, self).__init__()
        vision_cfg = {
            # Caller-supplied resolution; every other value is fixed.
            "image_size": image_size,
            "layers": 24,
            "width": 1024,
            "drop_path_rate": 0,
            "head_width": 64,
            "mlp_ratio": 2.6667,
            "patch_size": 14,
            "eva_model_name": "eva-clip-l-14-336",
            "xattn": True,
            "fusedLN": True,
            "rope": True,
            "pt_hw_seq_len": 16,
            "intp_freq": True,
            "naiveswiglu": True,
            "subln": True
        }
        self.config = {"embed_dim": 768, "vision_cfg": vision_cfg}
        self.model = _build_vision_tower(**self.config)

    def forward(self, image, **kwargs): # diverge from hf version
        """Return the token features of ``image`` with the first token removed."""
        tokens = self.model(image, return_all_features=True)
        return tokens[:, 1:, :]

class CrossVisionModel(nn.Module):
    """``Eva2LargeEncoder`` followed by a learned additive position embedding.

    The embedding has one row per patch of the
    ``(image_size // patch_size) ** 2`` grid and ``width`` columns, and is
    initialised to zeros.
    """

    def __init__(self, config):
        super().__init__()
        self.vit = Eva2LargeEncoder(image_size=config.cross_image_size)
        vision_cfg = self.vit.config['vision_cfg']
        grid_side = vision_cfg['image_size'] // vision_cfg['patch_size']
        self.pos_embed = nn.Parameter(torch.zeros(grid_side ** 2, vision_cfg['width']))

    def forward(self, images):
        """Encode ``images`` and add the position embedding, broadcast over the batch."""
        encoded = self.vit(images)
        return encoded + self.pos_embed.unsqueeze(0)


================================================
FILE: utils/models/eva_clip_model.py
================================================
import torch
from sat.model.base_model import BaseModel
from sat.model.mixins import BaseMixin
from sat.model.official.vit_model import ViTProperty, ImagePatchEmbeddingMixin, InterpolatedPositionEmbeddingMixin, gelu
from sat import mpu

class IdentityMixin(BaseMixin):
    """Final-forward hook that returns the logits without their first sequence position."""

    def __init__(self):
        super(IdentityMixin, self).__init__()

    def final_forward(self, logits, **kwargs):
        # Drop position 0 along dim 1 — presumably the [cls] token, given the
        # model's default pre_len of 1; confirm against the caller.
        trimmed = logits[:, 1:]
        return trimmed

import xformers.ops as xops
class XAttn(BaseMixin):
    """Mixin routing attention through ``xops.memory_efficient_attention``.

    Args:
        head_dim: Per-head width; the attention scale is ``head_dim ** -0.5``.
    """

    def __init__(self, head_dim):
        super().__init__()
        self.scale = head_dim ** -0.5

    def attention_fn(self, query_layer, key_layer, value_layer, attention_mask,
                       attention_dropout=None, log_attention_weights=None, scaling_attention_score=True, **kwargs):
        """Compute attention with the xformers kernel.

        Only the query/key/value tensors are used; the mask, dropout and
        remaining arguments are accepted but ignored.
        """
        # B, num_heads, N, C -> B, N, num_heads, C
        q, k, v = (t.permute(0, 2, 1, 3) for t in (query_layer, key_layer, value_layer))

        # xformers does not support dropout for eva hidden size
        return xops.memory_efficient_attention(q, k, v, p=0., scale=self.scale)

    def attention_forward(self, hidden_states, mask, **kw_args):
        """Run the attention sub-layer selected by ``kw_args['layer_id']``."""
        # Operate on the transformer layer's attention module, not on the mixin.
        attn = self.transformer.layers[kw_args['layer_id']].attention
        attention_fn = attn.hooks['attention_fn']

        fused_qkv = attn.query_key_value(hidden_states)

        batch, seq_len, _ = hidden_states.shape
        # -> 3, B, num_heads, N, C
        fused_qkv = fused_qkv.reshape(
            batch, seq_len, 3, attn.num_attention_heads_per_partition, -1).permute(2, 0, 3, 1, 4)
        query_layer = fused_qkv[0]
        key_layer = fused_qkv[1]
        value_layer = fused_qkv[2]

        dropout_fn = None
        if attn.training:
            dropout_fn = attn.attention_dropout

        context_layer = attention_fn(query_layer, key_layer, value_layer, mask, dropout_fn, **kw_args)

        output = attn.dense(context_layer.view(batch, seq_len, -1))
        if attn.training:
            output = attn.output_dropout(output)
        return output

class NewLayerForward(BaseMixin):
    """Layer-forward hook that normalises each sub-layer's output before the residual add."""

    def __init__(self):
        super().__init__()

    def layer_forward(self, hidden_states, mask, *args, **kw_args):
        '''
            hidden_states: [batch, seq_len, hidden_size]
            mask: [(1, 1), seq_len, seq_len]

            Runs the layer selected by kw_args['layer_id']: attention, its
            layernorm, optional drop-path and residual add, then the same
            sequence for the MLP.
        '''
        layer = self.transformer.layers[kw_args['layer_id']]

        def apply_drop_path(branch):
            # Active only in training with a positive drop_path and an
            # available model-parallel rng tracker; otherwise a no-op.
            if not (layer.training and layer.drop_path > 0.):
                return branch
            if mpu.get_cuda_rng_tracker is None:
                return branch
            # drop_path must use model parallel rng tracker
            # the tracker is initialized as seed of `seed + model_parallel_rank`
            # deepspeed act-ckpt record the model parallel tracker states
            with mpu.get_cuda_rng_tracker().fork():
                # drop_path percentage 0, others 1/(1-p)
                noise = torch.rand((branch.shape[0],), dtype=branch.dtype, device=branch.device)
                random_tensor = (1-layer.drop_path + noise).floor_() / (1-layer.drop_path)
                return random_tensor.view(-1, 1, 1) * branch

        # Self attention, normalised after the sub-layer.
        attention_output = layer.input_layernorm(layer.attention(hidden_states, mask, **kw_args))
        attention_output = apply_drop_path(attention_output)

        # Residual connection.
        mlp_input = hidden_states + attention_output

        # MLP, normalised after the sub-layer.
        mlp_output = layer.post_attention_layernorm(layer.mlp(mlp_input, **kw_args))
        mlp_output = apply_drop_path(mlp_output)

        # Second residual connection.
        return mlp_input + mlp_output

class EVA2CLIPModel(BaseModel):
    """``BaseModel`` configured as an EVA2-CLIP vision transformer through mixins.

    The constructor derives ``args.max_sequence_length`` from the ViT
    property (prefix tokens + patches + suffix tokens), defaults the
    activation to ``gelu``, and registers the patch-embedding,
    position-embedding, final-output, layer-forward and xformers-attention
    mixins.
    """

    def __init__(self, args, transformer=None, **kwargs):
        vit_property = ViTProperty(args.image_size, args.patch_size, args.pre_len, args.post_len)
        args.max_sequence_length = vit_property.pre_len + vit_property.num_patches + vit_property.post_len
        kwargs.setdefault('activation_func', gelu)
        super().__init__(args, transformer=transformer, **kwargs)
        self.transformer.property = vit_property

        head_dim = args.hidden_size // args.num_attention_heads
        self.add_mixin("patch_embedding", ImagePatchEmbeddingMixin(args.in_channels, args.hidden_size, vit_property))
        self.add_mixin("pos_embedding", InterpolatedPositionEmbeddingMixin())
        self.add_mixin("final", IdentityMixin())
        self.add_mixin("newpost", NewLayerForward())
        self.add_mixin("xattn", XAttn(head_dim))

    @classmethod
    def add_model_specific_args(cls, parser):
        """Register the EVA2CLIP command-line options on ``parser`` and return it."""
        group = parser.add_argument_group('EVA2CLIP', 'EVA2CLIP Configurations')
        add = group.add_argument
        add('--image-size', nargs='+', type=int, default=[224, 224])
        add('--pre-len', type=int, default=1) # [cls] by default
        add('--post-len', type=int, default=0) # empty by default, but sometimes with special tokens, such as [det] in yolos.
        add('--in-channels', type=int, default=3)
        add('--patch-size', type=int, default=16)
        return parser


================================================
FILE: utils/models/mixin.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from sat.transformer_defaults import attention_fn_default
from sat.model.base_model import BaseMixin, non_conflict
from sat.mpu.layers import ColumnParallelLinear, RowParallelLinear
from sat.mpu.utils import split_tensor_along_last_dim
from sat import mpu


class LlamaVisionExpertFCMixin(BaseMixin):
    """MLP mixin that adds a gate projection per layer and a separate "vision expert" MLP.

    For every transformer layer it owns a gate projection (``gate_proj``) that is
    combined with the layer's own ``dense_h_to_4h`` / ``dense_4h_to_h``. For the
    layers listed in ``vision_layer_range`` it additionally owns a full set of
    vision-specific projections, which are applied to the positions selected by
    ``vision_expert_mask`` during ``mlp_forward``.
    """

    def __init__(self, in_features, hidden_features, num_layers=32, num_vision_layers=0, vision_layer_range=None,
                 params_dtype=torch.float, device=torch.device('cpu')):
        """Create the gate projections and the per-layer vision expert projections.

        Args:
            in_features: input/output width of the MLP.
            hidden_features: inner width of the MLP.
            num_layers: number of transformer layers (one ``gate_proj`` per layer).
            num_vision_layers: used only to derive the default ``vision_layer_range``.
            vision_layer_range: layer ids that get a vision expert; defaults to the
                first ``min(num_vision_layers, num_layers)`` layers.
            params_dtype: dtype forwarded to the parallel linear layers.
            device: device forwarded to the parallel linear layers.
        """
        super().__init__()

        self.num_layers = num_layers
        self.num_vision_layers = num_vision_layers
        if vision_layer_range is None:
            vision_layer_range = [i for i in range(min(num_vision_layers, num_layers))]
        self.vision_layer_range = vision_layer_range
        # One gate projection per transformer layer, indexed by integer layer id.
        self.gate_proj = nn.ModuleList([ColumnParallelLinear(
            in_features,
            hidden_features,
            gather_output=False,
            init_method=None,
            bias=False,
            params_dtype=params_dtype,
            module=self,
            name="dense_h_to_4h_gate",
            skip_init=True,
            device=device
        ) for i in range(num_layers)])
        # Trainable vision expert parameters
        vision_dense_h_to_4h_list = []
        vision_dense_4h_to_h_list = []
        gate_proj_list = []


        for i in vision_layer_range:
            vision_dense_h_to_4h = ColumnParallelLinear(
                in_features,
                hidden_features,
                gather_output=False,
                init_method=None,
                bias=False,
                params_dtype=params_dtype,
                module=self,
                name="vision_dense_h_to_4h",
                skip_init=True,
                device=device
            )

            # Project back to h.
            vision_dense_4h_to_h = RowParallelLinear(
                hidden_features,
                in_features,
                input_is_parallel=True,
                init_method=None,
                bias=False,
                params_dtype=params_dtype,
                module=self,
                name="vision_dense_4h_to_h",
                skip_init=True,
                device=device
            )

            gate_proj = ColumnParallelLinear(
                in_features,
                hidden_features,
                gather_output=False,
                init_method=None,
                bias=False,
                params_dtype=params_dtype,
                module=self,
                name="vision_gate_proj",
                skip_init=True,
                device=device
            )

            vision_dense_h_to_4h_list.append(vision_dense_h_to_4h)
            vision_dense_4h_to_h_list.append(vision_dense_4h_to_h)
            gate_proj_list.append(gate_proj)

        # The vision experts are stored in ModuleDicts keyed by str(layer_id), so an
        # arbitrary (possibly non-contiguous) vision_layer_range can be looked up directly.
        self.vision_dense_h_to_4h_list = nn.ModuleDict([
            (str(layer_id), vision_dense_h_to_4h)
            for layer_id, vision_dense_h_to_4h in zip(vision_layer_range, vision_dense_h_to_4h_list)
        ])
        self.vision_dense_4h_to_h_list = nn.ModuleDict([
            (str(layer_id), vision_dense_4h_to_h)
            for layer_id, vision_dense_4h_to_h in zip(vision_layer_range, vision_dense_4h_to_h_list)
        ])
        self.vision_gate_proj = nn.ModuleDict([
            (str(layer_id), gate_proj)
            for layer_id, gate_proj in zip(vision_layer_range, gate_proj_list)
        ])

    def mlp_forward(self, hidden_states, **kw_args):
        """Gated MLP forward that routes masked positions through the vision expert.

        Args:
            hidden_states: input tensor of the MLP for layer ``kw_args['layer_id']``.
            **kw_args: must contain ``layer_id``; may contain ``vision_expert_mask``,
                which is used as a boolean index into ``hidden_states``.

        Returns:
            A contiguous tensor with the MLP output.
        """
        # Keep a handle on the mixin, then rebind ``self`` to the layer's own MLP so
        # that ``self.dense_h_to_4h`` etc. below refer to that layer's language weights.
        mixin_self = self
        self = self.transformer.layers[kw_args['layer_id']].mlp
        if "vision_expert_mask" in kw_args:
            vision_expert_mask = kw_args['vision_expert_mask']
        else:
            vision_expert_mask = None

        layer_id_key = str(int(kw_args['layer_id']))

        # The expert path is only taken when this layer has a vision expert and at
        # least one position is flagged in the mask.
        if kw_args['layer_id'] in mixin_self.vision_layer_range and (vision_expert_mask is not None) and vision_expert_mask.any():
            vision_dense_h_to_4h = mixin_self.vision_dense_h_to_4h_list[layer_id_key]
            vision_dense_4h_to_h = mixin_self.vision_dense_4h_to_h_list[layer_id_key]
            vision_gate_proj = mixin_self.vision_gate_proj[layer_id_key]
            # Output buffer; every position is written by exactly one of the two paths below.
            output = torch.empty(hidden_states.shape, dtype=hidden_states.dtype, device=hidden_states.device)

            # Language path: positions where the mask is false use the layer's own weights.
            language_hidden_state = hidden_states[~vision_expert_mask.bool()]
            language_intermediate_parallel = self.activation_func(mixin_self.gate_proj[kw_args['layer_id']](language_hidden_state)) * self.dense_h_to_4h(language_hidden_state)
            output[~vision_expert_mask.bool()] = self.dense_4h_to_h(language_intermediate_parallel)  # language_output

            # Vision path: positions where the mask is true use the vision expert weights.
            vision_hidden_state = hidden_states[vision_expert_mask.bool()]
            vision_intermediate_parallel = vision_dense_h_to_4h(vision_hidden_state)
            gate_output = vision_gate_proj(vision_hidden_state)

            vision_intermediate_parallel *= self.activation_func(gate_output)
            output[vision_expert_mask.bool()] = vision_dense_4h_to_h(vision_intermediate_parallel)  # vision_output
        else:
            # No vision expert for this layer/input: all positions take the language path.
            intermediate_parallel = self.activation_func(mixin_self.gate_proj[kw_args['layer_id']](hidden_states)) * self.dense_h_to_4h(hidden_states)
            output = self.dense_4h_to_h(intermediate_parallel)

        return output.contiguous()

    def copy_param(self):
        """Initialise each vision expert from the matching language weights of the same layer."""
        with torch.no_grad():
            for i in self.vision_layer_range:
                self.vision_gate_proj[str(i)].weight.data.copy_(self.gate_proj[i].weight.data)
                self.vision_dense_4h_to_h_list[str(i)].weight.data.copy_(self.transformer.layers[i].mlp.dense_4h_to_h.weight.data)
                self.vision_dense_h_to_4h_list[str(i)].weight.data.copy_(self.transformer.layers[i].mlp.dense_h_to_4h.weight.data)

from sat.mpu import get_model_parallel_world_size
from sat.mpu.utils import divide
from sat.model.position_embedding.triton_rotary_embeddings import FastRotaryEmbedding

class LlamaVisionExpertAttnMixin(BaseMixin):
    """Attention mixin with rotary embeddings and optional per-layer vision expert projections.

    When ``use_vision_expert`` is true, each layer in ``vision_layer_range`` owns a
    vision-specific fused QKV projection and output projection. During
    ``attention_forward`` the positions selected by ``vision_expert_mask`` are
    projected with those weights, all other positions with the layer's own weights.
    """

    def __init__(self, hidden_size, num_heads, num_layers=28, num_vision_layers=0, use_vision_expert=True, vision_layer_range=None,
                 params_dtype=torch.float, device=torch.device('cpu')):
        """Create the rotary embedding and, if enabled, the vision expert projections.

        Args:
            hidden_size: model width.
            num_heads: number of attention heads (must divide ``hidden_size``).
            num_layers: number of transformer layers.
            num_vision_layers: used only to derive the default ``vision_layer_range``.
            use_vision_expert: when false no expert projections are created.
            vision_layer_range: layer ids that get a vision expert; defaults to the
                first ``min(num_vision_layers, num_layers)`` layers.
            params_dtype: dtype forwarded to the parallel linear layers.
            device: device forwarded to the parallel linear layers.
        """
        super().__init__()

        world_size = get_model_parallel_world_size()
        self.hidden_size = hidden_size
        self.num_attention_heads = num_heads
        self.hidden_size_per_attention_head = divide(hidden_size, num_heads)
        self.num_attention_heads_per_partition = divide(num_heads, world_size)
        self.inner_hidden_size = num_heads * self.hidden_size_per_attention_head

        self.rotary_emb = FastRotaryEmbedding(
            hidden_size // num_heads, pos_idx_in_fp32=False
        )

        self.num_vision_layers = num_vision_layers
        self.num_layers = num_layers
        if vision_layer_range is None:
            vision_layer_range = [i for i in range(min(num_vision_layers, num_layers))]
        self.vision_layer_range = vision_layer_range

        self.use_vision_expert = use_vision_expert
        # Trainable vision expert parameters

        if self.use_vision_expert:
            vision_query_key_value_list = []
            vision_dense_list = []
            for i in vision_layer_range:
                vision_query_key_value = ColumnParallelLinear(
                    hidden_size,
                    3 * hidden_size,
                    stride=3,
                    gather_output=False,
                    init_method=None,
                    bias=False,
                    params_dtype=params_dtype,
                    module=self,
                    name="vision_query_key_value",
                    skip_init=True,
                    device=device
                )

                vision_dense = RowParallelLinear(
                    self.inner_hidden_size,
                    hidden_size,
                    input_is_parallel=True,
                    init_method=None,
                    bias=False,
                    params_dtype=params_dtype,
                    module=self,
                    name="vision_dense",
                    skip_init=True,
                    device=device,
                    final_bias=False
                )

                vision_query_key_value_list.append(vision_query_key_value)
                vision_dense_list.append(vision_dense)

            # Keyed by str(layer_id) so a non-contiguous vision_layer_range can be looked up directly.
            self.vision_query_key_value_list = nn.ModuleDict([
                (str(layer_id), vision_query_key_value)
                for layer_id, vision_query_key_value in zip(vision_layer_range, vision_query_key_value_list)
            ])
            self.vision_dense_list = nn.ModuleDict([
                (str(layer_id), vision_dense)
                for layer_id, vision_dense in zip(vision_layer_range, vision_dense_list)
            ])

    def attention_forward(self, hidden_states, mask, **kw_args):
        """Self-attention forward with rotary embeddings and optional vision expert routing.

        Args:
            hidden_states: input tensor of the attention block for ``kw_args['layer_id']``.
            mask: attention mask forwarded to the attention function.
            **kw_args: must contain ``layer_id`` and ``position_ids``; may contain
                ``vision_expert_mask``, used as a boolean index into ``hidden_states``.

        Returns:
            A contiguous tensor with the attention output.
        """
        # Keep a handle on the mixin, then rebind ``self`` to the layer's own attention
        # module so that ``self.query_key_value`` / ``self.dense`` are the language weights.
        mixin_self = self
        self = self.transformer.layers[kw_args['layer_id']].attention
        attention_fn = attention_fn_default
        if 'attention_fn' in self.hooks:
            attention_fn = self.hooks['attention_fn']
        if "vision_expert_mask" in kw_args:
            vision_expert_mask = kw_args['vision_expert_mask']
        else:
            vision_expert_mask = None

        layer_id_key = str(int(kw_args['layer_id']))
        # Decide once whether this call routes through the vision expert; the same
        # decision is needed for both the QKV projection and the output projection.
        use_expert = bool(
            mixin_self.use_vision_expert
            and kw_args['layer_id'] in mixin_self.vision_layer_range
            and (vision_expert_mask is not None)
            and vision_expert_mask.any()
        )
        if use_expert:
            vision_mask = vision_expert_mask.bool()
            language_mask = ~vision_mask

        if use_expert:
            shape = list(hidden_states.shape)
            parallel_size = mpu.get_model_parallel_world_size()
            # Width of the fused QKV output held by this model-parallel partition.
            shape[-1] = shape[-1] * 3 // parallel_size
            vision_query_key_value = mixin_self.vision_query_key_value_list[layer_id_key]
            mixed_raw_layer = torch.empty(shape, dtype=hidden_states.dtype, device=hidden_states.device)
            language_hidden_states = hidden_states[language_mask]
            vision_hidden_states = hidden_states[vision_mask]
            mixed_raw_layer[language_mask] = self.query_key_value(
                language_hidden_states)  # language_mixed_raw_layer
            mixed_raw_layer[vision_mask] = vision_query_key_value(
                vision_hidden_states)  # vision_mixed_raw_layer
        else:
            mixed_raw_layer = self.query_key_value(hidden_states)

        (mixed_query_layer,
            mixed_key_layer,
            mixed_value_layer) = split_tensor_along_last_dim(mixed_raw_layer, 3)

        dropout_fn = self.attention_dropout if self.training else None

        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)

        query_layer, key_layer = mixin_self.rotary_emb(query_layer,key_layer, kw_args['position_ids'], max_seqlen=kw_args['position_ids'].max()+1, layer_id=kw_args['layer_id'])

        context_layer = attention_fn(query_layer, key_layer, value_layer, mask, dropout_fn, **kw_args)

        # Merge the head dimensions back into a single hidden dimension per partition.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
        context_layer = context_layer.view(*new_context_layer_shape)

        if use_expert:
            vision_dense = mixin_self.vision_dense_list[layer_id_key]
            parallel_size = mpu.get_model_parallel_world_size()
            target_shape = context_layer.shape[:-1] + (context_layer.shape[-1] * parallel_size,)
            output = torch.empty(target_shape, dtype=hidden_states.dtype, device=hidden_states.device)
            output[language_mask] = self.dense(context_layer[language_mask])  # language
            output[vision_mask] = vision_dense(context_layer[vision_mask])  # vision
        else:
            output = self.dense(context_layer)

        if self.training:
            output = self.output_dropout(output)
        return output.contiguous()

    def copy_param(self):
        """Initialise each vision expert from the matching language weights of the same layer.

        Does nothing when the mixin was built with ``use_vision_expert=False``,
        because no expert projections exist in that case.
        """
        if not self.use_vision_expert:
            return
        with torch.no_grad():
            for i in self.vision_layer_range:
                self.vision_query_key_value_list[str(i)].weight.data.copy_(self.transformer.layers[i].attention.query_key_value.weight.data)
                self.vision_dense_list[str(i)].weight.data.copy_(self.transformer.layers[i].attention.dense.weight.data)

================================================
FILE: utils/split_dataset.py
================================================
import os
import shutil

def find_all_files(path, suffix=".jpg"):
    """Recursively collect the paths of all files under ``path`` whose name ends with ``suffix``.

    Symbolic links to directories are followed. The number of matches is printed.
    """
    target_files = [
        os.path.join(cur_dir, name)
        for cur_dir, _, names in os.walk(path, followlinks=True)
        for name in names
        if name.endswith(suffix)
    ]
    print(f'find {len(target_files)} files...')
    return target_files

# Split every .jpg found under ./archive into train/valid/test folders under
# ./archive_split. Files are MOVED (not copied) and are stored flat, by base name.
all_files = find_all_files('archive')
for split_name in ("train", "valid", "test"):
    # makedirs also creates the parent "archive_split" directory when needed.
    os.makedirs(os.path.join("archive_split", split_name), exist_ok=True)

import random
# Fixed seed so the split is reproducible across runs.
random.seed(2023)
random.shuffle(all_files)
train = all_files[:8000]
valid = all_files[8000:8000+500]
test = all_files[8000+500:8000+500+1500]

for split_name, split_files in (("train", train), ("valid", valid), ("test", test)):
    print(f"building {split_name}")
    for file in split_files:
        # os.path.basename handles the platform's path separator; os.walk yields
        # os.sep-separated paths, so splitting on "/" is wrong on Windows.
        shutil.move(file, os.path.join("archive_split", split_name, os.path.basename(file)))
print("done")

================================================
FILE: utils/utils/__init__.py
================================================
from .chat import chat
from .language import llama2_tokenizer, llama2_text_processor, llama2_text_processor_inference
from .vision import get_image_processor
from .grounding_parser import parse_response
from .dataset import ItemDataset

================================================
FILE: utils/utils/chat.py
================================================
# -*- encoding: utf-8 -*-
'''
@File    :   chat.py
@Time    :   2023/05/08 19:10:08
@Author  :   Ming Ding 
@Contact :   dm18@mails.tsinghua.edu.cn
'''

from typing import Optional, Tuple, Union, List, Callable, Dict, Any
import requests
from PIL import Image
from io import BytesIO

import torch
from sat.generation.autoregressive_sampling import filling_sequence, stream_filling_sequence, get_masks_and_position_ids_default
from sat.generation.sampling_strategies import BaseStrategy, BeamSearchStrategy
from sat.mpu import get_model_parallel_rank

def process_image(image_path, img_processor, cross_img_processor, image):
    """Load (if needed) and preprocess an image.

    Args:
        image_path: local path or ``http``-prefixed URL; only used when ``image`` is None.
        img_processor: callable applied to the RGB PIL image.
        cross_img_processor: optional second callable applied to the RGB PIL image.
        image: either None, a PIL image, or any other object (for example a cached
            result of a previous call), which is returned unchanged.

    Returns:
        ``(img_dict, pil_img, cross_img_dict)`` when a PIL image was loaded or passed
        in (``cross_img_dict`` is ``{}`` without a cross processor); otherwise
        ``image`` itself.

    Raises:
        requests.HTTPError: when the URL download returns an error status.
    """
    if image is None:
        if image_path.startswith("http"):
            response = requests.get(image_path, timeout=10)
            # Fail with a clear HTTP error instead of handing an error page to PIL.
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
        else:
            image = Image.open(image_path)

    if isinstance(image, Image.Image):
        pil_img = image.convert('RGB')
        img_dict = img_processor(pil_img)
        cross_img_dict = cross_img_processor(pil_img) if cross_img_processor is not None else {}
        return (img_dict, pil_img, cross_img_dict)
    # Not a PIL image: pass the caller's object through untouched.
    return image

def _cast_and_move_tensors(tensors, float_dtype, device):
    """In place: cast every tensor that is not int/long to ``float_dtype`` and move all tensors to ``device``."""
    for k in tensors:
        value = tensors[k]
        if type(value) is not torch.Tensor:
            continue
        if value.dtype is not torch.int and value.dtype is not torch.long:
            value = value.to(float_dtype)
        tensors[k] = value.to(device)


def chat(image_path, model, text_processor, img_processor,
        query: str, history: List[Tuple[str, str]] = None, cross_img_processor=None, image: Image = None,
        max_length: int = 4096, top_p=0.95, top_k=5, temperature=0.95, repetition_penalty=1.0,
        invalid_slices=[], no_prompt=False, args=None
        ):
    """Run one round of image-grounded chat and return the model's reply.

    Args:
        image_path: path or URL of the image; required when ``image`` is None.
        model: model whose parameters determine the target device.
        text_processor: provides ``history_to_prompt``, ``__call__``, ``tokenizer`` and ``sep``;
            optionally ``get_func`` and ``process_response``.
        img_processor: image preprocessor passed to ``process_image``.
        query: the user's question for this round.
        history: previous ``(query, response)`` pairs; treated as empty when falsy.
        cross_img_processor: optional second image preprocessor.
        image: a PIL image, or the cache tuple returned by a previous call.
        max_length: total sequence length (prompt + generation).
        top_p, top_k, temperature, repetition_penalty, invalid_slices: sampling options
            forwarded to ``BaseStrategy``.
        no_prompt: when true the query is replaced by an empty string.
        args: namespace providing ``bf16`` and ``stream_chat`` (and optionally ``chinese``).

    Returns:
        ``(response, history, (torch_image, pil_img, cross_image))``. The last element
        can be passed back as ``image`` on the next call to skip reprocessing.
    """
    if image is None:
        assert image_path is not None
    if not history:
        history = []

    if no_prompt:
        query = ''
    prompt = text_processor.history_to_prompt(query, history)

    (torch_image, pil_img, cross_image) = process_image(image_path, img_processor, cross_img_processor, image)

    device = next(model.parameters()).device
    float_dtype = torch.bfloat16 if args.bf16 else torch.float16

    if torch_image is not None:
        _cast_and_move_tensors(torch_image, float_dtype, device)

    if cross_image is not None:
        _cast_and_move_tensors(cross_image, float_dtype, device)

    inputs_dic = text_processor(prompt)
    _cast_and_move_tensors(inputs_dic, float_dtype, device)
    input_ids = inputs_dic['input_ids'].to(device)[0]

    if max_length-len(input_ids) <= 1:
        response = "The prompt exceeds the context length limit, please try again."
        # Return the same 3-element cache as the normal path so that callers can
        # feed it back in as ``image`` (it is unpacked into three values above).
        return response, history, (torch_image, pil_img, cross_image)

    # Pad the prompt with -1 placeholders up to max_length; these are filled by generation.
    seq = torch.cat(
        [input_ids, torch.tensor([-1]*(max_length-len(input_ids)), device=input_ids.device)], dim=0
    )
    strategy = BaseStrategy(temperature=temperature, top_p=top_p, top_k=top_k, end_tokens=[text_processor.tokenizer.eos_token_id],
                            invalid_slices=invalid_slices, repetition_penalty=repetition_penalty)
    # use beam search to get a better result
    # strategy = BeamSearchStrategy(temperature=temperature, top_p=top_p, top_k=top_k, end_tokens=[text_processor.tokenizer.eos_token_id],
    #                               num_beams=5, consider_end=True, repetition_penalty=repetition_penalty)
    get_func = text_processor.get_func(input_ids, **inputs_dic) if hasattr(text_processor, 'get_func') else get_masks_and_position_ids_default

    img_inputs = {'vision_'+k: v for k, v in torch_image.items()}
    if cross_image is not None:
        img_inputs = {**img_inputs, **{'cross_'+k:v for k,v in cross_image.items()}}
    inputs_dic.pop('input_ids')
    inputs = {**img_inputs, **inputs_dic}

    if args.stream_chat:
        filling_stream = stream_filling_sequence(
            model, seq,
            batch_size=1,
            get_masks_and_position_ids=get_func,
            strategy=strategy,
            **inputs
        )
        if get_model_parallel_rank() == 0:
            if 'chinese' in args and not args.chinese:
                print("Model: ", end='')
            else:
                print("模型：", end='')
        # Number of characters already shown; starts after the decoded prompt.
        offset = len(text_processor.tokenizer.decode(input_ids))
        for tokens, mems in filling_stream:
            torch.cuda.empty_cache()
            tmp_response = text_processor.tokenizer.decode(tokens[0])
            # Skip empty decodes and partial multi-byte characters (decoded as U+FFFD);
            # they are printed once the following tokens complete them.
            if tmp_response and tmp_response[-1] != "�":
                if get_model_parallel_rank() == 0:
                    tmp_response_offseted = tmp_response[offset:]
                    if hasattr(text_processor, 'process_response'):
                        tmp_response_offseted = text_processor.process_response(tmp_response_offseted)
                    print(tmp_response_offseted, end='', flush=True)
                offset = len(tmp_response)
        if get_model_parallel_rank() == 0:
            print()
        output = strategy.finalize(tokens, mems)[0]

        response = text_processor.tokenizer.decode(output[0])
    else:
        output = filling_sequence(
            model, seq,
            batch_size=1,
            get_masks_and_position_ids=get_func,
            strategy=strategy,
            **inputs
        )[0] # drop memory

        # ---------------
        # port from inference_glm.py, more general than chat mode
        # clip -1s and fill back generated things into seq
        if type(output) is not list:
            output_list = output.tolist()
        else:
            output_list = output

        response = text_processor.tokenizer.decode(output_list[0])
    # print('original:', response)
    if hasattr(text_processor, 'process_response'):
        response = text_processor.process_response(response)
    # Keep only the text after the last separator, i.e. the newest answer.
    response = response.split(text_processor.sep)[-1].strip()
    if get_model_parallel_rank() == 0:
        from utils.utils.grounding_parser import parse_response
        parse_response(pil_img, response)
    history = history + [(query, response)]
    return response, history, (torch_image, pil_img, cross_image)


================================================
FILE: utils/utils/dataset.py
================================================
import os
import logging
import random
import logging
import jsonlines
from io import BytesIO
from PIL import Image
from torch.utils.data import Dataset
from sat.helpers import print_rank0

def find_all_files(path, suffix=".jpg"):
    """Walk ``path`` (following directory symlinks) and return every file path ending with ``suffix``.

    The number of matches is reported through ``print_rank0``.
    """
    target_files = []
    for cur_dir, _, names in os.walk(path, followlinks=True):
        matching = (name for name in names if name.endswith(suffix))
        target_files.extend(os.path.join(cur_dir, name) for name in matching)
    print_rank0(f'find {len(target_files)} files...')
    return target_files

class ItemDataset(Dataset):
    """Dataset of ``.jpg`` files whose file name (up to the first dot) is the target text.

    Every item is built with the fixed prompt ``"CAPTCHA:"``. Items that cannot
    be loaded or tokenised are returned as an empty dict.
    """

    def __init__(self, image_processor, text_processor, args, data_dirs, cross_image_processor=None, **kwargs):
        """Index the image files under ``data_dirs`` and store the processors.

        Args:
            image_processor: callable applied to each RGB PIL image.
            text_processor: callable invoked as ``text_processor(answer, prompt)``.
            args: accepted for interface compatibility; not used here.
            data_dirs: directory that is searched recursively for ``.jpg`` files.
            cross_image_processor: optional second image processor.
            **kwargs: accepted and ignored.
        """
        super().__init__()
        self.data = self.load_data(data_dirs)
        self.image_processor, self.text_processor, self.cross_image_processor = image_processor, text_processor, cross_image_processor

    @staticmethod
    def _label_from_path(path):
        """Return the file name of ``path`` up to its first dot (the sample's label)."""
        # os.path.basename understands the platform separator; paths produced by
        # os.walk use os.sep, so splitting on "/" would fail on Windows.
        return os.path.basename(path).split('.')[0]

    def process_img(self, img):
        """Return ``{'vision': ...}`` plus ``'cross'`` when a cross image processor is set."""
        img_dict = {'vision': self.image_processor(img)}
        if self.cross_image_processor:
            img_dict.update({'cross': self.cross_image_processor(img)})
        return img_dict

    def process_text(self, answer, prompt):
        """Delegate to the text processor with ``(answer, prompt)``."""
        return self.text_processor(answer, prompt)

    def load_data(self, data_dir):
        """Return the list of all ``.jpg`` paths found under ``data_dir``."""
        all_files = find_all_files(data_dir, suffix=".jpg")
        print_rank0(f"find {len(all_files)} samples in all...")
        return all_files

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Build one sample; returns ``{}`` when the image or the text cannot be processed."""
        data = self.data[index]
        # img
        try:
            img = Image.open(data).convert('RGB')
        except Exception as e:
            # Best effort: a broken image is logged and skipped via an empty sample.
            print_rank0(e, level=logging.WARNING)
            return {}
        img_dict = self.process_img(img)
        # text
        label = self._label_from_path(data)
        uni_key = label
        text_dict = self.process_text(label, "CAPTCHA:")
        if text_dict is None:
            print_rank0(f"Process text failed. Please check the max_target_length & max_source_length.\n The data is {data}", level=logging.WARNING)
            return {}
        # other attr
        ret = {**img_dict, **text_dict, "question_id": uni_key}
        return ret

================================================
FILE: utils/utils/grounding_parser.py
================================================
import seaborn as sns
from PIL import Image, ImageDraw, ImageFont
import matplotlib.font_manager
import spacy
import re

nlp = spacy.load("en_core_web_sm")

def draw_boxes(image, boxes, texts, output_fn='output.png'):
    """Draw labelled boxes on ``image`` and save the result to ``output_fn``.

    ``boxes`` holds one group of boxes per entry of ``texts``. Each box is
    indexed as ``[x0, y0, x1, y1]`` and scaled by the image width/height, so the
    coordinates are expected to be fractions of the image size. Every group gets
    its own colour; the group's text is drawn on a translucent grey background
    for each of its boxes.
    """
    box_width = 5
    palette = sns.color_palette("husl", len(boxes))
    colors = [(int(red * 255), int(green * 255), int(blue * 255)) for red, green, blue in palette]

    width, height = image.size
    absolute_boxes = []
    for group in boxes:
        scaled_group = []
        for rel in group:
            scaled_group.append(
                (int(rel[0] * width), int(rel[1] * height), int(rel[2] * width), int(rel[3] * height))
            )
        absolute_boxes.append(scaled_group)

    # Everything is drawn on a transparent overlay that is composited at the end.
    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)
    system_fonts = matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
    font = ImageFont.truetype(sorted(system_fonts)[0], size=26)

    for group, text, color in zip(absolute_boxes, texts, colors):
        for rect in group:
            draw.rectangle(rect, outline=color, width=box_width)
            if not text:
                continue
            lines = text.split('\n')
            _, line_height = font.getbbox(lines[0])[-2:]
            is_small_box = rect[2] - rect[0] < 100 or rect[3] - rect[1] < 100
            if is_small_box:
                # Small boxes: start the label at the bottom edge of the box.
                y_start = rect[3]
            else:
                # Otherwise the label sits inside the box, above its bottom edge.
                y_start = rect[3] - line_height * len(lines) - box_width
            x = rect[0] + box_width
            for row, line in enumerate(lines):
                line_width, line_height = font.getbbox(line)[-2:]
                y = y_start + line_height * row
                draw.rectangle([x, y, x + line_width, y + line_height], fill=(128, 128, 128, 160))
                draw.text((x, y), line, font=font, fill=(255, 255, 255))

    composed = Image.alpha_composite(image.convert('RGBA'), overlay)
    composed.convert('RGB').save(output_fn)

def boxstr_to_boxes(box_str):
    """Parse ``"x0,y0,x1,y1;..."`` into a list of coordinate lists scaled by 1/1000.

    Segments (separated by ``;``) that contain anything other than digits and
    commas are dropped.
    """
    boxes = []
    for segment in box_str.split(';'):
        if not segment.replace(',', '').isdigit():
            continue
        boxes.append([int(coord) / 1000 for coord in segment.split(',')])
    return boxes

def text_to_dict(text):
    """Map each phrase preceding a ``[[...]]`` box annotation to its parsed boxes.

    For every ``[[...]]`` match, the phrase is the text between the start of the
    last noun chunk that ends at or before the match (or the start of the text)
    and the match itself. If that phrase ends with ``?``, everything before the
    match is used instead. Keys are lower-cased; for a repeated phrase the last
    boxes win.
    """
    doc = nlp(text)
    box_matches = list(re.finditer(r'\[\[([^\]]+)\]\]', text))

    phrase_to_boxes = {}
    for match in box_matches:
        box_start = match.start()
        chunk_starts = [chunk.start_char for chunk in doc.noun_chunks if chunk.end_char <= box_start]
        phrase_start = max([0] + chunk_starts)
        phrase = text[phrase_start:box_start].strip()
        if phrase.endswith('?'):
            phrase = text[:box_start].strip()
        phrase_to_boxes[phrase.lower()] = boxstr_to_boxes(match.group(1))
    return phrase_to_boxes

def parse_response(img, response, output_fn='output.png'):
    """Draw the boxes mentioned in ``response`` onto ``img`` and save to ``output_fn``.

    The image is converted to RGB and rescaled (keeping its aspect ratio) so
    that it fits a 1920x1080 frame. Phrases and boxes are extracted from
    ``response`` with ``text_to_dict``; when none are found the rescaled image
    is saved without boxes.
    """
    img = img.convert('RGB')
    width, height = img.size
    ratio = min(1920 / width, 1080 / height)
    new_width = int(width * ratio)
    new_height = int(height * ratio)
    new_img = img.resize((new_width, new_height), Image.LANCZOS)
    dic = text_to_dict(response)
    if dic:
        texts, boxes = zip(*dic.items())
    else:
        texts, boxes = [], []
    draw_boxes(new_img, boxes, texts, output_fn=output_fn)

================================================
FILE: utils/utils/language.py
================================================
def base_history_to_prompt(self, query, history):
    """Base mode: the prompt is the image marker followed by the query; history is ignored."""
    return '<EOI>' + query

def chat_history_to_prompt(self, query, history):
    """Chat mode: wrap every past round and the new query in ``[INST] ... [/INST]`` markers."""
    past_rounds = "".join(
        old_query + " [/INST] " + response + " [INST] "
        for old_query, response in history
    )
    return "<EOI> [INST] " + past_rounds + query + " [/INST] "

def vqa_history_to_prompt(self, query, history):
    """VQA mode: single-round ``Question: ... Short answer:`` prompt.

    Only single-round chat is supported in this mode, so ``history`` is ignored.
    """
    return "<EOI>Question: " + query + " Short answer:"

def chat_old_history_to_prompt(self, query, history):
    """Legacy chat mode: ``Question: ... Answer: ...`` rounds separated by newlines."""
    past_rounds = [
        old_query + " Answer: " + response + "\nQuestion: "
        for old_query, response in history
    ]
    return "<EOI>Question: " + "".join(past_rounds) + query + " Answer:"

# Dispatch table from a tokenizer's ``signal_type`` to the function that builds
# the prompt for that mode. Entries are called as ``fn(self, query, history)``
# by the ``history_to_prompt`` method of the text processors.
_history_to_prompt = {
    "base": base_history_to_prompt,
    "chat": chat_history_to_prompt,
    "vqa": vqa_history_to_prompt,
    "chat_old": chat_old_history_to_prompt, # for cogvlm-v1.1
}

from transformers import LlamaTokenizer

def llama2_tokenizer(tokenizer_path, signal_type="base"):
    """Load a ``LlamaTokenizer`` and attach the attributes used by the text processors.

    Args:
        tokenizer_path: path or name passed to ``LlamaTokenizer.from_pretrained``.
        signal_type: prompt mode; one of ``"base"``, ``"chat"``, ``"vqa"``, ``"chat_old"``.

    Returns:
        The tokenizer, with ``pad_token_id`` set to 32000 when it was missing and
        with ``boi``, ``eoi`` and ``signal_type`` attributes added.
    """
    supported_signal_types = ["base", "chat", "vqa", "chat_old"]
    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = 32000
    tokenizer.boi, tokenizer.eoi = "[IMG]", "[/IMG]"
    assert signal_type in supported_signal_types
    tokenizer.signal_type = signal_type
    return tokenizer

import re
import numpy as np
import torch

class llama2_text_processor:
    """Turn a (caption, prompt) pair into padded training tensors with image placeholders.

    Each ``<EOI>`` marker in the prompt or caption is replaced by a block of
    ``image_length + 2`` placeholder token ids (value 0), and masks describing
    that block are produced alongside the token ids.
    """

    def __init__(self, tokenizer, max_target_length=2048, image_length=257, model=None):
        # ``model`` is accepted but not used in this class.
        self.tokenizer = tokenizer
        self.max_target_length = max_target_length
        self.image_length = image_length

    def __call__(self, caption, prompt=""):
        """Tokenise ``prompt`` + ``caption`` and build masks, labels and position ids.

        Returns:
            ``None`` when the prompt alone is longer than ``max_target_length - 10``;
            otherwise a dict with ``input_ids``, ``labels``, ``position_ids``,
            ``attention_mask``, ``image_embed_mask``, ``context_length``,
            ``image_position``, ``vision_expert_mask`` and ``image_rope_mask``.
            All entries except ``image_position`` (a Python list) are tensors
            with a leading batch dimension of 1.
        """
        if '<EOI>' not in prompt:
            # Plain prompt: strip special tags and wrap it using the tokenizer's signal type.
            prompt = self.replace_tags_with_empty(prompt)
            # caption = self.replace_tags_with_empty(caption)
            history = []
            prompt = self.history_to_prompt(prompt, history)

        input_ids = [self.tokenizer.bos_token_id]

        # Every '<EOI>' becomes a single -100 placeholder that is expanded further below.
        prompt_splits = prompt.split('<EOI>')
        caption_splits = caption.split('<EOI>')
        if len(prompt_splits) > 0:
            input_ids.extend(self.tokenizer.encode(prompt_splits[0], add_special_tokens=False))
        for tokens in prompt_splits[1:]:
            tokens_with_img = [-100] + self.tokenizer.encode(tokens, add_special_tokens=False)
            input_ids.extend(tokens_with_img)
        # Prompt length after expansion: each placeholder grows by image_length + 1 extra slots.
        context_length = len(input_ids) + (len(prompt_splits)-1) * (self.image_length + 1)
        if context_length > self.max_target_length - 10:
            return None
        if len(caption_splits) > 0:
            input_ids.extend(self.tokenizer.encode(caption_splits[0], add_special_tokens=False))
        for tokens in caption_splits[1:]:
            tokens_with_img = [-100] + self.tokenizer.encode(tokens, add_special_tokens=False)
            input_ids.extend(tokens_with_img)

        # Truncate before expansion, leaving room for one expanded image block and the EOS token.
        if len(input_ids) > self.max_target_length - self.image_length - 5:
            input_ids = input_ids[:self.max_target_length - self.image_length - 5]

        input_ids += [self.tokenizer.eos_token_id]

        # Expand each placeholder into (image_length + 1) zeros followed by a -1 end marker.
        while -100 in input_ids:
            img_idx = input_ids.index(-100)
            input_ids = input_ids[:img_idx] + [0] * (self.image_length + 1) + [-1] + input_ids[img_idx+1:]

        # Record where each image block ends and replace the -1 marker with token id 0.
        image_position = []
        while -1 in input_ids:
            img_idx = input_ids.index(-1)
            input_ids[img_idx] = 0
            image_position.append(img_idx)

        image_embed_mask = [0] * len(input_ids)
        vision_expert_mask = [0] * len(input_ids)
        image_rope_mask = [0] * len(input_ids)
        for idx in image_position:
            # image_embed_mask: all image_length + 2 slots of the block (end slot included).
            image_embed_mask[idx-self.image_length-1: idx+1] = [1] * (self.image_length + 2)
            # vision_expert_mask: the block without its last slot.
            vision_expert_mask[idx-self.image_length-1: idx] = [1] * (self.image_length + 1)
            # image_rope_mask: the block without its first and last slot.
            image_rope_mask[idx - self.image_length: idx] = [1] * self.image_length
        attention_mask = [1] * len(input_ids)
        # Only tokens after the prompt contribute to the loss; -100 marks ignored label positions.
        labels = [-100] * context_length + input_ids[context_length:]

        pad_len = self.max_target_length - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
        # NOTE(review): padding positions also get attention value 1 here — confirm this is intended.
        attention_mask = attention_mask + [1] * pad_len
        vision_expert_mask = vision_expert_mask + [0] * pad_len
        image_embed_mask = image_embed_mask + [0] * pad_len
        image_rope_mask = image_rope_mask + [0] * pad_len
        # Lower-triangular (causal) matrix built from the per-token mask.
        np_mask = np.tril(np.expand_dims(np.array(attention_mask), 0).repeat(len(attention_mask), 0))
        labels = labels + [-100] * pad_len

        # Image slots never contribute to the loss, wherever they occur.
        for idx in image_position:
            labels[idx-self.image_length-1: idx+1] = [-100] * (self.image_length + 2)

        # Position ids advance by one per token, except inside a run of rope-masked
        # slots, where every slot of the run shares a single position id.
        position_ids = []
        pid = -1
        for i in range(len(input_ids)):
            if image_rope_mask[i] == 0 or (i > 0 and image_rope_mask[i] != image_rope_mask[i - 1]):
                pid += 1
            position_ids.append(pid)

        input_ids = torch.tensor(input_ids).unsqueeze(0)
        labels = torch.tensor(labels).unsqueeze(0)
        attention_mask = torch.from_numpy(np_mask).unsqueeze(0).unsqueeze(0)
        image_embed_mask = torch.tensor(image_embed_mask).unsqueeze(0)
        vision_expert_mask = torch.tensor(vision_expert_mask).unsqueeze(0)
        image_rope_mask = torch.tensor(image_rope_mask).unsqueeze(0)
        position_ids = torch.tensor(position_ids).unsqueeze(0)
        context_length = torch.tensor(context_length).unsqueeze(0).long()
        return {'input_ids': input_ids, 'labels': labels, 'position_ids': position_ids, 'attention_mask': attention_mask, 'image_embed_mask': image_embed_mask,
                'context_length': context_length, 'image_position': image_position, 'vision_expert_mask': vision_expert_mask, 'image_rope_mask': image_rope_mask
                }

    def history_to_prompt(self, query, history):
        """Build the prompt using the function registered for the tokenizer's ``signal_type``."""
        return _history_to_prompt[self.tokenizer.signal_type](self, query, history)

    def replace_tags_with_empty(self, text):
        """Remove the literal tags ``<pad>``, ``<s>``, ``</s>`` and ``<EOI>`` from ``text``."""
        return re.sub('<pad>|<s>|</s>|<EOI>', '', text)

from functools import partial
def get_masks_and_position_ids(seq, image_logits_mask):
    """Build batched tokens, a causal attention mask and position ids for `seq`.

    Position ids normally advance by one per token. A run of consecutive
    positions carrying the same non-zero value in `image_logits_mask` shares a
    single id: the id is bumped once when the run starts and then held.
    Tokens of `seq` that lie beyond the width of `image_logits_mask` each
    receive the next consecutive id.

    Args:
        seq: token tensor; `len(seq)` and `seq.unsqueeze(0)` are used, so it is
            treated as one unbatched sequence.
        image_logits_mask: 2-D mask (tensor or array with `.shape` and
            `.tolist()`); only its first row is read.

    Returns:
        A tuple `(tokens, attention_mask, position_ids)` where `tokens` is
        `seq` with a leading batch dimension, `attention_mask` is a
        lower-triangular matrix of ones shaped `(1, 1, len(seq), len(seq))`,
        and `position_ids` is a `torch.long` tensor shaped `(1, N)`.
    """
    tokens = seq.unsqueeze(0)
    seq_len = len(seq)

    attention_mask = torch.ones((1, seq_len, seq_len), device=tokens.device)
    attention_mask.tril_()
    attention_mask.unsqueeze_(1)

    # Convert the mask row to plain Python values once; indexing and comparing
    # tensor elements inside the loop would create a tensor (and force a bool
    # conversion) for every single token.
    mask_row = image_logits_mask[0].tolist()

    position_ids = []
    pid = -1
    previous = None
    for i, flag in enumerate(mask_row):
        # The first token always opens a new position, so a mask that starts
        # with a non-zero value yields id 0 rather than the invalid id -1.
        if i == 0 or flag == 0 or flag != previous:
            pid += 1
        position_ids.append(pid)
        previous = flag

    # Tokens not covered by the mask continue counting one id per token.
    for _ in range(tokens.shape[1] - image_logits_mask.shape[1]):
        pid += 1
        position_ids.append(pid)

    position_ids = torch.tensor(position_ids, dtype=torch.long, device=tokens.device)
    position_ids = position_ids.unsqueeze(0)

    return tokens, attention_mask, position_ids

class llama2_text_processor_inference:
    """Convert a text prompt into the token ids and image masks used at inference.

    Every `<EOI>` marker in the prompt is expanded into `image_length + 2`
    placeholder tokens (all id 0). Three 0/1 masks of the same length as the
    token sequence describe that placeholder span:

    * `image_embed_mask`   - all `image_length + 2` placeholder positions;
    * `vision_expert_mask` - every placeholder position except the last one;
    * `image_rope_mask`    - the inner `image_length` positions (the first and
      last placeholder are excluded).
    """

    def __init__(self, tokenizer, max_target_length=1024, image_length=257, model=None, no_prompt=False, english=True):
        """Store the tokenizer and sizing options.

        Args:
            tokenizer: object exposing `signal_type`, `bos_token_id` and
                `encode(text, add_special_tokens=False)`.
            max_target_length: stored on the instance; not read by `__call__`.
            image_length: number of inner image positions per `<EOI>` marker.
            model, no_prompt, english: accepted for signature compatibility
                and ignored by this class.
        """
        self.tokenizer = tokenizer
        self.max_target_length = max_target_length
        self.image_length = image_length

        # Separator per tokenizer signal type; anything else maps to "<unk>".
        sep_by_signal_type = {
            "chat": "[/INST]",
            "vqa": " Short answer:",
            "chat_old": " Answer:",
        }
        self.sep = sep_by_signal_type.get(self.tokenizer.signal_type, "<unk>")

        # Not read inside this class; presumably consumed by the generation
        # code that drives the processor - verify against callers.
        self.invalid_slices = []
        self.no_eoi = True

    def __call__(self, prompt=""):
        """Tokenize `prompt` and build the image masks.

        A prompt without any `<EOI>` marker is treated as a raw query: special
        tags are stripped and the query is rendered through
        `history_to_prompt` with an empty history. A prompt that already
        contains `<EOI>` is used as-is.

        Returns:
            dict with `input_ids`, `image_embed_mask`, `vision_expert_mask`
            and `image_rope_mask`, each a tensor with a leading batch
            dimension of 1.
        """
        if '<EOI>' not in prompt:
            prompt = self.replace_tags_with_empty(prompt)
            prompt = self.history_to_prompt(prompt, [])

        # str.split always yields at least one segment, so segment 0 exists.
        text_segments = prompt.split('<EOI>')
        placeholder_count = self.image_length + 2

        input_ids = [self.tokenizer.bos_token_id]
        input_ids.extend(self.tokenizer.encode(text_segments[0], add_special_tokens=False))

        # Index of the LAST placeholder token of each image span. Recording it
        # while appending avoids sentinel ids and repeated list rescans.
        image_position = []
        for segment in text_segments[1:]:
            input_ids.extend([0] * placeholder_count)
            image_position.append(len(input_ids) - 1)
            input_ids.extend(self.tokenizer.encode(segment, add_special_tokens=False))

        total = len(input_ids)
        image_embed_mask = [0] * total
        vision_expert_mask = [0] * total
        image_rope_mask = [0] * total
        for last in image_position:
            first = last - self.image_length - 1
            image_embed_mask[first: last + 1] = [1] * placeholder_count
            vision_expert_mask[first: last] = [1] * (self.image_length + 1)
            image_rope_mask[first + 1: last] = [1] * self.image_length

        input_ids = torch.tensor(input_ids).unsqueeze(0)
        image_embed_mask = torch.tensor(image_embed_mask).unsqueeze(0)
        vision_expert_mask = torch.tensor(vision_expert_mask).unsqueeze(0)
        image_rope_mask = torch.tensor(image_rope_mask).unsqueeze(0)
        return {'input_ids': input_ids, 'image_embed_mask': image_embed_mask, 'vision_expert_mask': vision_expert_mask, 'image_rope_mask': image_rope_mask}

    def history_to_prompt(self, query, history):
        """Render `query`/`history` with the formatter registered for the tokenizer's `signal_type`."""
        return _history_to_prompt[self.tokenizer.signal_type](self, query, history)

    def replace_tags_with_empty(self, text):
        """Return `text` with every `<pad>`, `<s>`, `</s>` and `<EOI>` marker removed."""
        return re.sub('<pad>|<s>|</s>|<EOI>', '', text)

    def process_response(self, response):
        """Strip every `</s>` marker from a generated response."""
        return response.replace('</s>', '')

    def get_func(self, inputs, **kwargs):
        """Return `get_masks_and_position_ids` bound to `kwargs['image_rope_mask']`.

        `inputs` is accepted but not used.
        """
        get_func = partial(get_masks_and_position_ids, image_logits_mask=kwargs['image_rope_mask'])
        return get_func

================================================
FILE: utils/utils/template.py
================================================
# Chinese image-caption prompts. Entries are either sentence stems ending in
# a full-width colon, meant to be continued by the caption, or standalone
# questions/instructions about the image.
# NOTE(review): how an entry is selected is up to the caller and is not
# visible in this module.
cn_template=[
'这幅作品描绘了：',
'描述这张图片：',
'从这张图片中，我们可以看到：',
'这张图片中最引人注目的是什么？',
'如果要在这张图片上添加一个标语，您会写什么？',
'描述图片内容的关键词：',
'画面传达了以下信息：',
'这张图展示了：',
'这张图片展示了什么？',
'描述这张图片中的场景：',
'这张图片的主要焦点是：',
'适合这张图片的标题是：',
'这张图片可以被描述为：',
'图片中的元素包括：',
'这张图片想表达什么信息？',
'请用一句话来概括这张图片的主题。',
'图片呈现了以下场景：',
'以简短的语言概括这张图片：',
'这张照片的故事是：',
'从这幅画中，我们可以发现：',
'对这幅图像进行简要说明：',
]

# English image-caption prompts. Most entries are sentence stems (ending in
# ':' or left open, e.g. 'A photo of') meant to be continued by the caption;
# a few are direct questions.
# NOTE(review): how an entry is selected is up to the caller and is not
# visible in this module.
en_template = [
'The essence of this image is:',
'A brief summary of this scene would be:',
'If this image could speak, it would say:',
'The image depicts:',
'A photo of',
'Key elements in this picture include:',
'This visual representation showcases:',
'The main focus of this photograph is:',
'Can you identify the main elements or characters in this image?',
'Summarize this image in a single sentence:',
'What\'s happening in this picture?',
'Give a creative title for this image:',
'In a few words, what does this image convey?',
'Capture the essence of this image with a phrase:',
'Describe the scene in this image:',
'The main focus of this picture is:',
'A suitable caption for this image would be:',
'This image can be best described as:',
]

# English caption requests phrased as complete questions or instructions.
# The value is the concatenation of two literal lists; the inline comments on
# each opening bracket record where that list came from.
# NOTE(review): how an entry is selected is up to the caller and is not
# visible in this module.
en_template_q = [ # from gpt-4
    "Describe the image.",
    "Give me a summary of this image.",
    "What do you see in the picture?",
    "Tell me about this picture.",
    "Explain the image to me.",
    "Break down what's in the photo.",
    "What is depicted in the picture?",
    "Illustrate the content of the image.",
    "Convey the essence of the image.",
    "Elaborate on the picture.",
    "Can you detail the image?",
    "Provide an overview of this picture.",
    "Walk me through this image.",
    "What does the image show?",
    "Characterize the picture for me.",
    "Render a description of the image.",
    "Can you clarify what's in the image?",
    "Discuss the elements of the picture.",
    "Provide insight into this image.",
    "What's going on in this photo?"
] + [ # from https://github.com/shikras/shikra/blob/main/config/_base_/dataset/template/image_cap.json
  "Describe this image as simply as possible.",
  "What happened in the picture? Answer in short sentences.",
  "Briefly say the content of this scene",
  "Show the content in the photo in short text.",
  "Please describe the content of the image in a few words.",
  "What is the content of the image? Please answer in short sentences.",
  "Can you give me a brief description of this image?",
  "What do you see in this picture?",
  "In a few words, describe the content of the image.",
  "Provide a concise explanation of this photograph.",
  "What is happening in this scene?",
  "Summarize the content of the photo.",
  "What are the main elements present in the image?",
  "Quickly explain the content of this visual.",
  "In a nutshell, what can you say about this picture?",
  "What's the main subject in the image?",
  "Describe the main features of the image.",
  "What is depicted in this photograph?",
  "Give me a short description of the picture.",
  "Briefly describe the objects and actions in the image.",
  "What is the context of this image?",
  "What are the key elements shown in this image?",
  "What is the main theme of the photograph?",
  "In just a few words, tell me what you see in this image.",
  "What is the essence of the image?",
  "Give me a quick breakdown of what's happening in the image.",
  "What does this picture represent?",
  "Using simple words, tell me what the image is showing.",
  "Quickly mention the content of the image.",
  "Describe the general scenario happening in the image.",
  "Can you summarize the main aspects of this image?",
  "Briefly point out the significant aspects of the image.",
  "What is the core subject illustrated in this picture?",
  "Tell me the central theme of the image briefly.",
  "What important features should I look for in this image?",
  "Describe the primary elements of the photo.",
  "In a sentence or two, describe the image.",
  "Outline the main content of this image.",
  "What event is captured in the picture?",
  "Simply put, what is being shown in the image?",
  "What do you notice immediately in the image?",
  "Provide a brief interpretation of the image.",
  "Tell me the key things happening in this image.",
  "Express the general theme of this photograph.",
  "What is the core idea of the image?",
  "Explain briefly what the image conveys.",
  "What is the primary focus of this visual?",
  "Name the most important components of this image.",
  "Explain the basic scene depicted in the image.",
  "What subject matter is portrayed in the picture?",
  "What are the prominent features of the image?",
  "Give a concise interpretation of this image.",
  "Quickly describe the situation happening in the image.",
  "Identify the focal point of the photograph.",
  "What can you gather from this image in a few words?",
  "Describe the image in the simplest way possible.",
  "What's happening in the image at a glance?",
  "What is the basic idea behind this picture?",
  "Enumerate the crucial elements of the photograph.",
  "What is the fundamental concept shown in the image?",
  "Using few words, tell me the main idea of the photo.",
  "Describe the essential aspects of this image.",
  "Briefly outline the content within the image.",
  "In a simple manner, explain the image.",
  "What are the most striking details in the picture?",
  "What can you say about the image in a nutshell?",
  "Give a summary of the essential components of the image.",
  "What is the primary message conveyed by the image?",
  "Tell me briefly what the photograph is all about.",
  "What is the central idea behind this image?",
  "What do you observe in the image in simple terms?",
  "Briefly express the main points of the image.",
  "Describe the simple version of what's happening in the image.",
  "What is the context of the image in brief?",
  "Briefly indicate the notable features of the image.",
  "What stands out in the photograph?",
  "What are the major details visible in the picture?",
  "What characters or objects are present in the image?",
  "What do you see at first glance in the image?",
  "Explain in brief the subject matter of the photograph.",
  "Mention the main objects and actions in the image briefly.",
  "What are the main components of the picture?",
  "What is the primary objective of the image?",
  "Give a short overview of the scene in the image.",
  "How would you describe the content of the image?",
  "What significant elements can you spot in the image?",
  "In your own words, quickly describe the image.",
  "Quickly outline the main ideas of this photograph.",
  "Briefly explain the components of this image.",
  "What are the key points portrayed in the picture?",
  "Describe in a simplified manner the content of the image.",
  "Give the short version of what's going on in the image.",
  "What are the major aspects of this photograph?",
  "What essential details can you see in the image?",
  "What core elements are present in the picture?",
  "Explain the main idea behind the photograph.",
  "Name the key features of this visual.",
  "What are the crucial points presented in this image?",
  "Sum up the most important things in the image.",
  "What do you think is the primary focus of this picture?",
  "What are the major factors visible in the image?",
  "Briefly mention the key details of the photograph.",
  "Describe the main events or objects in the image.",
  "In a sentence, describe the content of the image.",
  "What key aspects can you see in the photograph?",
  "What are the primary elements of this picture?",
  "Concisely explain the content of this visual.",
  "Give a short analysis of the image.",
  "Describe the notable features of the photograph.",
  "What's the main story being told in the image?",
  "Provide a simple description of this photograph.",
  "Express the gist of the scene in the image.",
  "What can you deduce from the image briefly?",
  "What are the most important aspects of the visual?",
  "What do you find most striking in the photo?",
  "Describe the essence of the picture.",
  "Give a brief outline of the image content.",
  "What grabs your attention in the image?",
  "Explain the focal points of this photograph.",
  "Describe the core elements of the image.",
  "Outline the key aspects of this picture.",
  "What's happening in this image in brief?",
  "What scene is represented in the photograph?",
  "What central theme can you identify in the image?",
  "Give a brief overview of the image.",
  "What main features are present in the image?",
  "Describe the simple context of the photograph.",
  "What are the standout details in the image?",
  "Explain the primary purpose of the image.",
  "Capture the basic essence of the picture.",
  "Identify the key components of this image.",
  "What's the main idea shown in the image?",
  "Concisely describe the core content of the image.",
  "Describe the primary aspects of this image.",
  "Outline the significant parts of the photo.",
  "What is the most important part of the image?",
  "In a short statement, explain the image.",
  "Relay a brief, clear account of the picture shown. The image is",
  "Can you provide a brief description of the image?",
  "Summarize the content of this picture.",
  "Please tell me what's happening in this photo.",
  "Quickly describe what you see in the photograph.",
  "In a sentence or two, describe the scene in this image.",
  "Give me a short summary of what you see in this picture.",
  "Provide a concise analysis of the image.",
  "Tell me in a nutshell what's happening in this image.",
  "What does this photo depict in brief?",
  "Briefly explain the content of this image.",
  "Express the idea of the image in short form.",
  "Kindly give a condensed description of the picture.",
  "In few words, describe what this picture is about.",
  "Offer a succinct summary of the scene in this image.",
  "Quickly tell me the main subject of the image.",
  "What is the theme of this photo in brief?",
  "In simple words, explain the image.",
  "Please give a short and sweet description of this image.",
  "Provide an abbreviated version of the content of this photo.",
  "Shorten the scenario of this scene.",
  "Please give a concise description of this image.",
  "Boil down the content of this photograph.",
  "Quickly summarize what you see in the image.",
  "Sketch the main points of this picture.",
  "Offer a compact summary of the elements in this image.",
  "In one sentence, describe the theme of this picture.",
  "Pare down the content of this photo.",
  "Provide a to-the-point explanation of this image.",
  "Highlight the main subject of the photograph.",
  "Summary: What can you see in this image?",
  "What's the brief context of this picture?",
  "Describe this scene in a few words.",
  "What's the main focus of this image?",
  "In just a couple of words, tell me about this picture.",
  "Give a snapshot description of this image.",
  "Be succinct while describing the content of this photo.",
  "Cut to the main part of the picture.",
  "Quickly express the idea of this scene.",
  "What's the abbreviated version of this image?",
  "Outline the moment captured in this photo.",
  "Please make a brief statement about the image.",
  "What are the basic elements in this picture?",
  "Trim down the content of this image.",
  "Distill the content of the photograph.",
  "Give me the main idea of this picture.",
  "Point out the primary focus of this image.",
  "What's the gist of this scene?",
  "Provide a pithy description of this photo.",
  "In brief, explain the elements in this image.",
  "Offer a short version of the content of this photograph.",
  "Capture the essence of this picture.",
  "Curtly describe what's happening in this image.",
  "Brief me on the content of this scene.",
  "In a word or two, what does this photo show?",
  "Condense the content of this image.",
  "Simply summarize the elements in this picture.",
  "What is the main object of interest in the image?",
  "Highlight the crux of this photograph.",
  "Provide a brief explanation of what's occurring in this image.",
  "Quickly identify the central theme of this picture.",
  "Reveal the core content of this image.",
  "What's the focal point of this photo?",
  "Give a compressed description of this scene.",
  "Explain the key concept of this image in simple terms.",
  "Wrap up the content of this picture.",
  "Make a concise statement about the photograph.",
  "Identify the primary subject in this image.",
  "What's happening in the photo in few words?",
  "Simplify the description of this scene.",
  "In a nutshell, explain the content of this image.",
  "Offer the main takeaway from this photograph.",
  "In a few words, give me the main idea of this picture.",
  "Share a brief description of the primary action in this image.",
  "What can you observe in this image in short?",
  "Whittle down the content of this photo.",
  "Strike at the heart of the scene depicted in this image.",
  "Preserve the essence while describing this picture.",
  "Keep it short and explain this photograph.",
  "What's the key thing to notice in this image?",
  "Give me the abridged version of this scene.",
  "Pare the content of this picture down to its essence.",
  "Provide a trimmed down explanation of this photo.",
  "Expressed briefly, what does this image show?",
  "Offer a concise assessment of the scene in this picture.",
  "What does the photograph illustrate in short?",
  "What are the salient features of this image?",
  "Bullet-point the main elements of this picture.",
  "Concisely express the key aspect of this photo.",
  "Briefly, what can you spot in this image?",
  "Filter the description of this scene down to the essentials.",
  "Illustrate the core concept of this picture.",
  "Sum up the main event in this photograph.",
  "What is the most striking feature of this image?",
  "Cut to the chase and explain this scene.",
  "Select the main element to describe in this picture.",
  "What do you see in the photo in brief?",
  "Give a short but informative description of this image.",
  "What stands out the most in this scene?",
  "In few words, summarize the main part of this picture.",
  "Briefly, what's going on in this photograph?",
  "List the key elements of this image.",
  "State the essence of this picture.",
  "Define the central idea of this photo briefly.",
  "Shorten your description of this image.",
  "Be concise while explaining the content of this picture.",
  "What's the short version of this photo's content?",
  "Point out the main component in this photograph.",
  "In a phrase, explain the essence of this image.",
  "Selectively describe the content of this scene.",
  "Briefly, what is this picture all about?",
  "What's the central subject of this photo?",
  "Get to the point and explain this image.",
  "Briefly, tell me the main action depicted in this picture.",
  "What's the main message of this photograph in brief?",
  "Condense the scene captured in this image.",
  "Please stick to the main point of this photo.",
  "Single out the main focus of this picture.",
  "Streamline the content of this image.",
  "What's the overall theme in this scene?",
  "Distill the main idea from this photograph.",
  "In a few words, what's the main event in this picture?",
  "Give a terse description of the content of this image.",
  "Catch the essence of this photo.",
  "What's the main aspect of this image?",
  "Briefly, describe the primary focus of this picture.",
  "What is the key attribute of this photo?",
  "What's the main highlight of this image?",
  "Simplify the content of this scene.",
  "Explain the key feature of this photograph concisely.",
  "Abstain from details while describing this picture.",
  "Be short while explaining this image.",
  "What's the essential point in this photo?",
  "Just tell me the main subject in the picture.",
  "Highlight the primary idea of this image.",
  "Get straight to the point about this scene.",
  "Stick to the basics while describing this photo."
]

shikra_template = {
    'caption2box': [
      "Where is <expr>?",
      "Where is <expr> in the image?",
      "Where is <expr>? answer in [[x0,y0,x1,y1]] format.",
      "Can you point out <expr> in the image and provide the bounding boxes of its location?",
      "Help me to locate <expr> in and give me its bounding boxes, please.",
      "In the given, could you find and tell me the bounding boxes of <expr>?",
      "Guide me to the location of <expr> within the image by providing its bounding boxes.",
      "I'd like to know the exact bounding boxes of <expr> in the photo.",
      "Would you kindly provide the bounding boxes of <expr> located in the picture?",
      "Can you find <expr> in and give me the bounding boxes of where it is located?",
      "I'm trying to locate <expr> in. Can you determine its bounding boxes for me?",
      "What are the bounding boxes of <expr> in the image?",
      "Can you disclose the position of <expr> in the photograph by stating its bounding boxes?",
      "In, could you let me know the location of <expr> in the form of bounding boxes?",
      "I need the bounding boxes of <expr> in, can you please assist me with that?",
      "Where in is <expr> located? Provide me with its bounding boxes, please.",
      "May I have the bounding boxes of <expr>?",
      "In the photograph, could you pinpoint the location of <expr> and tell me its bounding boxes?",
      "Can you please search and find <expr> in, then let me know its bounding boxes?",
      "Please, point out the position of <expr> in the image by giving its bounding boxes.",
      "What are the exact bounding boxes of <expr> in the provided picture?",
      "Detect the location of <expr> in and share the bounding boxes with me, please.",
      "In the picture, I'd like you to locate <expr> and provide its coordinates.",
      "Please indicate the location of <expr> in the photo by giving bounding boxes.",
      "Find <expr> in and share its coordinates with me.",
      "Could you please help me find the bounding boxes of <expr> in the image?",
      "I am looking for the position of <expr> in. Can you provide its bounding boxes?",
      "In the image, can you locate <expr> and let me know its coordinates?",
      "I'd appreciate if you could find and tell me the bounding boxes of <expr>.",
      "In, I need the bounding box bounding boxes of <expr>.",
      "Point me to the location of <expr> in the picture by providing its bounding boxes.",
      "Could you trace <expr> in and tell me its bounding boxes?",
      "Can you assist me in locating <expr> in, and then provide its bounding boxes?",
      "I'm curious, what are the bounding boxes of <expr> in the photo?",
      "Kindly share the bounding boxes of <expr> located in the image.",
      "I would like to find <expr> in. Can you give me its bounding boxes?",
      "Can you spot <expr> in and disclose its bounding boxes to me?",
      "Please, reveal the location of <expr> in the provided photograph as coordinates.",
      "Help me locate and determine the bounding boxes of <expr>.",
      "I request the bounding boxes of <expr> in the image.",
      "In the given, can you find <expr> and tell me its bounding boxes?",
      "I need to know the position of <expr> in as bounding boxes.",
      "Locate <expr> in and provide its bounding boxes, please.",
      "Assist me in finding <expr> in the photo and provide the bounding box bounding boxes.",
      "In, can you guide me to the location of <expr> by providing bounding boxes?",
      "I'd like the bounding boxes of <expr> as it appears in the image.",
      "What location does <expr> hold in the picture? Inform me of its bounding boxes.",
      "Identify the position of <expr> in and share its bounding boxes.",
      "I'd like to request the bounding boxes of <expr> within the photo.",
      "How can I locate <expr> in the image? Please provide the bounding boxes.",
      "I am interested in knowing the bounding boxes of <expr> in the picture.",
      "Assist me in locating the position of <expr> in the photograph and its bounding box bounding boxes.",
      "In the image, I need to find <expr> and know its bounding boxes. Can you please help?"
    ],
    'box2caption': [
      "Can you give me a description of the region <objs> in image?",
      "In the provided image, would you mind describing the selected area <objs>?",
      "I need details about the area <objs> located within image.",
      "Could you please share some information on the region <objs> in this photograph?",
      "Describe what's happening within the coordinates <objs> of the given image.",
      "What can you tell me about the selected region <objs> in the photo?",
      "Please, can you help me understand what's inside the region <objs> in image?",
      "Give me a comprehensive description of the specified area <objs> in the picture.",
      "I'm curious about the area <objs> in the following image. Can you describe it?",
      "Please elaborate on the area with the coordinates <objs> in the visual.",
      "In the displayed image, help me understand the region defined by <objs>.",
      "Regarding the image, what's going on in the section <objs>?",
      "In the given photograph, can you explain the area with coordinates <objs>?",
      "Kindly describe what I should be seeing in the area <objs> of image.",
      "Within the input image, what can be found in the region defined by <objs>?",
      "Tell me what you see within the designated area <objs> in the picture.",
      "Please detail the contents of the chosen region <objs> in the visual input.",
      "What's inside the area <objs> of the provided graphic?",
      "I'd like some information about the specific region <objs> in the image.",
      "Help me understand the details within the area <objs> in photograph.",
      "Can you break down the region <objs> in the image for me?",
      "What is taking place within the specified area <objs> in this capture?",
      "Care to elaborate on the targeted area <objs> in the visual illustration?",
      "What insights can you provide about the area <objs> in the selected picture?",
      "What does the area <objs> within the given visual contain?",
      "Analyze and describe the region <objs> in the included photo.",
      "Please provide details for the area marked as <objs> in this photographic.",
      "For the image, can you assess and describe what's happening at <objs>?",
      "Fill me in about the selected portion <objs> within the presented image.",
      "In the image, elaborate on the details found within the section <objs>.",
      "Please interpret and describe the area <objs> inside the given picture.",
      "What information can you give me about the coordinates <objs> in image?",
      "Regarding the coordinates <objs> in image, can you provide a description?",
      "In the photo, can you delve into the details of the region <objs>?",
      "Please provide insights on the specified area <objs> within the graphic.",
      "Detail the chosen region <objs> in the depicted scene.",
      "Can you discuss the entities within the region <objs> of image?",
      "I'd appreciate a breakdown of the area <objs> in the displayed image.",
      "What's the story in the section <objs> of the included visual?",
      "Please enlighten me about the region <objs> in the given photo.",
      "Offer a thorough description of the area <objs> within the illustration.",
      "What can you share about the area <objs> in the presented image?",
      "Help me grasp the context of the region <objs> within image.",
      "Kindly give an overview of the section <objs> in photo.",
      "What details can you provide about the region <objs> in the snapshot?",
      "Can you divulge the contents of the area <objs> within the given image?",
      "In the submitted image, please give a synopsis of the area <objs>.",
      "In the image, please describe the bounding box <objs>.",
      "Please describe the region <objs> in the picture.",
      "Describe the bbox <objs> in the provided photo.",
      "What can you tell me about the area <objs> within the image?",
      "Could you give me a description of the rectangular region <objs> found in?",
      "In, what elements can be found within the coordinates <objs>?",
      "Please provide details for the area within the bounding box <objs> in.",
      "Can you generate a description for the selected region <objs> in the image?",
      "Kindly describe the objects or scenery in the bounding box <objs> within.",
      "What details can you provide for the rectangle defined by the coordinates <objs> in?",
      "In relation to the picture, please describe the content of the area marked by <objs>.",
      "I'd like to know more about the area <objs> in the given image. Can you describe it?",
      "Can you help me by describing the part of that lies within the bounding box <objs>?",
      "What's happening in the section of the photo enclosed by the coordinates <objs>?",
      "Describe the image content present in the specified rectangular area <objs> of.",
      "Please provide information about the area within the bounding box <objs> in the picture.",
      "Could you offer a description of the contents in the selected area <objs> of the image?",
      "I'm curious about the area <objs> in. Can you provide a description of it?",
      "What can be observed in the rectangular region <objs> in the photograph?",
      "Please explain what is contained in the portion of defined by the box <objs>.",
      "In the photograph, can you describe the objects or scenery enclosed by <objs>?",
      "Can you give a brief explanation of the specified area <objs> in the image?",
      "What does the area <objs> look like in the context of the image?",
      "Could you please describe the contents of the bounding box <objs> in the given image?",
      "I would like to know more about the rectangular region <objs> within the picture. Can you describe it?",
      "Please tell me about the area <objs> in the image. What does it contain?",
      "Help me understand what's happening in the selected bounding box <objs> within.",
      "Can you provide a description of the area <objs> in the image?",
      "What sort of things can be seen in the region <objs> of the photo?",
      "Describe what can be found within the bounds of <objs> in the image.",
      "In, can you paint a picture of the area enclosed by coordinates <objs>?",
      "Please provide a detailed account of the area covered by the bounding box <objs> in.",
      "Give me a vivid description of what's happening in the area <objs> within the snapshot.",
      "In the image, what do you observe within the rectangular box defined by the coordinates <objs>?",
      "Could you give me a breakdown of the content in the specified area <objs> of the picture?",
      "Please elucidate the area<objs> of the image.",
      "I'd appreciate it if you could describe the portion of that lies within the rectangle <objs>.",
      "Can you share some insights about the rectangular region <objs> in the image?",
      "Help me visualize the section of the photo enclosed by the bounding box <objs>.",
      "Would you kindly provide a description for the content within the rectangular area <objs> of?",
      "In, can you tell me more about the area specified by the bounding box <objs>?",
      "Please describe what can be seen in the rectangular region <objs> of the image.",
      "Can you analyze the content of the area <objs> within the photograph?",
      "In the provided image, please explain the content within the region <objs>.",
      "I'm interested in the selected rectangle <objs> in. Can you tell me more about it?",
      "Explain what can be found in the bounding box <objs> in the context of the image.",
      "Kindly share your observations about the rectangular region <objs> within.",
      "I'd like a thorough description of the area <objs> in the image.",
      "Could you please provide a description of the rectangular area <objs> in?",
      "Please describe the section of the picture defined by the bbox <objs>.",
      "Tell me more about the scenery or objects within the rectangular region <objs> in.",
      "Would you kindly describe the content of the area enclosed by <objs> in the image?",
      "Help me understand the objects or scenery within the bounding box <objs> in the image.",
      "I would like to know about the section of the image enclosed by the rectangle <objs>. Can you describe it?",
      "Describe the selected rectangular area <objs> in the photo.",
      "Tell me about the region <objs> of the image.",
      "I request a description of the area <objs> in the picture.",
      "Can you elaborate on the content of the bounding box <objs> in?",
      "Please share details about the rectangular region <objs> within the image.",
      "What can I find in the bbox <objs> of the provided image?",
      "In the image, could you provide a description for the coordinates <objs>?",
      "Could you tell me more about the area <objs> in the snapshot?",
      "Fill me in on the details of the rectangular box <objs> within the image.",
      "What's going on in the section of contained within the bounding box <objs>?",
      "I would like a description of the content within the bbox <objs> in.",
      "Please enlighten me about the area <objs> in the photograph.",
      "Can you give me a visual rundown of the area <objs> in?",
      "Describe the visual elements within the selected area <objs> of the image.",
      "Tell me what you see in the area <objs> within the context of the image.",
      "Explain the content within the rectangular region <objs> of the image.",
      "I'd like some information about the bounding box <objs> in the photo.",
      "What is happening within the rectangle defined by coordinates <objs> in the image?",
      "Please describe the content within the area <objs> displayed in the image.",
      "What can be seen in the bounding box <objs> in the context of the provided image?",
      "Share some details about the objects or environment within the bounding box <objs> in.",
      "Please describe the area <objs> in the image for me.",
      "Can you generate a description of the contents within the selected region <objs> in?",
      "What objects or scenery can be found in the area <objs> in the image?",
      "Please tell me more about the rectangular section <objs> in the photo.",
      "Could you describe the content of the bbox <objs> in the image?",
      "What does the selected region <objs> in the image encompass?",
      "I am interested in the region <objs> of the image; please describe it.",
      "Can you provide some context for the area <objs> within the picture?",
      "Please give me some details about the rectangle <objs> in the image.",
      "In the photo, what can you see within the region defined by the bounding box <objs>?",
      "I would like a detailed description of the portion of enclosed by the bbox <objs>.",
      "Please help me understand the content present within the rectangle <objs> in.",
      "Would you mind describing the rectangular area <objs> in the provided image?"
    ],
  'caption_with_box': [
      "Can you provide a description of the image and include the coordinates [[x0,y0,x1,y1]] for each mentioned object?",
      "Please explain what's happening in the photo and give coordinates [[xmin,ymin,xmax,ymax]] for the items you reference.",
      "Analyze the contents of the picture and share the positions of mentioned items using the top-left and bottom-right coordinates.",
      "What do you see in this image? Please mention the objects and their locations using the format [[x1,y1,x2,y2]].",
      "Examine the image and describe its content, specifying the location of each mentioned noun using coordinates [[x1,y1,x2,y2]].",
      "Could you interpret the scene from this image and provide the coordinates [[xmin,ymin,xmax,ymax]] for each element you describe?",
      "Please provide an overview of the visual information in this image, along with the location data [[xmin,ymin,xmax,ymax]] for each mentioned object.",
      "Tell me about the picture and include position info [[x0,y0,x1,y1]] for the objects you describe.",
      "What is displayed in this image? Remember to mention the objects and their corresponding locations using the format [[xmin,ymin,xmax,ymax]].",
      "Give a brief analysis of the image and make sure to include the location of objects using their coordinates [[x1,y1,x2,y2]].",
      "Explain the content of this image and provide the coordinates [[x1,y1,x2,y2]] for all objects that you mention.",
      "Describe the scene in this picture and give the top-left and bottom-right coordinates [[xmin,ymin,xmax,ymax]] for each item you talk about.",
      "Please give a summary of the image and include the position info for each object you identify with coordinates [[x0,y0,x1,y1]].",
      "What is happening in the photo? Please point out the objects and their locations using the format [[x1,y1,x2,y2]].",
      "Illustrate the content of the image and specify the coordinates [[xmin,ymin,xmax,ymax]] for every object you mention.",
      "What can you tell me about this image? Remember to provide location data for the objects you describe using coordinates [[x1,y1,x2,y2]].",
      "Please interpret this image and give coordinates [[x1,y1,x2,y2]] for each object you mention.",
      "Detail what you see in the image and provide the top-left and bottom-right coordinates [[xmin,ymin,xmax,ymax]] for each mentioned noun.",
      "Take a look at this image and give an explanation of its content, including the position data [[x1,y1,x2,y2]] for each object you describe.",
      "What is the image depicting? Please mention the positions of any mentioned objects using square brackets.",
      "Describe the visual elements in the image and note the positions of any mentioned objects in square brackets.",
      "Could you please analyze the content of the image and mention the positions of any mentioned objects in square brackets?",
      "Tell me about the objects present in the image and note their positions using square brackets.",
      "What can you tell me about the contents of the image? Please indicate the positions of any mentioned objects in square brackets.",
      "Provide a comprehensive description of the image and specify the positions of any mentioned objects in square brackets.",
      "Describe the scene in the image and mention the positions of any mentioned objects using square brackets.",
      "Can you identify the objects in the image? Please include their positions in square brackets.",
      "Please describe the visual details in the image and note the positions of any mentioned objects using square brackets.",
      "What is happening in the image? Please mention the positions of any mentioned objects using square brackets.",
      "Analyze the content of the image and provide the positions of any mentioned objects in square brackets.",
      "Describe the main elements in the image and note the positions of any mentioned objects using square brackets.",
      "Could you please provide a detailed description of the image? Don't forget to mention the positions of any mentioned objects in square brackets.",
      "Can you provide a description of the image and include the coordinates [[x0,y0,x1,y1]] for each mentioned object?",
      "Please explain what's happening in the photo and give coordinates [[xmin,ymin,xmax,ymax]] for the items you reference.",
      "Analyze the contents of the picture and share the positions of mentioned items using the top-left and bottom-right coordinates.",
      "What do you see in this image? Please mention the objects and their locations using the format [[x1,y1,x2,y2]].",
      "Examine the image and describe its content, specifying the location of each mentioned noun using coordinates [[x1,y1,x2,y2]].",
      "Could you interpret the scene from this image and provide the coordinates [[xmin,ymin,xmax,ymax]] for each element you describe?",
      "Please provide an overview of the visual information in this image, along with the location data [[xmin,ymin,xmax,ymax]] for each mentioned object.",
      "Tell me about the picture and include position info [[x0,y0,x1,y1]] for the objects you describe.",
      "What is displayed in this image? Remember to mention the objects and their corresponding locations using the format [[xmin,ymin,xmax,ymax]].",
      "Give a brief analysis of the image and make sure to include the location of objects using their coordinates [[x1,y1,x2,y2]].",
      "Explain the content of this image and provide the coordinates [[x1,y1,x2,y2]] for all objects that you mention.",
      "Describe the scene in this picture and give the top-left and bottom-right coordinates [[xmin,ymin,xmax,ymax]] for each item you talk about.",
      "Please give a summary of the image and include the position info for each object you identify with coordinates [[x0,y0,x1,y1]].",
      "What is happening in the photo? Please point out the objects and their locations using the format [[x1,y1,x2,y2]].",
      "Illustrate the content of the image and specify the coordinates [[xmin,ymin,xmax,ymax]] for every object you mention.",
      "What can you tell me about this image? Remember to provide location data for the objects you describe using coordinates [[x1,y1,x2,y2]].",
      "Please interpret this image and give coordinates [[x1,y1,x2,y2]] for each object you mention.",
      "Detail what you see in the image and provide the top-left and bottom-right coordinates [[xmin,ymin,xmax,ymax]] for each mentioned noun.",
      "Take a look at this image and give an explanation of its content, including the position data [[x1,y1,x2,y2]] for each object you describe.",
      "What are the details of this picture? Please include the coordinates [[x1,y1,x2,y2]] for each object you mention.",
      "Can you provide a detailed description of the contents of the image? Please include the positions of any mentioned objects in square brackets.",
      "What is the image depicting? Please mention the positions of any mentioned objects using square brackets.",
      "Describe the visual elements in the image and note the positions of any mentioned objects in square brackets.",
      "Could you please analyze the content of the image and mention the positions of any mentioned objects in square brackets?",
      "Tell me about the objects present in the image and note their positions using square brackets.",
      "How would you describe the contents of the image? Please provide the positions of mentioned objects in square brackets.",
      "What do you observe in the image? Don't forget to mention the objects and their locations using square brackets.",
      "Can you give an overview of the image and list the objects along with their positions using square brackets?",
      "Describe the activities taking place in the image and point out the objects with their locations using square brackets.",
      "Can you explain what is going on in this picture and give the bounding boxes for each object you mention?",
      "Provide a summary of the image and include bounding box coordinates for the objects you talk about.",
      "Help me understand what's in the image and also give me the bounding boxes for the objects you describe.",
      "Explain the scene depicted in the image and include the bounding boxes for the nouns you reference.",
      "Analyze this picture for me and provide coordinates for the items you discuss.",
      "I need a breakdown of what is happening in the image, and please include the bounding box information.",
      "Give me a rundown of what's in this image, along with the coordinates for each mentioned object.",
      "Elaborate on the image and provide the boundaries for the objects you mention.",
      "Discuss the contents of this image and include the bounding boxes for mentioned objects.",
      "Unveil what's happening in the image and provide the coordinates for the objects in discussion.",
      "Clarify the situation depicted in the photo and include bounding box details for the objects mentioned.",
      "Break down the image and share the bounding box coordinates of objects you mention.",
      "Reveal the meaning behind the image and provide me with the bounding box details for the mentioned nouns.",
      "Examine the picture and disclose the bounding box coordinates for each object you discuss.",
      "Interpret the image and include the bounding boxes of the items you discuss.",
      "Convey the essence of the photo and provide the bounding box information for mentioned objects.",
      "Enlighten me about the image and provide me with the bounding box coordinates for each subject.",
      "Narrate the image and include the bounding boxes for the objects you describe.",
      "Decipher the story behind the image and provide the bounding box for each object in the story.",
      "Illustrate your understanding of the image, and give the boxes of the described objects.",
      "Walk me through the contents of the image, and include the bounding box for the mentioned items.",
      "I need to know what's in the image and please provide coordinates for the featured objects.",
      "Dissect the components of the image and include the bounding boxes for each object discussed.",
      "Give insights into the picture and provide the bounding box details for the objects mentioned.",
      "Delve into the image, and furnish the coordinates for the items you reference.",
      "Portray the events in the image and include the location and boundaries of the described objects.",
      "Unravel the aspects of the image and give the bounding box for the mentioned items.",
      "Tell me everything about the picture and don't forget to mention bounding boxes for the described items.",
      "Disentangle the details of the picture and include bounding box coordinates for mentioned items.",
      "Explore the elements within the picture and provide the bounding boxes for each object mentioned.",
      "Detail the occurrences in the picture and supply the bounding box info for the talked-about objects.",
      "Lay out the context of the picture and include bounding box details for the featured objects.",
      "Uncover the truth behind the picture and include the bounding boxes for the described nouns.",
      "What's happening in the picture? Please provide the bounding box info for mentioned objects.",
      "Discuss the events taking place in and include the bboxes of the involved objects.",
      "In the picture, describe what's going on and provide the bboxes of mentioned objects.",
      "Decode the message in the picture, and provide boundaries for the relevant objects.",
      "Let me know what you see in the picture and provide the bounding boxes for the objects you discuss.",
      "Scrutinize the picture and include the coordinates for the items you talk about.",
      "Summon the essence from the picture and present the bounding box coordinates for relevant objects.",
      "Deconstruct the scene in the picture and include bounding box info for the mentioned nouns.",
      "Identify the contents of the picture and provide the coordinates for the objects involved.",
      "Make sense of the happenings in the picture and include bounding box coordinates for the objects.",
      "What can you tell me about the picture? Please include bounding boxes for any mentioned objects.",
      "Deduce the meaning of the picture and provide location details for the discussed items.",
      "Can you give me the gist of the picture and provide the bboxes of the described objects?",
      "Describe what is taking place in the picture, and include the bboxes of the involved items.",
      "Please narrate the story in the picture, and provide the bounding box coordinates for the included objects.",
      "Scrutinize the contents of the photo and include the location details of the items you talk about.",
      "Analyze what's happening within the photo and provide bounding box info for the referenced objects.",
      "Relate the situation in the photo and include the location details of the items you discuss.",
      "Probe into the photo and provide the boundaries for the included objects.",
      "Gather the meaning of the photo and provide location info for the mentioned nouns.",
      "Resolve the context of the picture and supply the bounding box details for the objects you discuss.",
      "Bring clarity to the situation in the photo and provide the bounding box for the relevant objects.",
      "Unfold the story of the photo and include the bounding box coordinates for the included nouns.",
      "Speaking on the photo, what do you see? Don't forget to include bounding boxes for mentioned objects.",
      "Inform me about the particulars in the photo and provide the bounding box info for the discussed items.",
      "Give me the lowdown on the photo, and include bounding boxes for the objects you discuss.",
      "Share with me the details of the photo and provide the bounding boxes for the nouns mentioned.",
      "Provide a glimpse into the happenings of the photo and include bounding boxes for the involved objects.",
      "Decode the events occurring in the photo and provide the location details for the mentioned items.",
      "Delineate the elements of the photo and include the bounding box for each object discussed.",
      "Explain to me the context of the photo and provide bounding box details for any discussed objects.",
      "Describe the subjects within the photo and include bounding box coordinates for the mentioned objects.",
      "Break down the narrative of the picture and include the boundaries for any related items.",
      "Tell me about the image and provide me with the bboxes of any mentioned objects.",
      "What's the story in the image? Please include bounding boxes for any objects discussed.",
      "Elucidate the context of the image and provide bounding box details for the objects you mention.",
      "Annotate the image with the bounding box coordinates of the objects you discuss during your description.",
      "Examine the image carefully and point out the objects along with their respective bounding boxes.",
      "Dig into the scene on the image and provide the bounding box info for the mentioned items.",
      "Study the photo and cite bounding box coordinates for the subjects you mention.",
      "Inspect the image and give me the coordinates of the bounding box for each mentioned object.",
      "Dive into the details of the picture, and include the bounding boxes for any referenced nouns.",
      "Analyze the photo and provide the boxes of the objects involved.",
      "Take a look at the image and give me the location details for any mentioned items.",
      "Go through the scene, describing its content, and provide bounding boxes for the mentioned nouns.",
      "Evaluate the scene and include the boxes of the items you reference.",
      "Share your perspective of the scene and give the bounding box for each object you discuss.",
      "Get into the specifics of the picture and provide the boxes of the mentioned items.",
      "Quote the happenings unfolding in the frame and provide the bounding box coordinates of related objects.",
      "Shed light on the events in the frame and include the location details of the mentioned items.",
      "Bring out the description of the frame and provide the bounding box for each object you mention.",
      "Dissect the scenario in the frame and include the bounding boxes for any referenced objects."
    ],
  'box_qa_True': [
      "<question> Let's think step by step.",
      "<question> Let's think step by step.",
      "<question> Please include the reasoning process.",
      "<question> Please include the reasoning process.",
      "Using the image as reference, can you answer the following question: <question> Please include the reasoning process.",
      "After examining the image, I'd like to know the answer to this question: <question> Please provide an explanation of your answer.",
      "<question> Can you give me an answer based on the image, along with the reasoning process?",
      "Looking at the image, I need to ask this question '<question>'. Can you answer it and provide the explanation?",
      "After checking out the picture, I have a question: <question> Can you give me an answer with reasoning?",
      "Please have a look at the image and tell me the answer to my question: <question> Don't forget to provide the reasoning.",
      "I want to know the answer to this question: <question> Please refer to the image and give an explanation as well.",
      "Help me understand the answer to the following question based on the image: <question> Remember to explain the reasoning.",
      "Consider the image and answer my question: <question> Be sure to offer reasoning for the answer.",
      "Regarding the image, can you tell me the answer to the question '<question>' and explain your thought process?",
      "Here's an image I need assistance with. What's the answer to the following question: <question> Please provide reasoning.",
      "Can you deduce the answer to question '<question>' after examining the image, along with the reasoning process?",
      "Having a look at image, can you tell me the answer to my question '<question>' and the logic leading to it?",
      "Investigate the image and provide me with the answer to this question: <question> Don't forget to reveal your reasoning.",
      "In reference to the image, I have a question: <question> Can you respond with your answer and an explanation?",
      "If you take a glance at the image, can you give me the answer for my question: <question> and add an explanation?",
      "Centered on the image, please unravel my query: <question> and be sure to involve the reasoning process.",
      "Can you offer an answer to my following inquiry: <question> Make sure to examine the image and clarify your reasoning.",
      "Looking at image, would you provide an answer to the question '<question>'? Kindly include your thought process as well.",
      "Upon analyzing the image, please find the answer to my question '<question>' and provide a detailed explanation.",
      "Please provide a solution to my question: <question> First, examine the image and then walk me through your reasoning.",
      "After inspecting the picture thoroughly, kindly furnish the answer to the query: <question> and provide the reasoning.",
      "Carefully observe the image and provide me with a well-reasoned answer to the question '<question>'.",
      "Focusing on the image, please offer an answer to my question '<question>' along with the reasoning process.",
      "Evaluate the image and let me know your answer regarding this question '<question>'. Include your thinking process as well.",
      "Keeping the image in mind, please help me with the following question: <question> and explain the reasoning process.",
      "Give your observation on the image and your response to the question '<question>', along with a clear reasoning explanation.",
      "Based on the image, kindly address my query: <question> Remember to elucidate the reasoning process.",
      "In view of the image, could you please respond to the question '<question>' and provide the reasoning process?",
      "Deliberate on the image and enlighten me with an answer to the question '<question>' including the reasoning process.",
      "Please share your insights on the image by answering the question '<question>'. Do illustrate your reasoning process.",
      "Examine the following image closely and provide the answer to my question: <question> Do include the thinking process.",
      "Critique the image and furnish the answer to my question '<question>', along with a thorough reasoning.",
      "Please analyze the image and supply an answer to the following query: <question> Ensure to elucidate the justifying process.",
      "Scrutinize the image and help me with the answer to this question: <question> and explain your deduction methodology.",
      "Please answer the following question '<question>' based on the image, and describe your thought process."
    ],
  'box_qa_False': [
      "Please briefly answer: <question>",
      "Can you give a concise response to: <question>",
      "In relation to the image, provide a short answer for: <question>",
      "<question> - I need a succinct reply, please.",
      "Could you offer a brief explanation for: <question>",
      "After looking at the image, quickly answer: <question>",
      "I'm looking for a short response to: <question>",
      "Based on the image, can you sum up your answer for: <question>",
      "Without going into detail, answer: <question>",
      "Briefly, what's your take on: <question>",
      "Considering the image, please keep your answer brief for: <question>",
      "Quickly tell me about: <question>",
      "I don't need a lengthy explanation, just a quick answer to: <question>",
      "Can you keep it brief and answer: <question>",
      "<question> - I'm hoping for a brief response.",
      "Without delving too deep, please reply to: <question>",
      "Just a short answer will do for: <question>",
      "Briefly elaborate on: <question>",
      "For <question>, please keep your answer concise.",
      "Simply put, how would you respond to: <question>",
      "I'm in a rush, so a brief answer to <question> would be appreciated.",
      "Your quick thoughts on: <question>",
      "A concise reply for: <question>, please.",
      "In light of the image, briefly explain: <question>",
      "No need for details, just answer: <question>",
      "Cut to the chase, what's your take on: <question>",
      "<question> - A short explanation, if you will.",
      "Trim the details, I just need an answer for: <question>",
      "Quick and concise, please answer: <question>",
      "For <question>, a succinct response would be great."
  ]
}

# English phrasings for asking how to carry out a task. Every entry contains
# the literal placeholder "<TASK>"; presumably the consumer picks one entry and
# replaces the placeholder with a task description (the consuming code is not
# part of this module section -- confirm against the caller).
en_template_task = [
    "Can you advise me on how to <TASK>?",
    "I'm looking for guidance on how to <TASK>.",
    "What steps do I need to take to <TASK>?",
    "Could you provide instructions for <TASK>?",
    "I'm wondering what the process is for <TASK>.",
    "How can I go about <TASK>?",
    "I need assistance with planning to <TASK>.",
    "Do you have any recommendations for <TASK>?",
    "Please share some tips for <TASK>.",
    "I'd like to know the best way to <TASK>.",
    "What's the most effective way to <TASK>?",
    "I'm seeking advice on accomplishing <TASK>.",
    "Could you guide me through the steps to <TASK>?",
    "I'm unsure how to start with <TASK>.",
    "Is there a strategy for successfully <TASK>?",
    "What's the proper procedure for <TASK>?",
    "How should I prepare for <TASK>?",
    "I'm not sure where to begin with <TASK>.",
    "I need some insights on <TASK>.",
    "Can you explain how to tackle <TASK>?",
    "I'm interested in the process of <TASK>.",
    "Could you enlighten me on <TASK>?",
    "What are the recommended steps for <TASK>?",
    "Is there a preferred method for <TASK>?",
    "I'd appreciate your advice on <TASK>.",
    "Can you shed light on <TASK>?",
    "What would be the best approach to <TASK>?",
    "How do I get started with <TASK>?",
    "I'm inquiring about the procedure for <TASK>.",
    "Could you share your expertise on <TASK>?",
    "I'd like some guidance on <TASK>.",
    "What's your recommendation for <TASK>?",
    "I'm seeking your input on how to <TASK>.",
    "Can you provide some insights into <TASK>?",
    "How can I successfully accomplish <TASK>?",
    "What steps are involved in <TASK>?",
    "I'm curious about the best way to <TASK>.",
    "Could you show me the ropes for <TASK>?",
    "I need to know how to go about <TASK>.",
    "What are the essential steps for <TASK>?",
    "Is there a specific method for <TASK>?",
    "I'd like to get some advice on <TASK>.",
    "Can you explain the process of <TASK>?",
    "I'm looking for guidance on how to approach <TASK>.",
    "What's the proper way to handle <TASK>?",
    "How should I proceed with <TASK>?",
    "I'm interested in your expertise on <TASK>.",
    "Could you walk me through the steps for <TASK>?",
    "I'm not sure where to begin when it comes to <TASK>.",
    "What should I prioritize when doing <TASK>?",
    "How can I ensure success with <TASK>?",
    "I'd appreciate some tips on <TASK>.",
    "Can you provide a roadmap for <TASK>?",
    "What's the recommended course of action for <TASK>?",
    "I'm seeking your guidance on <TASK>.",
    "Could you offer some suggestions for <TASK>?",
    "I'd like to know the steps to take for <TASK>.",
    "What's the most effective way to achieve <TASK>?",
    "How can I make the most of <TASK>?",
    "I'm wondering about the best approach to <TASK>.",
    "Can you share your insights on <TASK>?",
    "What steps should I follow to complete <TASK>?",
    "I'm looking for advice on <TASK>.",
    "What's the strategy for successfully completing <TASK>?",
    "How should I prepare myself for <TASK>?",
    "I'm not sure where to start with <TASK>.",
    "What's the procedure for <TASK>?",
    "Could you provide some guidance on <TASK>?",
    "I'd like to get some tips on how to <TASK>.",
    "Can you explain how to tackle <TASK> step by step?",
    "I'm interested in understanding the process of <TASK>.",
    "What are the key steps to <TASK>?",
    "Is there a specific method that works for <TASK>?",
    "I'd appreciate your advice on successfully completing <TASK>.",
    "Can you shed light on the best way to <TASK>?",
    "What would you recommend as the first step to <TASK>?",
    "How do I initiate <TASK>?",
    "I'm inquiring about the recommended steps for <TASK>.",
    "Could you share some insights into <TASK>?",
    "I'm seeking your expertise on <TASK>.",
    "What's your recommended approach for <TASK>?",
    "I'd like some guidance on where to start with <TASK>.",
    "Can you provide recommendations for <TASK>?",
    "What's your advice for someone looking to <TASK>?",
    "I'm seeking your input on the process of <TASK>.",
    "How can I achieve success with <TASK>?",
    "What's the best way to navigate <TASK>?",
    "I'm curious about the steps required for <TASK>.",
    "Could you show me the proper way to <TASK>?",
    "I need to know the necessary steps for <TASK>.",
    "What's the most efficient method for <TASK>?",
    "I'd appreciate your guidance on <TASK>.",
    "Can you explain the steps involved in <TASK>?",
    "I'm looking for recommendations on how to approach <TASK>.",
    "What's the right way to handle <TASK>?",
    "How should I manage <TASK>?",
    "I'm interested in your insights on <TASK>.",
    "Could you provide a step-by-step guide for <TASK>?",
    "I'm not sure how to start when it comes to <TASK>.",
    "What are the key factors to consider for <TASK>?",
    "How can I ensure a successful outcome with <TASK>?",
    "I'd like some tips and tricks for <TASK>.",
    "Can you offer a roadmap for accomplishing <TASK>?",
    "What's the preferred course of action for <TASK>?",
    "I'm seeking your expert advice on <TASK>.",
    "Could you suggest some best practices for <TASK>?",
    "I'd like to understand the necessary steps to complete <TASK>.",
    "What's the most effective strategy for <TASK>?",
]

# English question layouts. Each entry begins with the "<img><Image></img>"
# marker and carries a single "{}" slot (presumably filled with the question
# text via str.format -- confirm against the caller). The bare "{}" layout is
# listed twice, which would weight it more heavily under uniform sampling;
# this looks intentional, so both copies are kept.
question_en = [
    "<img><Image></img> {} A:",
    "<img><Image></img> {} Answer:",
    "<img><Image></img> {} The answer is:",
    "<img><Image></img> {}",
    "<img><Image></img> {}",
    "<img><Image></img> Q: {} A:",
    "<img><Image></img> Question: {} Answer:",
]

# Chinese question layouts, parallel to the English list: every entry begins
# with the "<img><Image></img>" marker and carries a single "{}" slot
# (presumably filled with the question text via str.format -- confirm against
# the caller). The bare "{}" layout appears twice and one layout ends with a
# trailing space; both are kept exactly as they were.
question_cn = [
    "<img><Image></img> {} 答：",
    "<img><Image></img> {} 答案是：",
    "<img><Image></img> {}",
    "<img><Image></img> {}",
    "<img><Image></img> 问：{} 答：",
    "<img><Image></img> 问：{} ",
    "<img><Image></img> Q: {} A:",
    "<img><Image></img> {} A:",
]

================================================
FILE: utils/utils/vision.py
================================================
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import torch

class BlipImageEvalProcessor:
    """Callable wrapper around a fixed resize -> to-tensor -> normalize pipeline.

    Args:
        image_size: Edge length; the image is resized to
            ``(image_size, image_size)`` with bicubic interpolation.
        mean: Values handed to ``transforms.Normalize`` as the mean. When
            ``None`` a built-in three-element default is used.
        std: Values handed to ``transforms.Normalize`` as the std. When
            ``None`` a built-in three-element default is used.

    Attributes are ``normalize`` (the Normalize transform on its own) and
    ``transform`` (the full composed pipeline).
    """

    def __init__(self, image_size=384, mean=None, std=None):
        super().__init__()
        # Only an explicit None triggers the defaults; any other value
        # (including an empty sequence) is passed through untouched.
        mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean
        std = (0.26862954, 0.26130258, 0.27577711) if std is None else std

        self.normalize = transforms.Normalize(mean, std)

        target_size = (image_size, image_size)
        resize_step = transforms.Resize(
            target_size, interpolation=InterpolationMode.BICUBIC
        )
        tensor_step = transforms.ToTensor()
        self.transform = transforms.Compose(
            [resize_step, tensor_step, self.normalize]
        )

    def __call__(self, item):
        """Apply the composed pipeline to ``item`` and return the result."""
        return self.transform(item)

from functools import partial

def blip2_image_processor_func_with_inputs(image_processor, image):
    """Process ``image`` and bundle it with fixed placeholder text inputs.

    Args:
        image_processor: Callable applied to ``image``; its result must
            support ``.unsqueeze(0)``.
        image: The raw image handed to ``image_processor``.

    Returns:
        A dict with keys ``'image'`` (the processed image with a new leading
        dimension), ``'input_ids'`` (a 1x1 ``torch.long`` tensor of zeros),
        ``'position_ids'`` (``None``) and ``'attention_mask'`` (a 1x1
        ``torch.long`` tensor of ones).
    """
    bundle = {}
    bundle['image'] = image_processor(image).unsqueeze(0)
    bundle['input_ids'] = torch.zeros(1, 1, dtype=torch.long)
    bundle['position_ids'] = None
    bundle['attention_mask'] = torch.ones(1, 1, dtype=torch.long)
    return bundle

def get_image_processor(image_size):
    """Build a one-argument image processor for the given edge length.

    Creates a ``BlipImageEvalProcessor(image_size)`` and binds it as the first
    argument of ``blip2_image_processor_func_with_inputs``, so the returned
    ``functools.partial`` only needs the image.
    """
    eval_processor = BlipImageEvalProcessor(image_size)
    return partial(blip2_image_processor_func_with_inputs, eval_processor)