Full Code of shufangxun/LLaVA-MoD for AI

Repository: shufangxun/LLaVA-MoD
Branch: main
Commit: 481ca57856c8
Files: 181
Total size: 9.9 MB

Directory structure:
gitextract_kubh4hae/

├── LICENSE
├── README.md
├── docs/
│   ├── INFERENCE.md
│   └── TRAIN_EVAL.md
├── llavamod/
│   ├── __init__.py
│   ├── config/
│   │   ├── __init__.py
│   │   ├── args.py
│   │   └── dpconfig/
│   │       ├── dpo_zero2.json
│   │       ├── dpo_zero2_offload.json
│   │       ├── zero2.json
│   │       ├── zero2_offload.json
│   │       ├── zero3.json
│   │       └── zero3_offload.json
│   ├── constants.py
│   ├── conversation.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── data_utils.py
│   │   └── dataset.py
│   ├── eval/
│   │   ├── __init__.py
│   │   ├── eval_gpt_mmhal.py
│   │   ├── eval_gpt_objhal.py
│   │   ├── eval_gpt_review.py
│   │   ├── eval_gpt_review_bench.py
│   │   ├── eval_gpt_review_visual.py
│   │   ├── eval_gqa.py
│   │   ├── eval_gqa_1.py
│   │   ├── eval_pope.py
│   │   ├── eval_science_qa.py
│   │   ├── eval_science_qa_gpt4.py
│   │   ├── eval_science_qa_gpt4_requery.py
│   │   ├── eval_textvqa.py
│   │   ├── generate_webpage_data_from_table.py
│   │   ├── gpt4_grpc.py
│   │   ├── m4c_evaluator.py
│   │   ├── model_qa.py
│   │   ├── model_vqa.py
│   │   ├── model_vqa_loader.py
│   │   ├── model_vqa_mmbench.py
│   │   ├── model_vqa_mmhal.py
│   │   ├── model_vqa_objhal.py
│   │   ├── model_vqa_qbench.py
│   │   ├── model_vqa_science.py
│   │   ├── qa_baseline_gpt35.py
│   │   ├── run_llava.py
│   │   ├── summarize_gpt_review.py
│   │   ├── table/
│   │   │   ├── answer/
│   │   │   │   ├── answer_alpaca-13b.jsonl
│   │   │   │   ├── answer_bard.jsonl
│   │   │   │   ├── answer_gpt35.jsonl
│   │   │   │   ├── answer_llama-13b.jsonl
│   │   │   │   └── answer_vicuna-13b.jsonl
│   │   │   ├── caps_boxes_coco2014_val_80.jsonl
│   │   │   ├── model.jsonl
│   │   │   ├── prompt.jsonl
│   │   │   ├── question.jsonl
│   │   │   ├── results/
│   │   │   │   ├── test_sqa_llava_13b_v0.json
│   │   │   │   └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json
│   │   │   ├── review/
│   │   │   │   ├── review_alpaca-13b_vicuna-13b.jsonl
│   │   │   │   ├── review_bard_vicuna-13b.jsonl
│   │   │   │   ├── review_gpt35_vicuna-13b.jsonl
│   │   │   │   └── review_llama-13b_vicuna-13b.jsonl
│   │   │   ├── reviewer.jsonl
│   │   │   └── rule.json
│   │   └── webpage/
│   │       ├── index.html
│   │       ├── script.js
│   │       └── styles.css
│   ├── mm_utils.py
│   ├── model/
│   │   ├── __init__.py
│   │   ├── apply_delta.py
│   │   ├── builder.py
│   │   ├── cache_utils.py
│   │   ├── consolidate.py
│   │   ├── import_utils.py
│   │   ├── language_model/
│   │   │   ├── gemma2/
│   │   │   │   ├── configuration_gemma2.py
│   │   │   │   ├── modeling_gemma2.py
│   │   │   │   ├── tokenization_gemma2.py
│   │   │   │   └── tokenization_gemma2_fast.py
│   │   │   ├── llama/
│   │   │   │   ├── configuration_llama.py
│   │   │   │   └── modeling_llama.py
│   │   │   ├── llava_gemma2.py
│   │   │   ├── llava_gemma2_moe.py
│   │   │   ├── llava_llama.py
│   │   │   ├── llava_llama_moe.py
│   │   │   ├── llava_minicpm.py
│   │   │   ├── llava_minicpm_moe.py
│   │   │   ├── llava_mistral.py
│   │   │   ├── llava_mistral_moe.py
│   │   │   ├── llava_mpt.py
│   │   │   ├── llava_phi.py
│   │   │   ├── llava_phi_moe.py
│   │   │   ├── llava_qwen.py
│   │   │   ├── llava_qwen1_5.py
│   │   │   ├── llava_qwen1_5_moe.py
│   │   │   ├── llava_qwen2.py
│   │   │   ├── llava_qwen2_moe.py
│   │   │   ├── llava_qwen_moe.py
│   │   │   ├── llava_stablelm.py
│   │   │   ├── llava_stablelm_moe.py
│   │   │   ├── minicpm/
│   │   │   │   ├── configuration_minicpm.py
│   │   │   │   └── modeling_minicpm.py
│   │   │   ├── mpt/
│   │   │   │   ├── adapt_tokenizer.py
│   │   │   │   ├── attention.py
│   │   │   │   ├── blocks.py
│   │   │   │   ├── configuration_mpt.py
│   │   │   │   ├── custom_embedding.py
│   │   │   │   ├── flash_attn_triton.py
│   │   │   │   ├── hf_prefixlm_converter.py
│   │   │   │   ├── meta_init_context.py
│   │   │   │   ├── modeling_mpt.py
│   │   │   │   ├── norm.py
│   │   │   │   └── param_init_fns.py
│   │   │   ├── phi/
│   │   │   │   ├── configuration_phi.py
│   │   │   │   └── modeling_phi.py
│   │   │   ├── qwen/
│   │   │   │   ├── configuration_qwen.py
│   │   │   │   ├── cpp_kernels.py
│   │   │   │   ├── modeling_qwen.py
│   │   │   │   ├── qwen_generation_utils.py
│   │   │   │   └── tokenization_qwen.py
│   │   │   ├── qwen1_5/
│   │   │   │   ├── configuration_qwen2.py
│   │   │   │   └── modeling_qwen2.py
│   │   │   ├── qwen2/
│   │   │   │   ├── configuration_qwen2.py
│   │   │   │   └── modeling_qwen2.py
│   │   │   └── stablelm/
│   │   │       ├── configuration_stablelm_epoch.py
│   │   │       ├── modeling_stablelm_epoch.py
│   │   │       └── tokenization_arcade100k.py
│   │   ├── llava_arch.py
│   │   ├── make_delta.py
│   │   ├── modeling_flash_attention_utils.py
│   │   ├── multimodal_encoder/
│   │   │   ├── builder.py
│   │   │   ├── clip_encoder.py
│   │   │   ├── clips2_encoder.py
│   │   │   └── siglip_encoder.py
│   │   ├── multimodal_projector/
│   │   │   ├── builder.py
│   │   │   ├── pool_block.py
│   │   │   ├── qformer.py
│   │   │   └── simple_block.py
│   │   └── utils.py
│   ├── serve/
│   │   ├── __init__.py
│   │   ├── cli.py
│   │   └── utils.py
│   ├── train/
│   │   ├── __init__.py
│   │   ├── align_train.py
│   │   ├── align_trainer.py
│   │   ├── dpo_train.py
│   │   ├── dpo_trainer.py
│   │   ├── llava_trainer.py
│   │   ├── train.py
│   │   └── train_utils.py
│   └── utils.py
├── requirements.txt
├── scripts/
│   ├── activated_params.py
│   ├── convert_gqa_for_eval.py
│   ├── convert_mmbench_for_submission.py
│   ├── convert_mmvet_for_eval.py
│   ├── convert_seed_for_submission.py
│   ├── convert_sqa_to_llava.py
│   ├── convert_sqa_to_llava_base_prompt.py
│   ├── convert_vizwiz_for_submission.py
│   ├── convert_vqav2_for_submission.py
│   ├── extract_mm_projector.py
│   ├── kill.py
│   ├── merge_lora_weights.py
│   ├── merge_moe_lora_weights.py
│   ├── sqa_eval_batch.sh
│   └── sqa_eval_gather.sh
└── shells/
    ├── eval/
    │   ├── gqa.sh
    │   ├── mmbench.sh
    │   ├── mmbench_cn.sh
    │   ├── mme.sh
    │   ├── mmhal.sh
    │   ├── objhal.sh
    │   ├── pope.sh
    │   ├── sqa.sh
    │   ├── textvqa.sh
    │   └── vizwiz.sh
    ├── inference/
    │   └── cli_inference.sh
    └── train/
        └── qwen/
            ├── dense2dense_distillation.sh
            ├── dense2sparse_distillation.sh
            ├── finetune.sh
            ├── finetune_moe.sh
            ├── preference_distillation.sh
            └── pretrain.sh

================================================
FILE CONTENTS
================================================

================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# <div align="center">LLaVA-MoD: Making LLaVA Tiny via MoE Knowledge Distillation</div>

<div align="center">
<!-- [![arXiv](https://img.shields.io/badge/Arxiv-2408.15881-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2408.15881) -->
  <a href="[https://arxiv.org/abs/2408.15881](https://arxiv.org/abs/2408.15881)"><img src="https://img.shields.io/static/v1?label=Paper&message=Arxiv:LLaVA-MOD&color=red&logo=arxiv"></a>
  
  [![License](https://img.shields.io/badge/License-Apache%202.0-yellow)](https://github.com/shufangxun/LLaVA-MoD/blob/main/LICENSE)
</div>

---

## 📢 News
- Jan 23, 2025: 🔥 LLaVA-MoD is accepted at ICLR 2025.
- Dec 24, 2024: 🏋️‍♀️ Training and evaluation code is released.
- Aug 28, 2024: 🤗 LLaVA-MoD is featured on [Huggingface Daily Papers](https://huggingface.co/papers/2408.15881).
- Aug 28, 2024: 📖 Paper is available on [arXiv](https://arxiv.org/pdf/2408.15881).

🌟 **Star us if you think it's helpful.** Your support means a lot! ⭐️

---

## ✨ Contents

- [🧭 Overview](#-overview)
- [🛠️ Installation](#-installation)
- [🗂️ Data Construction](#-data-construction)
- [🏋️‍♂️ Training and Evaluation](#-training-and-evaluation)
- [🚀 Inference](#-inference)
- [📖 Citation](#-citation)
- [🏆 Acknowledgement](#-acknowledgement)
- [📄 License](#-license)

---

## 🧭 Overview

**TL;DR:** LLaVA-MoD is an efficient framework for training small-scale Multimodal Language Models by distilling knowledge from larger models.

<details>
  <summary>🚀 <b>CLICK for the full abstract</b></summary>

We introduce **LLaVA-MoD**, a novel framework designed to enable the efficient training of small-scale Multimodal Language Models by distilling knowledge from large-scale MLLM. Our approach addresses two fundamental challenges in MLLM distillation:

- **Network Optimization**: We enhance the s-MLLM structure by integrating a sparse Mixture of Experts (MoE) architecture, balancing computational efficiency and model expressiveness.
  
- **Progressive Knowledge Transfer**: We propose a two-stage transfer strategy:
  1. **Mimic Distillation**: Minimizing Kullback-Leibler (KL) divergence between output distributions to help the student model emulate the teacher's understanding.
  2. **Preference Distillation**: Using Direct Preference Optimization (DPO), where the student model learns to outperform the teacher, especially in hallucination benchmarks.

Extensive experiments show **LLaVA-MoD** outperforms existing models across multimodal benchmarks while activating only a minimal number of parameters and keeping computational costs low. With **only 2B activated parameters**, **LLaVA-MoD** surpasses **Qwen-VL-Chat-7B** by an average of **8.8%**, using merely **0.3% of the training data** and **23% trainable parameters**.

These results highlight **LLaVA-MoD**’s success in distilling comprehensive knowledge from its teacher model, making it a groundbreaking solution for developing more efficient MLLMs.
</details>

---

## 🛠️ Installation

1. First install `anaconda`, then install `torch`. We recommend `torch==2.1.2` with CUDA 11.8.

```bash
# CUDA 11.8
pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118
```

2. Then install the packages in `requirements.txt`:
```bash
pip install -r requirements.txt
```
---

## 🗂️ Data Construction
### 📚 Mimic Distillation
We follow LLaVA to construct the data in the following format:
```json
{
  "id": "000000052846",
  "image": "COCO2017/train/000000052846.jpg",
  "conversations": [
     {
        "from": "human",
        "value": "Where is the cat positioned in the image?\n<image>"
     },
     {
        "from": "gpt",
        "value": "The cat is positioned on top of the back of the couch in the living room."
     },
     {
        "from": "human",
        "value": "What is the cat doing in the image?"
     },
     {
        "from": "gpt",
        "value": "The cat is coming out from some curtains onto the couch and is sitting or standing on top of it."
     }
  ]
}
```
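
For reference, here is a minimal Python sketch (ours, not part of the repo; the function name and path argument are illustrative) that loads data in this format and applies the sanity checks implied by the example:

```python
# Minimal loader sketch for the LLaVA-style conversation format above.
# Not part of the repo; names and checks are illustrative assumptions.
import json

def load_conversations(path: str):
    with open(path, "r", encoding="utf-8") as f:
        samples = json.load(f)  # a JSON list of samples like the example
    for sample in samples:
        turns = sample["conversations"]
        # Turns alternate human -> gpt, starting with the human.
        assert all(
            turn["from"] == ("human" if i % 2 == 0 else "gpt")
            for i, turn in enumerate(turns)
        ), f"bad turn order in sample {sample['id']}"
        # Image samples carry exactly one <image> placeholder in the text.
        if "image" in sample:
            n_tags = sum(turn["value"].count("<image>") for turn in turns)
            assert n_tags == 1, f"expected one <image> tag in {sample['id']}"
    return samples
```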

### 🧠 Preference Distillation
We follow RLAIF-V to construct the data in the following format:
```json
{
    "id": "OmniLMM-12B_OmniLMM-12B_0",
    "image": "RLAIF-V-Dataset_000_OmniLMM-12B_OmniLMM-12B_0.jpg",
    "chosen": [
      {
        "from": "human",
        "value": "<image>\nWho is more likely to use these tools a leather crafter or a paper crafter?"
      },
      {
        "from": "gpt",
        "value": "A leather crafter is more likely to use these tools. The image shows various crafting tools, including scissors and a hole punch, which are commonly used in leatherworking projects. Leather is a material that requires cutting, shaping, and precise hole-punching techniques to create desired designs or patterns. In contrast, paper crafters typically use different types of tools, such as adhesives, decorative papers, or specialized cutting machines like the Silhouette Cameo, for their projects."
      }
    ],
    "rejected": [
      {
        "from": "human",
        "value": "<image>\nWho is more likely to use these tools a leather crafter or a paper crafter?"
      },
      {
        "from": "gpt",
        "value": "A leather crafter is more likely to use these tools as they consist of a hole punch, scissors, and a knife. These items are typically used in crafting projects involving fabric or leather materials for various designs and patterns. Paper crafters may also benefit from some of these tools, but their primary focus would be on paper-related projects, which might require different types of tools such as paper cutters or scrapbooking supplies."
      }
    ]
}
```
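
A DPO-style trainer typically consumes (prompt, chosen, rejected) triples. Here is a minimal sketch (ours, not the repo's data pipeline) that flattens records in the format above; key names follow the example:

```python
# Flatten RLAIF-V-style preference records into DPO triples.
# Illustrative sketch; not part of the repo.
import json

def to_dpo_triples(path: str):
    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)
    triples = []
    for rec in records:
        # Both branches share the same human prompt and differ in the reply.
        prompt = rec["chosen"][0]["value"]
        assert rec["rejected"][0]["value"] == prompt, rec["id"]
        triples.append({
            "image": rec["image"],
            "prompt": prompt,
            "chosen": rec["chosen"][1]["value"],
            "rejected": rec["rejected"][1]["value"],
        })
    return triples
```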

---

## 🏋️‍♂️ Training and Evaluation
The full details for training and evaluation can be found in the [TRAIN_EVAL.md](docs/TRAIN_EVAL.md).

---

## 🚀 Inference
For instructions on inference, please refer to the [INFERENCE.md](docs/INFERENCE.md).

---

## 📖 Citation
If you find our project useful for your research and applications, please star it and cite the paper using this BibTeX:
```BibTeX
@article{shu2024llavamod,
  title={LLaVA-MoD: Making LLaVA Tiny via MoE Knowledge Distillation},
  author={Shu, Fangxun and Liao, Yue and Zhuo, Le and Xu, Chenning and Zhang, Lei and Zhang, Guanghao and Shi, Haonan and Chen, Long and Zhong, Tao and He, Wanggui and Fu, Siming and others},
  journal={arXiv preprint arXiv:2408.15881},
  year={2024}
}
```
---

## 🏆 Acknowledgement
Our project is built upon [MoE-LLaVA](https://github.com/PKU-YuanGroup/MoE-LLaVA) and [LLaVA](https://github.com/haotian-liu/LLaVA). We are deeply grateful for the excellent codebase they provide. Additionally, we express our appreciation to [MobileVLM](https://github.com/Meituan-AutoML/MobileVLM) and [RLAIF-V](https://github.com/RLHF-V/RLAIF-V) for their meticulously processed datasets. Their contributions have been of immeasurable value in shaping our work.

---

## 📄 License
Our project is released under the Apache 2.0 license.



================================================
FILE: docs/INFERENCE.md
================================================
## Inference

We provide command-line and batch inference scripts.

### CLI Inference
```Shell
deepspeed --include localhost:0 --master_port 20019 llavamod/serve/cli.py \
  --model-path ${MODEL_PATH} --image-file ${IMAGE_FILE}
```
### Batch Inference
```Shell
deepspeed --master_port 20014 llavamod/eval/model_vqa.py \
    --model-path ${MODEL_PATH} \
    --question-file ${QUESTION_FILE} \
    --image-folder ${IMAGE_FOLDER} \
    --answers-file ${ANSWER_FILE} \
    --temperature 0.0 \
    --conv-mode qwen
```

================================================
FILE: docs/TRAIN_EVAL.md
================================================
## Preliminary
### Download Pretrained Checkpoints
We use [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) as the vision encoder for both teacher and student models. Additionally, we use [Qwen-1.5](https://huggingface.co/collections/Qwen/qwen15-65c0a2f577b1ecb76d786524) / [Qwen-2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f) of different sizes as the LLMs for the teacher and student models. These pretrained checkpoints can be downloaded from HuggingFace.

### Prepare Teacher Model
We follow the approach of [LLaVA-1.5](https://github.com/haotian-liu/LLaVA) to train the teacher model, replacing Vicuna-1.5-7B with Qwen-2-7B, while keeping the training dataset and strategy unchanged. 

## Training 
The training of LLaVA-MoD comprises three stages:
- Adaptor Initialization: 0.6 million general captioning samples are employed to bridge the gap between visual and language modalities.
- Mimic Distillation: 
  - Dense-to-Dense Distillation: 2.4 million general captioning and conversation samples are utilized to distill general knowledge.
  - Dense-to-Sparse Distillation: 1.4 million multi-task data, including VQA, documents, science, and OCR, are used to distill specialized knowledge.
- Preference Distillation: 80,000 preference samples are utilized to distill preference knowledge.

### Adaptor Initialization
- first, download the caption dataset [LLaVA-Pretrain](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain) 
- then run the following script:
```shell
bash shells/train/qwen/pretrain.sh
```

### Mimic Distillation
In this stage, we initially conduct Dense-to-Dense Distillation on the dense student model. Subsequently, we up-cycle the student model from dense to sparse and conduct Dense-to-Sparse Distillation.
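
Conceptually, up-cycling clones the pretrained dense FFN into several identical experts and adds a learned top-k router, so the sparse model initially matches the dense one. The following PyTorch sketch illustrates the idea under the `NUM_EXPERTS`/`TOP_K_EXPERTS` settings used below; it is our simplification and omits the capacity and load-balancing machinery (`ep_size`, `capacity_factor`, the auxiliary router loss in `llavamod/config/args.py`) that the actual MoE layers handle:

```python
# Illustrative dense-to-sparse up-cycling sketch (not the repo's MoE layer).
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F

class UpcycledMoE(nn.Module):
    def __init__(self, dense_ffn: nn.Module, hidden_size: int,
                 num_experts: int = 4, top_k: int = 2):
        super().__init__()
        # Every expert starts as an exact copy of the pretrained dense FFN.
        self.experts = nn.ModuleList(
            copy.deepcopy(dense_ffn) for _ in range(num_experts))
        self.router = nn.Linear(hidden_size, num_experts, bias=False)
        self.top_k = top_k

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq, hidden). Every expert is evaluated densely for
        # clarity; real MoE layers dispatch only the routed tokens.
        weights, idx = self.router(x).topk(self.top_k, dim=-1)
        weights = F.softmax(weights, dim=-1)
        out = torch.zeros_like(x)
        expert_outs = [expert(x) for expert in self.experts]
        for k in range(self.top_k):
            for e, expert_out in enumerate(expert_outs):
                mask = (idx[..., k] == e).unsqueeze(-1)
                out = out + mask * weights[..., k:k + 1] * expert_out
        return out
```

Initializing all experts from the dense FFN preserves the dense checkpoint's behavior at step 0; the router then learns to specialize the experts during Dense-to-Sparse Distillation.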

#### Dense-to-Dense Distillation 
- first, download general caption datasets ([ShareGPT4V-Captioner](https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/share-captioner_coco_lcs_sam_1246k_1107.json) and [ALLaVA-Caption-LAION-4V](https://huggingface.co/datasets/FreedomIntelligence/ALLaVA-4V/blob/main/allava_laion/ALLaVA-Caption-LAION-4V.json)) and general conversation datasets ([SViT](https://github.com/BAAI-DCAI/Visual-Instruction-Tuning), [LVIS](https://github.com/X2FD/LVIS-INSTRUCT4V), [LRV](https://github.com/FuxiaoLiu/LRV-Instruction), [MIMIC-IT](https://github.com/Luodian/Otter)). The general datasets have also been packaged and can be downloaded from [MoE-LLaVA](https://huggingface.co/datasets/LanguageBind/MoE-LLaVA).
- then, set the distillation and model configuration:
```python
# KD config
POLICY_MODEL_TYPE='dense'
REF_MODEL_TYPE='dense'
LOSS_TYPE='only_kd'  # kd_lm | only_kd
DISTILL_ALL_TOKENS=False  # False: only response, True: multimodal instruction + response

# MoE config
MOE_LOSS_ENABLE=False
MOE_ENABLE=False
MOE_FINETUNE=False
MOE_MODE="sparse"
NUM_EXPERTS=4
TOP_K_EXPERTS=2
USE_RESIDUAL=False
ROUTER_AUX_LOSS_COEF=0.01
CAPACITY_FACTOR=1.5
```
- finally, run the following script:
```shell
bash shells/train/qwen/dense2dense_distillation.sh
```
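
For orientation, `LOSS_TYPE='only_kd'` corresponds to a KL divergence between the teacher's and student's next-token distributions, restricted to response tokens when `DISTILL_ALL_TOKENS=False`. A hedged sketch of that objective (our reconstruction from the description above, not the repo's exact loss code):

```python
# Sketch of the mimic-distillation (only_kd) objective; illustrative only.
import torch
import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, labels, ignore_index=-100):
    # Logits: (batch, seq, vocab); labels: (batch, seq). With
    # DISTILL_ALL_TOKENS=False, instruction tokens carry ignore_index
    # and are excluded, so only response tokens are distilled.
    mask = labels.ne(ignore_index)
    log_p_student = F.log_softmax(student_logits, dim=-1)
    p_teacher = F.softmax(teacher_logits, dim=-1)
    # Forward KL(teacher || student) per position, summed over the vocab.
    kl = (p_teacher * (p_teacher.clamp_min(1e-8).log() - log_p_student)).sum(-1)
    return (kl * mask).sum() / mask.sum().clamp_min(1)
```

With `LOSS_TYPE='kd_lm'`, the standard language-modeling cross-entropy is presumably added alongside this KL term.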

#### Dense-to-Sparse Distillation
- first, download multi-task datasets ([Text-VQA](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip), 
  [IConQA](https://drive.google.com/file/d/1Xqdt1zMcMZU5N_u1SAIjk-UAclriynGx/edit), [SQA](https://drive.google.com/drive/folders/1w8imCXWYn2LxajmGeGH_g5DaL2rabHev), [SBU](https://huggingface.co/datasets/sbu_captions), follow [ShareGPT4V](https://github.com/InternLM/InternLM-XComposer/blob/main/projects/ShareGPT4V/docs/Data.md) to download images from:
  [LAION-CC-SBU-558K](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/blob/main/images.zip), [COCO](http://images.cocodataset.org/zips/train2017.zip), [WebData](https://drive.google.com/drive/folders/1tCUQ-sq6vdshZVkF0ZeF3K4eztkXJgax?usp=sharing), [SAM](https://drive.google.com/file/d/1dKumdOKSXtV7lIXdrG7jsIK_z2vZv2gs/view?usp=drive_link), [GQA](https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip), [OCR-VQA](https://drive.google.com/drive/folders/1_GYPY5UkUy7HIcR0zq3ZCFgeZN7BAfm_?usp=sharing), [TextVQA](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip), [VisualGnome](https://cs.stanford.edu/people/rak248/VG_100K_2) ([Part1](https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip), [Part2](https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip)), follow [InternVL](https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data) to download [DVQA](https://github.com/kushalkafle/DVQA_dataset), [ChartQA](https://github.com/vis-nlp/ChartQA), [AI2D](https://allenai.org/data/diagrams), [DocVQA](https://www.docvqa.org/datasets), [GeoQA+](https://github.com/SCNU203/GeoQA-Plus), and [SynthDoG-EN](https://huggingface.co/datasets/naver-clova-ix/synthdog-en)). The json files have also been packaged and can be downloaded from [MobileVLM](https://huggingface.co/datasets/mtgv/MobileVLM_V2_FT_Mix2M) and [InternVL](https://huggingface.co/datasets/OpenGVLab/InternVL-Chat-V1-2-SFT-Data).
- then, set the distillation and model configuration:
```python
# KD config
POLICY_MODEL_TYPE='dense'
REF_MODEL_TYPE='dense'
LOSS_TYPE='only_kd'  # kd_lm | only_kd
DISTILL_ALL_TOKENS=False  # False: only response, True: multimodal instruction + response

# MoE config
MOE_LOSS_ENABLE=False
MOE_ENABLE=False
MOE_FINETUNE=False
MOE_MODE="sparse"
NUM_EXPERTS=4
TOP_K_EXPERTS=2
USE_RESIDUAL=False
ROUTER_AUX_LOSS_COEF=0.01
CAPACITY_FACTOR=1.5
```
- finally, run the following script:
```shell
bash shells/train/qwen/dense2sparse_distillation.sh
```


### Preference Distillation
- first, download the preference dataset from [RLAIF-V](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset).
- then, set the distillation and model configuration:
```python
# KD config
POLICY_MODEL_TYPE='sparse'
REF_MODEL_TYPE='dense'
LOSS_TYPE='kto_pair'  # kto_pair | sigmoid
DISTILL_ALL_TOKENS=False  # False: only response, True: multimodal instruction + response


# MoE config
MOE_LOSS_ENABLE=True
MOE_ENABLE=True
MOE_FINETUNE=True
MOE_MODE="sparse"
NUM_EXPERTS=4
TOP_K_EXPERTS=2
USE_RESIDUAL=False
ROUTER_AUX_LOSS_COEF=0.01
CAPACITY_FACTOR=1.5
```
- finally, run the following script:
```shell
bash shells/train/qwen/preference_distillation.sh
```
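
Here, `LOSS_TYPE='sigmoid'` corresponds to the standard DPO objective, with the sparse student as the policy and the dense model as the frozen reference (`kto_pair` is a paired KTO-style variant). A minimal sketch of the sigmoid form (standard DPO math, not the repo's exact code):

```python
# Sketch of the sigmoid DPO preference loss; illustrative only.
# Inputs are summed log-probabilities of each response under the
# policy (sparse student) and the frozen dense reference model.
import torch
import torch.nn.functional as F

def dpo_sigmoid_loss(policy_chosen_logps, policy_rejected_logps,
                     ref_chosen_logps, ref_rejected_logps, beta=0.1):
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Maximize the implicit reward margin between chosen and rejected.
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
```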

## Evaluation
We follow [LLaVA-1.5](https://github.com/haotian-liu/LLaVA) to evaluate on comprehension benchmarks (TextVQA, GQA, ScienceQA, VizWiz, MME, MMBench) and [RLAIF-V](https://github.com/RLHF-V/RLAIF-V) to evaluate on hallucination benchmarks (MMHal Bench, POPE and Object HalBench). Please refer to these resources to organize the evaluation datasets. All the evaluation scripts are located under `shells/eval`. Here is an example for MMBench.
```shell
#!/bin/bash
MODEL_NAME='your_model_name'
MODEL_PATH='your_model_path'

CONV="qwen"
SPLIT="mmbench_dev_en_20231003"
EVAL="benchmark"

deepspeed --include localhost:0 --master_port 20029 llavamod/eval/model_vqa_mmbench.py \
     --model-path ${MODEL_PATH} \
     --question-file ${EVAL}/mmbench/$SPLIT.tsv \
     --answers-file ${EVAL}/mmbench/answers/$SPLIT/${MODEL_NAME}.jsonl \
     --single-pred-prompt \
     --temperature 0 \
     --conv-mode ${CONV}

mkdir -p ${EVAL}/mmbench/answers_upload/$SPLIT

python3 scripts/convert_mmbench_for_submission.py \
    --annotation-file ${EVAL}/mmbench/$SPLIT.tsv \
    --result-dir ${EVAL}/mmbench/answers/$SPLIT \
    --upload-dir ${EVAL}/mmbench/answers_upload/$SPLIT \
    --experiment ${MODEL_NAME}
```




================================================
FILE: llavamod/__init__.py
================================================
from .model import LlavaLlamaForCausalLM
from .model import LLaVAMoDLlamaForCausalLM
from .model import LlavaQWenForCausalLM
import transformers
a, b, c = transformers.__version__.split('.')[:3]
if a == '4' and int(b) >= 34:
    from .model import LlavaMistralForCausalLM
    from .model import LLaVAMoDMistralForCausalLM
if a == '4' and int(b) >= 36:
    from .model import LlavaMiniCPMForCausalLM
    from .model import LLaVAMoDMiniCPMForCausalLM
    from .model import LlavaPhiForCausalLM
    from .model import LLaVAMoDPhiForCausalLM
    from .model import LlavaStablelmForCausalLM
    from .model import LLaVAMoDStablelmForCausalLM
if a == '4' and int(b) >= 37:
    from .model import LlavaQwen1_5ForCausalLM
    from .model import LLaVAMoDQwen1_5ForCausalLM


================================================
FILE: llavamod/config/__init__.py
================================================


================================================
FILE: llavamod/config/args.py
================================================
from typing import Optional, List
from dataclasses import field

from llavamod.data.dataset import *
from llavamod.train.train_utils import *


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
    version: Optional[str] = field(default="v0")
    freeze_backbone: bool = field(default=False)
    tune_llm_ffn_only: bool = field(default=False)
    tune_mm_mlp_adapter: bool = field(default=False)
    mm_vision_select_layer: Optional[int] = field(default=-1)  # default to the last layer
    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
    mm_use_im_start_end: bool = field(default=False)
    mm_use_im_patch_token: bool = field(default=True)
    mm_vision_select_feature: Optional[str] = field(default="patch")
    s2: bool = field(default=False)
    s2_scales: Optional[str] = field(default="336,672")

    # ===================================================================
    image_tower: Optional[str] = field(default=None)
    video_tower: Optional[str] = field(default=None)
    image_projector_type: Optional[str] = field(default='linear')
    video_projector_type: Optional[str] = field(default='linear')
    video_global_proj: bool = field(default=False)
    video_temproal_proj: bool = field(default=False)
    video_spatial_proj: bool = field(default=False)
    # ===================================================================

    # =============================================================
    only_lora_ffn: bool = True
    moe_enable: bool = False
    train_modules: Optional[List[str]] = field(default=None, metadata={"help": ""})
    moe_mode: str = field(
        default="second_half",
        metadata={
            "help": "The backend to be used for half precision.",
            "choices": ["first_half", "second_half", "sparse", "dense"],
        },
    )
    moe_layers_idx: Optional[List[int]] = field(default=None, metadata={"help": "where to place moe layers."})
    ep_size: int = 1
    num_experts: Optional[List[int]] = field(default=4, metadata={"help": "number of experts for each moe layer."})
    top_k_experts: int = field(
        default=2,
        metadata={
            "help": "Top-k experts to deal with tokens.",
            "choices": [1, 2, 3, 4],
        },
    )
    capacity_factor: float = 1.
    eval_capacity_factor: float = 2.
    min_capacity: int = 0
    use_residual: bool = False
    router_aux_loss_coef: float = 0.01
    # =============================================================


@dataclass
class DataArguments:
    lazy_preprocess: bool = False
    is_multimodal: bool = False
    image_aspect_ratio: str = 'square'
    # ===================================================================
    data_path: Optional[List[str]] = field(default=None, metadata={"help": "Path to the training data."})
    image_folder: Optional[str] = field(default=None)
    video_folder: Optional[str] = field(default=None)
    num_frames: int = 8
    # ===================================================================


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    remove_unused_columns: bool = field(default=False)
    freeze_mm_mlp_adapter: bool = field(default=False)
    mpt_attn_impl: Optional[str] = field(default="triton")
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
                "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    lora_enable: bool = False
    lora_r: int = 128
    lora_alpha: int = 256
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    mm_projector_lr: Optional[float] = None
    group_by_modality_length: bool = field(default=False)
    moe_finetune: bool = field(default=False)
    distill_all_tokens: bool = field(default=False)
    attn_implementation: str = field(default="flash_attention_2", metadata={"help": "Use transformers attention implementation."})


@dataclass
class AlignArguments:
    policy_model_type: str = field(default='sparse')
    ref_model_type: str = field(default='dense')
    loss_type: str = field(default='only_kd')
    policy_model_name_or_path: str = field(default=None)
    policy_pretrain_mm_mlp_adapter: str = field(default=None)
    ref_model_name_or_path: str = field(default=None)
    ref_pretrain_mm_mlp_adapter: str = field(default=None)
    moe_loss_enable: bool = field(default=False)


@dataclass
class DPOArguments:
    policy_model_type: str = field(default='sparse')
    ref_model_type: str = field(default='dense')
    loss_type: str = field(default='sigmoid')
    policy_model_name_or_path: str = field(default=None)
    ref_model_name_or_path: str = field(default=None)
    moe_loss_enable: bool = field(default=False)




================================================
FILE: llavamod/config/dpconfig/dpo_zero2.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "warmup_type": "linear",
            "total_num_steps": "auto"
        }
    },

    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": true,
        "sub_group_size": 1e9,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    }
}

================================================
FILE: llavamod/config/dpconfig/dpo_zero2_offload.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 100,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1e-15
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "warmup_type": "linear",
            "total_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

================================================
FILE: llavamod/config/dpconfig/zero2.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto"
    }
}

================================================
FILE: llavamod/config/dpconfig/zero2_offload.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 100,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu"
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto"
    }
}

================================================
FILE: llavamod/config/dpconfig/zero3.json
================================================
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 100,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    }
}


================================================
FILE: llavamod/config/dpconfig/zero3_offload.json
================================================
{
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "gather_16bit_weights_on_model_save": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "steps_per_print": 1e5,
  "wall_clock_breakdown": false
}


================================================
FILE: llavamod/constants.py
================================================
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

IGNORE_INDEX = -100

IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

# ======================================================================================================
DEFAULT_VIDEO_TOKEN = "<video>"
DEFAULT_VIDEO_PATCH_TOKEN = "<im_patch>"
DEFAULT_VID_START_TOKEN = "<vid_start>"
DEFAULT_VID_END_TOKEN = "<vid_end>"
VIDEO_PLACEHOLDER = "<video-placeholder>"
# ======================================================================================================

MAX_IMAGE_LENGTH = 16
MAX_VIDEO_LENGTH = 1  # current video datasets only have 1 video?

PAD_LENGTH = 620

================================================
FILE: llavamod/conversation.py
================================================
import dataclasses
from enum import auto, Enum
from typing import List, Tuple


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()
    GEMMA_2 = auto()



@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                try:
                    if i == 0:
                        assert message, "first message should not be none"
                        assert role == self.roles[0], "first message should come from user"
                    if message:
                        if type(message) is tuple:
                            message, _, _ = message
                        if i == 0: message = wrap_sys(self.system) + message
                        if i % 2 == 0:
                            message = wrap_inst(message)
                            ret += self.sep + message
                        else:
                            ret += " " + message + " " + self.sep2
                    else:
                        ret += ""
                except Exception as e:
                    print("#### error")
                    print(e)
                    print(messages)
                    print(role)
                    print(message)
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.GEMMA_2:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += "<start_of_turn>" + role + "\n" + message + "<end_of_turn>\n" + seps[i % 2]
                else:
                    ret += "<start_of_turn>" + role + "\n"
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    from PIL import Image
                    msg, image, image_process_mode = msg
                    if image_process_mode == "Pad":
                        def expand2square(pil_img, background_color=(122, 116, 104)):
                            width, height = pil_img.size
                            if width == height:
                                return pil_img
                            elif width > height:
                                result = Image.new(pil_img.mode, (width, width), background_color)
                                result.paste(pil_img, (0, (width - height) // 2))
                                return result
                            else:
                                result = Image.new(pil_img.mode, (height, height), background_color)
                                result.paste(pil_img, ((height - width) // 2, 0))
                                return result
                        image = expand2square(image)
                    elif image_process_mode in ["Default", "Crop"]:
                        pass
                    elif image_process_mode == "Resize":
                        image = image.resize((336, 336))
                    else:
                        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if longest_edge != max(image.size):
                        if H > W:
                            H, W = longest_edge, shortest_edge
                        else:
                            H, W = shortest_edge, longest_edge
                        image = image.resize((W, H))
                    if return_pil:
                        images.append(image)
                    else:
                        buffered = BytesIO()
                        image.save(buffered, format="PNG")
                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                        images.append(img_b64_str)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    msg, image, image_process_mode = msg
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    buffered = BytesIO()
                    image.save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace('<image>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }


conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        ("Assistant",
            "Renewable energy sources are those that can be replenished naturally in a relatively "
            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
            "renewable and non-renewable energy sources:\n"
            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
            "energy sources are finite and will eventually run out.\n"
            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
            "and other negative effects.\n"
            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
            "have lower operational costs than non-renewable sources.\n"
            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
            "locations than non-renewable sources.\n"
            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_mistral = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="mistral",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_openchat = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="openchat",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="<|end_of_turn|>",
)

conv_minicpm = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="minicpm",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_phi = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="phi",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="<|endoftext|>",
)


conv_stablelm = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="stablelm",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="<|endoftext|>",
)


conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_gemma_2 = Conversation(
    system="",
    roles=("user", "model"),
    version="gemma_2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.GEMMA_2,
    sep="",
    sep2="<eos>",
)

conv_llava_llama_2 = Conversation(
    system="You are a helpful language and vision assistant. "
           "You are able to understand the visual content that the user provides, "
           "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

default_conversation = conv_vicuna_v1
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "openchat": conv_openchat,
    "mistral": conv_mistral,
    "minicpm": conv_minicpm,
    "phi": conv_phi,
    "qwen": conv_phi,
    "stablelm": conv_stablelm,
    "vicuna_v1": conv_vicuna_v1,
    "llama_2": conv_llama_2,
    "gemma_2": conv_gemma_2,

    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,

    "mpt": conv_mpt,
}
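
# Illustrative usage of the registry above (a sketch; mirrors how the eval
# scripts build prompts):
#   conv = conv_templates["v1"].copy()
#   conv.append_message(conv.roles[0], "<image>\nWhat is shown in this image?")
#   conv.append_message(conv.roles[1], None)  # leave the assistant turn open
#   prompt = conv.get_prompt()                # ends with "... ASSISTANT:"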


if __name__ == "__main__":
    print(default_conversation.get_prompt())


================================================
FILE: llavamod/data/__init__.py
================================================


================================================
FILE: llavamod/data/data_utils.py
================================================
import copy
from typing import Dict, Sequence

import torch
import transformers

from llavamod.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \
    DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN, MAX_IMAGE_LENGTH, \
    MAX_VIDEO_LENGTH

from llavamod import conversation as conversation_lib
from llavamod.model import *
from llavamod.mm_utils import tokenizer_image_token
from PIL import Image

local_rank = None


def smart_tokenizer_and_embedding_resize(
        special_tokens_dict: Dict,
        tokenizer: transformers.PreTrainedTokenizer,
        model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg
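
# Example call (sketch; tokenizer/model stand for any HF pair and "<pad>" is a
# hypothetical new special token). New embedding rows are initialized to the
# mean of the existing rows rather than left at random init:
#   smart_tokenizer_and_embedding_resize(
#       special_tokens_dict=dict(pad_token="<pad>"),
#       tokenizer=tokenizer,
#       model=model,
#   )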


def _tokenize_fn(strings: Sequence[str],
                 tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ) for text in strings
    ]
    input_ids = labels = [
        tokenized.input_ids[0] for tokenized in tokenized_list
    ]
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
        for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )
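
# Note: input_ids and labels alias the same tensor list here; callers make a
# deepcopy before masking (see preprocess below). The *_lens entries count
# non-pad tokens per string.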


def _mask_targets(target, tokenized_lens, speakers):
    # cur_idx = 0
    cur_idx = tokenized_lens[0]
    tokenized_lens = tokenized_lens[1:]
    target[:cur_idx] = IGNORE_INDEX
    for tokenized_len, speaker in zip(tokenized_lens, speakers):
        if speaker == "human":
            target[cur_idx + 2:cur_idx + tokenized_len] = IGNORE_INDEX
        cur_idx += tokenized_len


def _add_speaker_and_signal(header, source, get_conversation=True):
    """Add speaker and start/end signal on each round."""
    BEGIN_SIGNAL = "### "
    END_SIGNAL = "\n"
    conversation = header
    for sentence in source:
        from_str = sentence["from"]
        if from_str.lower() == "human":
            from_str = conversation_lib.default_conversation.roles[0]
        elif from_str.lower() == "gpt":
            from_str = conversation_lib.default_conversation.roles[1]
        else:
            from_str = 'unknown'
        sentence["value"] = (BEGIN_SIGNAL + from_str + ": " +
                             sentence["value"] + END_SIGNAL)
        if get_conversation:
            conversation += sentence["value"]
    conversation += BEGIN_SIGNAL
    return conversation
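
# With the default v0-style template the result is shaped like:
#   "{system}\n\n### Human: <image>\nDescribe the photo.\n### Assistant: A dog ...\n### "
# (the trailing "### " begin-signal is appended after the last round)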


def preprocess_multimodal(
        sources: Sequence[str],
        data_args
) -> Dict:
    is_multimodal = data_args.is_multimodal
    if not is_multimodal:
        return sources

    for source in sources:
        # try:
        for sentence in source:

            # ======================================================================================================
            if sentence['value'].startswith(DEFAULT_IMAGE_TOKEN) or sentence['value'].startswith(
                    DEFAULT_VIDEO_TOKEN):  # run with multi-im, multi-vid, multi-im & multi-vid
                # <video><video><image><image>\nxxxxxxxxxxxxx  # must <video> first
                # <image>\nxxxxxxxxxxxxx -> <image>\nxxxxxxxxxxxxx
                # <video>\nxxxxxxxxxxxxx -> <video>\nxxxxxxxxxxxxx

                if "mmtag" in conversation_lib.default_conversation.version:
                    sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN,
                                                                  '<Image>' + DEFAULT_IMAGE_TOKEN + '</Image>')

                IMAGE_TOKEN_NUM = sentence['value'].count(DEFAULT_IMAGE_TOKEN)
                if IMAGE_TOKEN_NUM > MAX_IMAGE_LENGTH:
                    sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN * IMAGE_TOKEN_NUM,
                                                                  DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH).strip()
                VIDEO_TOKEN_NUM = sentence['value'].count(DEFAULT_VIDEO_TOKEN)
                if VIDEO_TOKEN_NUM > MAX_VIDEO_LENGTH:
                    # unlike surplus image tokens (truncated above), more than
                    # MAX_VIDEO_LENGTH video tokens is unsupported: fail loudly
                    raise ValueError(
                        f"too many video tokens ({VIDEO_TOKEN_NUM} > {MAX_VIDEO_LENGTH}): {sentence['value']}")

            # a <video> is treated as `num_frames * <image>`
            replace_token, vid_replace_token = DEFAULT_IMAGE_TOKEN, DEFAULT_IMAGE_TOKEN * data_args.num_frames
            if data_args.mm_use_im_start_end:
                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
                vid_replace_token = DEFAULT_VID_START_TOKEN + vid_replace_token + DEFAULT_VID_END_TOKEN

            # <video><video><image><image>\nxxxxxxxxxxxxx -> `num_frames*<image>``num_frames*<image>`<image><image>\nxxxxxxxxxxxxx
            # <video>\nxxxxxxxxxxxxx -> `num_frames*<image>`\nxxxxxxxxxxxxx
            # print('before replace_token:', [sentence['value']])
            sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
            sentence['value'] = sentence['value'].replace(DEFAULT_VIDEO_TOKEN, vid_replace_token)
            # print('after replace_token:', [sentence['value']])
            # ======================================================================================================
        # except Exception as e:
        #     print(e)
        #     print(source)
    return sources
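
# Rewriting sketch with num_frames=8 and mm_use_im_start_end=False:
#   "<video>\nWhat happens?"  ->  "<image>" * 8 + "\nWhat happens?"
# so downstream tokenization only ever sees image placeholder tokens.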


def preprocess_llama_2(
        sources,
        tokenizer: transformers.PreTrainedTokenizer,
        has_image: bool = False
) -> Dict:
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    # Apply prompt templates
    conversations = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    # Tokenize conversations
    if has_image:
        input_ids = torch.stack(
            [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
    else:
        input_ids = tokenizer(
            conversations,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ).input_ids

    targets = input_ids.clone()

    assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2

    # Mask targets
    sep = "[/INST] "
    for conversation, target in zip(conversations, targets):
        total_len = int(target.ne(tokenizer.pad_token_id).sum())

        rounds = conversation.split(conv.sep2)
        cur_len = 1
        target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(rounds):
            if rou == "":
                break

            parts = rou.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep

            if has_image:
                round_len = len(tokenizer_image_token(rou, tokenizer))
                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
            else:
                round_len = len(tokenizer(rou).input_ids)
                instruction_len = len(tokenizer(parts[0]).input_ids) - 2

            target[cur_len: cur_len + instruction_len] = IGNORE_INDEX

            cur_len += round_len
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            if cur_len != total_len:
                target[:] = IGNORE_INDEX
                print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )

    return dict(
        input_ids=input_ids,
        labels=targets,
    )


def preprocess_v1(
        sources,
        tokenizer: transformers.PreTrainedTokenizer,
        has_image: bool = False
) -> Dict:
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    # print('00000000000', sources)
    # Apply prompt templates
    conversations = []

    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())
    # Tokenize conversations
    # print('before tokenizer_image_token', conversations)
    if has_image:
        input_ids = torch.stack(
            [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
    else:
        input_ids = tokenizer(
            conversations,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ).input_ids

    targets = input_ids.clone()

    assert conv.sep_style == conversation_lib.SeparatorStyle.TWO

    sep = conv.sep + conv.roles[1] + ": "
    for conversation, target in zip(conversations, targets):
        total_len = int(target.ne(tokenizer.pad_token_id).sum())
        rounds = conversation.split(conv.sep2)
        cur_len = 1
        target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(rounds):
            if rou == "":
                break

            parts = rou.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep

            if has_image:
                round_len = len(tokenizer_image_token(rou, tokenizer))
                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
            else:
                round_len = len(tokenizer(rou).input_ids)
                instruction_len = len(tokenizer(parts[0]).input_ids) - 2

            target[cur_len: cur_len + instruction_len] = IGNORE_INDEX

            cur_len += round_len
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            if cur_len != total_len:
                target[:] = IGNORE_INDEX
                print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )

    return dict(
        input_ids=input_ids,
        labels=targets,
    )
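
# Masking sketch for one v1 round: rounds are split on sep2 ("</s>"), the
# instruction up to and including "ASSISTANT: " is set to IGNORE_INDEX, and
# only the assistant's answer tokens contribute to the loss.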


def preprocess_phi(
        sources,
        tokenizer: transformers.PreTrainedTokenizer,
        has_image: bool = False
) -> Dict:
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    conversations = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    if has_image:
        input_ids = torch.stack(
            [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
    else:
        input_ids = tokenizer(
            conversations,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ).input_ids

    targets = input_ids.clone()

    assert conv.sep_style == conversation_lib.SeparatorStyle.TWO

    sep = conv.sep + conv.roles[1] + ": "

    for conversation, target in zip(conversations, targets):
        total_len = int(target.ne(tokenizer.pad_token_id).sum())
        rounds = conversation.split(conv.sep2)
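        # start masking at 0 (not 1 as in preprocess_v1): phi/qwen-style
        # tokenizers prepend no <bos> token, consistent with the +1 eos and
        # -1 instruction offsets below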
        cur_len = 0
        target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(rounds):
            if rou == "":
                break

            parts = rou.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep

            if has_image:
                round_len = len(tokenizer_image_token(rou, tokenizer)) + 1  # for eos_token
                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1
            else:
                round_len = len(tokenizer(rou).input_ids) + 1  # for eos_token
                instruction_len = len(tokenizer(parts[0]).input_ids) - 1

            target[cur_len: cur_len + instruction_len] = IGNORE_INDEX  # instruction_len is before the answer

            cur_len += round_len
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:

            if cur_len != total_len:
                target[:] = IGNORE_INDEX
                print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )
    return dict(
        input_ids=input_ids,
        labels=targets,
    )


def preprocess_openchat(
        sources,
        tokenizer: transformers.PreTrainedTokenizer,
        has_image: bool = False
) -> Dict:
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    # Apply prompt templates
    conversations = []

    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    if has_image:
        input_ids = torch.stack(
            [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
    else:
        input_ids = tokenizer(
            conversations,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ).input_ids

    targets = input_ids.clone()

    assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
    # Mask targets
    sep = conv.sep + conv.roles[1] + ": "
    for conversation, target in zip(conversations, targets):
        total_len = int(target.ne(tokenizer.pad_token_id).sum())
        rounds = conversation.split(conv.sep2)
        cur_len = 1
        target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(rounds):
            if rou == "":
                break

            parts = rou.split(sep)
            # print('i rou, parts\n', i, rou, parts)
            if len(parts) != 2:
                break
            parts[0] += sep
            # print('after add sep, parts\n', parts)

            if has_image:
                round_len = len(tokenizer_image_token(rou, tokenizer))
                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
            else:
                round_len = len(tokenizer(rou).input_ids)
                instruction_len = len(tokenizer(parts[0]).input_ids) - 2

            target[cur_len: cur_len + instruction_len] = IGNORE_INDEX  # instruction_len is before the answer

            cur_len += round_len
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            if cur_len != total_len:
                target[:] = IGNORE_INDEX
                print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )
    return dict(
        input_ids=input_ids,
        labels=targets,
    )


def preprocess_mpt(
        sources,
        tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    # Apply prompt templates
    conversations = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    # Tokenize conversations
    input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations],
                            dim=0)
    targets = input_ids.clone()
    assert conv.sep_style == conversation_lib.SeparatorStyle.MPT

    # Mask targets
    sep = conv.sep + conv.roles[1]
    for conversation, target in zip(conversations, targets):
        total_len = int(target.ne(tokenizer.pad_token_id).sum())

        rounds = conversation.split(conv.sep)
        re_rounds = [conv.sep.join(rounds[:3])]  # system + user + gpt
        for conv_idx in range(3, len(rounds), 2):
            re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx + 2]))  # user + gpt
        cur_len = 0
        target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(re_rounds):
            if rou == "":
                break

            parts = rou.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep
            round_len = len(tokenizer_image_token(rou, tokenizer)) + len(tokenizer_image_token(conv.sep, tokenizer))
            instruction_len = len(tokenizer_image_token(parts[0], tokenizer))
            target[cur_len: cur_len + instruction_len] = IGNORE_INDEX

            cur_len += round_len
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            if cur_len != total_len:
                target[:] = IGNORE_INDEX
                print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )

    return dict(
        input_ids=input_ids,
        labels=targets,
    )


def preprocess_gemma_2(
        sources,
        tokenizer: transformers.PreTrainedTokenizer,
        has_image: bool = False
) -> Dict:
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    # Apply prompt templates
    conversations = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    # Tokenize conversations
    if has_image:
        input_ids = torch.stack(
            [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
    else:
        input_ids = tokenizer(
            conversations,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ).input_ids

    targets = input_ids.clone()
    assert conv.sep_style == conversation_lib.SeparatorStyle.GEMMA_2

    # Mask targets
    sep = "<start_of_turn>" + conv.sep + conv.roles[1] + "\n"
    for conversation, target in zip(conversations, targets):
        total_len = int(target.ne(tokenizer.pad_token_id).sum())

        rounds = conversation.split(conv.sep2)
        cur_len = 1
        target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(rounds):
            if rou == "":
                break

            parts = rou.split(sep)
            if len(parts) != 2:
                print(f"WARNING: parts!=: {parts}")
                break
            parts[0] += sep

            if has_image:
                round_len = len(tokenizer_image_token(rou, tokenizer))
                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1  # exclude <bos>
            else:
                round_len = len(tokenizer(rou).input_ids)
                instruction_len = len(tokenizer(parts[0]).input_ids) - 1  # exclude <bos>

            target[cur_len: cur_len + instruction_len] = IGNORE_INDEX

            cur_len += round_len
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            if cur_len != total_len:
                target[:] = IGNORE_INDEX
                print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )

    return dict(
        input_ids=input_ids,
        labels=targets,
    )


def preprocess_plain(
        sources: Sequence[str],
        tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    # add end signal and concatenate together
    # print('sources', sources)
    conversations = []
    for source in sources:
        assert len(source) == 2
        assert DEFAULT_IMAGE_TOKEN in source[0]['value']
        source[0]['value'] = DEFAULT_IMAGE_TOKEN
        conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep
        conversations.append(conversation)
    # print('conversations', conversations)
    # tokenize conversations
    input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
    # print('after tokenizer_image_token', input_ids)
    targets = copy.deepcopy(input_ids)
    for target, source in zip(targets, sources):
        tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer))
        target[:tokenized_len] = IGNORE_INDEX

    # print('target:', target)
    return dict(input_ids=input_ids, labels=targets)
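
# Plain (pretraining) sketch: each sample becomes "<image>{caption}{sep}", the
# "<image>" prefix is masked with IGNORE_INDEX, and only the caption (plus the
# trailing separator) is supervised.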


def preprocess(
        sources: Sequence[str],
        tokenizer: transformers.PreTrainedTokenizer,
        has_image: bool = False
) -> Dict:
    """
    Given a list of sources, each is a conversation list. This transform:
    1. Add signal '### ' at the beginning each sentence, with end signal '\n';
    2. Concatenate conversations together;
    3. Tokenize the concatenated conversation;
    4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
    """
    if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
        return preprocess_plain(sources, tokenizer)
    if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
        return preprocess_llama_2(sources, tokenizer, has_image=has_image)
    if conversation_lib.default_conversation.version.startswith("phi") or \
            conversation_lib.default_conversation.version.startswith("qwen"):  # for phi and qwen
        return preprocess_phi(sources, tokenizer, has_image=has_image)
    if conversation_lib.default_conversation.version.startswith("stablelm"):  # stablelm same as phi
        return preprocess_phi(sources, tokenizer, has_image=has_image)
    if conversation_lib.default_conversation.version.startswith("openchat") or \
            conversation_lib.default_conversation.version.startswith("mistral"):  # for openchat
        return preprocess_openchat(sources, tokenizer, has_image=has_image)
    if conversation_lib.default_conversation.version.startswith("minicpm"):  # minicpm same as openchat
        return preprocess_openchat(sources, tokenizer, has_image=has_image)
    if conversation_lib.default_conversation.version.startswith("v1"):
        return preprocess_v1(sources, tokenizer, has_image=has_image)
    if conversation_lib.default_conversation.version == "mpt":
        return preprocess_mpt(sources, tokenizer)
    elif conversation_lib.default_conversation.version.startswith("gemma_2"):
        return preprocess_gemma_2(sources, tokenizer, has_image=has_image)
    # add end signal and concatenate together
    conversations = []
    for source in sources:
        header = f"{conversation_lib.default_conversation.system}\n\n"
        conversation = _add_speaker_and_signal(header, source)
        conversations.append(conversation)

    # tokenize conversations
    def get_tokenize_len(prompts):
        return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts]

    if has_image:
        input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
    else:
        conversations_tokenized = _tokenize_fn(conversations, tokenizer)
        input_ids = conversations_tokenized["input_ids"]

    targets = copy.deepcopy(input_ids)
    for target, source in zip(targets, sources):
        if has_image:
            tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source])
        else:
            tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"]
        speakers = [sentence["from"] for sentence in source]
        _mask_targets(target, tokenized_lens, speakers)

    return dict(input_ids=input_ids, labels=targets)
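
# Dispatcher sketch (assumes the training script has set the active template
# from conv_templates, as the LLaVA-style entry points do):
#   conversation_lib.default_conversation = conversation_lib.conv_templates["v1"]
#   batch = preprocess(sources, tokenizer, has_image=True)
#   # batch["input_ids"] / batch["labels"] are aligned, with human turns masked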


def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result
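
# Example (sketch): pad a 640x480 RGB image to a 640x640 square using the
# processor's mean color, as the datasets below do when image_aspect_ratio
# == 'pad' ("sample.jpg" is a stand-in path):
#   bg = tuple(int(x * 255) for x in image_processor.image_mean)
#   img = expand2square(Image.open("sample.jpg").convert("RGB"), bg)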


================================================
FILE: llavamod/data/dataset.py
================================================
import os
import json
import random
from dataclasses import dataclass

from torch.utils.data import Dataset

from llavamod.utils import order_pick_k
from llavamod.data.data_utils import *

local_rank = None


def rank0_print(*args):
    if local_rank == 0:
        print(*args)


"""
#############################################################
############# Supervised Finetuning Dataset  ################
#############################################################
"""

class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self, data_path: Sequence[str],
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args):
        super(LazySupervisedDataset, self).__init__()
        # ================================================
        list_data_dict = []
        for data in data_path:
            rank0_print("#### read from", data)
            data = json.load(open(data, "r"))
            rank0_print("#### len: ", len(data))
            for i in data:
                i['id'] = len(list_data_dict)
                list_data_dict.append(i)
        rank0_print("#### total len:", len(list_data_dict))
        # ================================================

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args

    def __len__(self):
        return len(self.list_data_dict)

    @property
    def modality_lengths(self):
        length_list = []
        for sample in self.list_data_dict:
            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
            # ===========================================================================
            cur_len = cur_len if ('image' in sample or 'video' in sample) else -cur_len
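            # negative length marks text-only samples, so a length-grouped
            # sampler can keep the two modalities in separate batches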
            # ===========================================================================
            length_list.append(cur_len)
        return length_list

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:

        try:
            sources = self.list_data_dict[i]
            if isinstance(i, int):
                sources = [sources]
            assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME

            if 'image' in sources[0] and 'video' not in sources[0]:
                # rank0_print('image')
                image_file = self.list_data_dict[i]['image']
                image_folder = self.data_args.image_folder
                image_processor = self.data_args.image_processor
                image_file = image_file if isinstance(image_file, list) else [image_file]
                image_file = order_pick_k(image_file, MAX_IMAGE_LENGTH)
                # print(f"total {len(self.list_data_dict[i]['image'])} now {len(image_file)}")
                fallback_image = Image.new(mode="RGB", size=(224, 224), color=(0, 0, 0))
                image = []
                for file in image_file:
                    try:
                        img = Image.open(os.path.join(image_folder, file)).convert('RGB')
                        image.append(img)
                    except Exception as e:
                        print(f"Error opening image {file}: {e}, using fallback image.")
                        image.append(fallback_image)

                if self.data_args.image_aspect_ratio == 'pad':
                    image = [expand2square(i, tuple(int(x * 255) for x in image_processor.image_mean)) for i in image]
                    image = [image_processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]
                else:
                    image = [image_processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]

                sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)

                data_dict = preprocess(sources, self.tokenizer, has_image=True)

            elif 'image' not in sources[0] and 'video' in sources[0]:
                # rank0_print('video')
                video_file = self.list_data_dict[i]['video']
                video_folder = self.data_args.video_folder
                video_processor = self.data_args.video_processor
                video_file = video_file if isinstance(video_file, list) else [video_file]
                video_file = order_pick_k(video_file, MAX_VIDEO_LENGTH)
                video = [os.path.join(video_folder, file) for file in video_file]
                image = [video_processor(i, return_tensors='pt')['pixel_values'][0] for i in video]  # fake image
                sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
                data_dict = preprocess(sources, self.tokenizer, has_image=True)

            elif 'image' in sources[0] and 'video' in sources[0]:

                video_file = self.list_data_dict[i]['video']
                video_folder = self.data_args.video_folder
                video_processor = self.data_args.video_processor

                image_file = self.list_data_dict[i]['image']
                image_folder = self.data_args.image_folder
                image_processor = self.data_args.image_processor

                image_file = image_file if isinstance(image_file, list) else [image_file]
                image_file = order_pick_k(image_file, MAX_IMAGE_LENGTH)
                image = [Image.open(os.path.join(image_folder, file)).convert('RGB') for file in image_file]
                if self.data_args.image_aspect_ratio == 'pad':
                    image = [expand2square(i, tuple(int(x * 255) for x in image_processor.image_mean)) for i in image]
                    image = [image_processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]
                else:
                    image = [image_processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]

                video_file = video_file if isinstance(video_file, list) else [video_file]
                video_file = order_pick_k(video_file, MAX_VIDEO_LENGTH)
                video = [os.path.join(video_folder, file) for file in video_file]
                video = [video_processor(i, return_tensors='pt')['pixel_values'][0] for i in video]  # fake image

                image = video + image  # video must before image

                sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
                data_dict = preprocess(sources, self.tokenizer, has_image=True)
            else:
                sources = copy.deepcopy([e["conversations"] for e in sources])
                data_dict = preprocess(sources, self.tokenizer, has_image=False)

            # ==========================================================================================================

            if isinstance(i, int):
                data_dict = dict(input_ids=data_dict["input_ids"][0],
                                 labels=data_dict["labels"][0])

            if 'image' in self.list_data_dict[i] or 'video' in self.list_data_dict[i]:
                data_dict['image'] = image
            elif self.data_args.is_multimodal:
                # the image does not exist in the data, but the model is multimodal
                if hasattr(self.data_args.image_processor, 'crop_size'):
                    crop_size = self.data_args.image_processor.crop_size
                    data_dict['image'] = [torch.zeros(3, crop_size['height'], crop_size['width'])]
                else:
                    size = self.data_args.image_processor.size
                    data_dict['image'] = [torch.zeros(3, size['height'], size['width'])]

            return data_dict

        except Exception as e:
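            # any per-sample failure (corrupt image, malformed annotation)
            # falls back to a random valid sample instead of crashing training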
            print(f'Error with {e}')
            return self.__getitem__(random.randint(0, self.__len__() - 1))


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances]
                                  for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels,
                                                 batch_first=True,
                                                 padding_value=IGNORE_INDEX)

        input_ids = input_ids[:, :self.tokenizer.model_max_length]
        labels = labels[:, :self.tokenizer.model_max_length]

        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

        # print('after Collator', batch)
        # print(input_ids, labels, input_ids.ne(self.tokenizer.pad_token_id))
        # ======================================================================================================
        # origin image, if batch_size=6: [[image], [image], [video], [image, image], [video, video], [video, image]]
        '''
            will be converted to a sequence of list, if batch size=6:
            [
                image(3, 224, 224),      # sample 1
                image(3, 224, 224),      # sample 2
                video(8, 3, 224, 224),   # sample 3
                image(3, 224, 224),      # sample 4
                image(3, 224, 224),      # sample 4
                video(8, 3, 224, 224),   # sample 5
                video(8, 3, 224, 224),   # sample 5
                video(8, 3, 224, 224),   # sample 6
                image(3, 224, 224),      # sample 6
            ]
        '''
        if 'image' in instances[0]:
            images = [instance['image'] for instance in instances]

            # adapt to multi-video or multi-image or multi-image & video
            new_images = []
            for image in images:
                if type(image) is list:
                    for i in image:
                        new_images.append(i)
                else:
                    new_images.append(image)
            images = new_images

            # ==========Too many videos or images may lead to OOM, so we encode them one by one======================
            batch['images'] = images

        else:
            raise ValueError(f"expected every instance to carry an 'image' entry, got: {instances}")

        return batch


def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    train_dataset = LazySupervisedDataset(tokenizer=tokenizer,
                                          data_path=data_args.data_path,
                                          data_args=data_args)
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return dict(train_dataset=train_dataset,
                eval_dataset=None,
                data_collator=data_collator)
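
# Typical wiring (sketch; the actual Trainer construction lives in the
# training scripts): the returned dict unpacks straight into a HF Trainer.
#   data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
#   trainer = transformers.Trainer(model=model, args=training_args, **data_module)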


"""
##################################################
################### DPO Dataset ##################
##################################################
"""

class LazyDPODataset(Dataset):
    """Dataset for preference learning."""

    def __init__(self, data_path: Sequence[str],
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args):
        super(LazyDPODataset, self).__init__()
        # ================================================
        list_data_dict = []
        for data in data_path:
            rank0_print("#### read from", data)
            data = json.load(open(data, "r"))
            rank0_print("#### len: ", len(data))
            for i in data:
                i['id'] = len(list_data_dict)
                list_data_dict.append(i)
        rank0_print("#### total len:", len(list_data_dict))
        # ================================================

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args

    def __len__(self):
        return len(self.list_data_dict)

    @property
    def modality_lengths(self):
        length_list = []
        for sample in self.list_data_dict:
            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
            # ===========================================================================
            cur_len = cur_len if ('image' in sample or 'video' in sample) else -cur_len
            # ===========================================================================
            length_list.append(cur_len)
        return length_list

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """
            image:
            {
                'id': 'GCC_train_001870201',
                'image': 'GCC_train_001870201.jpg',
                'chosen': [
                    {
                        'from': 'human',
                        'value': '<image>\nProvide a brief description of the given image.'},
                    {
                        'from': 'gpt',
                        'value': 'a cartoon illustration of a winged buffalo with an angry expression .'}
                ],
                'rejected': [  # illustrative; in real data this differs from 'chosen'
                    {
                        'from': 'human',
                        'value': '<image>\nProvide a brief description of the given image.'},
                    {
                        'from': 'gpt',
                        'value': 'a cartoon illustration of a winged buffalo with an angry expression .'}
                ]
            }
        """
        try:
            sources = self.list_data_dict[i]
            if isinstance(i, int):
                sources = [sources]
            assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
            # ======================================================================================================
            if 'image' in sources[0] and 'video' not in sources[0]:
                # rank0_print('image')
                image_file = self.list_data_dict[i]['image']
                image_folder = self.data_args.image_folder
                image_processor = self.data_args.image_processor
                image_file = image_file if isinstance(image_file, list) else [image_file]
                image_file = order_pick_k(image_file, MAX_IMAGE_LENGTH)
                # print(f"total {len(self.list_data_dict[i]['image'])} now {len(image_file)}")
                fallback_image = Image.new(mode="RGB", size=(224, 224), color=(0, 0, 0))
                image = []
                for file in image_file:
                    try:
                        img = Image.open(os.path.join(image_folder, file)).convert('RGB')
                        image.append(img)
                    except IOError as e:
                        print(f"Error opening image {file}: {e}, using fallback image.")
                        image.append(fallback_image)

                if self.data_args.image_aspect_ratio == 'pad':
                    image = [expand2square(i, tuple(int(x * 255) for x in image_processor.image_mean)) for i in image]
                    image = [image_processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]
                else:
                    image = [image_processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]

                chosen_sources = preprocess_multimodal(copy.deepcopy([e["chosen"] for e in sources]),
                                                       self.data_args)
                rejected_sources = preprocess_multimodal(copy.deepcopy([e["rejected"] for e in sources]),
                                                         self.data_args)
                chosen_data_dict = preprocess(chosen_sources, self.tokenizer, has_image=True)
                rejected_data_dict = preprocess(rejected_sources, self.tokenizer, has_image=True)


            elif 'image' not in sources[0] and 'video' in sources[0]:
                # rank0_print('video')
                video_file = self.list_data_dict[i]['video']
                video_folder = self.data_args.video_folder
                video_processor = self.data_args.video_processor
                video_file = video_file if isinstance(video_file, list) else [video_file]
                video_file = order_pick_k(video_file, MAX_VIDEO_LENGTH)
                video = [os.path.join(video_folder, file) for file in video_file]
                image = [video_processor(i, return_tensors='pt')['pixel_values'][0] for i in video]  # fake image
                # image = [torch.randn(3, 8, 224, 224) for i in video]  # fake image
                chosen_sources = preprocess_multimodal(copy.deepcopy([e["chosen"] for e in sources]),
                                                       self.data_args)
                rejected_sources = preprocess_multimodal(copy.deepcopy([e["rejected"] for e in sources]),
                                                         self.data_args)
                chosen_data_dict = preprocess(chosen_sources, self.tokenizer, has_image=True)
                rejected_data_dict = preprocess(rejected_sources, self.tokenizer, has_image=True)

            elif 'image' in sources[0] and 'video' in sources[0]:
                video_file = self.list_data_dict[i]['video']
                video_folder = self.data_args.video_folder
                video_processor = self.data_args.video_processor

                image_file = self.list_data_dict[i]['image']
                image_folder = self.data_args.image_folder
                image_processor = self.data_args.image_processor

                image_file = image_file if isinstance(image_file, list) else [image_file]
                image_file = order_pick_k(image_file, MAX_IMAGE_LENGTH)
                image = [Image.open(os.path.join(image_folder, file)).convert('RGB') for file in image_file]
                if self.data_args.image_aspect_ratio == 'pad':
                    image = [expand2square(i, tuple(int(x * 255) for x in image_processor.image_mean)) for i in image]
                    image = [image_processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]
                else:
                    image = [image_processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]

                video_file = video_file if isinstance(video_file, list) else [video_file]
                video_file = order_pick_k(video_file, MAX_VIDEO_LENGTH)
                video = [os.path.join(video_folder, file) for file in video_file]
                video = [video_processor(i, return_tensors='pt')['pixel_values'][0] for i in video]  # fake image

                image = video + image  # video must before image

                chosen_sources = preprocess_multimodal(copy.deepcopy([e["chosen"] for e in sources]),
                                                       self.data_args)
                rejected_sources = preprocess_multimodal(copy.deepcopy([e["rejected"] for e in sources]),
                                                         self.data_args)
                chosen_data_dict = preprocess(chosen_sources, self.tokenizer, has_image=True)
                rejected_data_dict = preprocess(rejected_sources, self.tokenizer, has_image=True)
            else:
                chosen_sources = preprocess_multimodal(copy.deepcopy([e["chosen"] for e in sources]),
                                                       self.data_args)
                rejected_sources = preprocess_multimodal(copy.deepcopy([e["rejected"] for e in sources]),
                                                         self.data_args)
                chosen_data_dict = preprocess(chosen_sources, self.tokenizer, has_image=False)
                rejected_data_dict = preprocess(rejected_sources, self.tokenizer, has_image=False)

            if isinstance(i, int):
                data_dict = dict(
                    chosen_input_ids=chosen_data_dict["input_ids"][0],
                    chosen_labels=chosen_data_dict["labels"][0],
                    rejected_input_ids=rejected_data_dict["input_ids"][0],
                    rejected_labels=rejected_data_dict["labels"][0],
                )

            # image exists in the data
            if 'image' in self.list_data_dict[i] or 'video' in self.list_data_dict[i]:
                data_dict['image'] = image
            elif self.data_args.is_multimodal:
                # the image does not exist in the data, but the model is multimodal
                if hasattr(self.data_args.image_processor, 'crop_size'):
                    crop_size = self.data_args.image_processor.crop_size
                    data_dict['image'] = [torch.zeros(3, crop_size['height'], crop_size['width'])]
                else:
                    size = self.data_args.image_processor.size
                    data_dict['image'] = [torch.zeros(3, size['height'], size['width'])]
            return data_dict
        except Exception as e:
            # a corrupt sample is silently replaced by a random other sample
            print(f'Error with sample {i}: {e}')
            return self.__getitem__(random.randint(0, self.__len__() - 1))


@dataclass
class DataCollatorForDPODataset(object):
    """Collate examples for dpo training."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        chosen_input_ids, chosen_labels = tuple([instance[key] for instance in instances]
                                                for key in ("chosen_input_ids", "chosen_labels"))
        chosen_input_ids = torch.nn.utils.rnn.pad_sequence(
            chosen_input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        chosen_labels = torch.nn.utils.rnn.pad_sequence(
            chosen_labels,
            batch_first=True,
            padding_value=IGNORE_INDEX)

        rejected_input_ids, rejected_labels = tuple([instance[key] for instance in instances]
                                                    for key in ("rejected_input_ids", "rejected_labels"))
        rejected_input_ids = torch.nn.utils.rnn.pad_sequence(
            rejected_input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        rejected_labels = torch.nn.utils.rnn.pad_sequence(
            rejected_labels,
            batch_first=True,
            padding_value=IGNORE_INDEX)

        # FIXME: no truncation is applied here; callers must ensure chosen + rejected <= model_max_length
        batch = dict(
            chosen_input_ids=chosen_input_ids,
            chosen_labels=chosen_labels,
            chosen_attention_mask=chosen_input_ids.ne(self.tokenizer.pad_token_id),
            rejected_input_ids=rejected_input_ids,
            rejected_labels=rejected_labels,
            rejected_attention_mask=rejected_input_ids.ne(self.tokenizer.pad_token_id),
        )

        '''
            Per-sample image lists are flattened into one sequence; e.g., with
            batch size = 6 (a sample may hold multiple images and/or videos):
            [
                image(3, 224, 224),      # sample 1
                image(3, 224, 224),      # sample 2
                video(8, 3, 224, 224),   # sample 3
                image(3, 224, 224),      # sample 4 (multi-image)
                image(3, 224, 224),      # sample 4 (multi-image)
                video(8, 3, 224, 224),   # sample 5 (multi-video)
                video(8, 3, 224, 224),   # sample 5 (multi-video)
                video(8, 3, 224, 224),   # sample 6 (video + image)
                image(3, 224, 224),      # sample 6 (video + image)
            ]
        '''
        if 'image' in instances[0]:
            images = [instance['image'] for instance in instances]
            # adapt to multi-video or multi-image or multi-image & video
            new_images = []
            for image in images:
                if isinstance(image, list):
                    new_images.extend(image)
                else:
                    new_images.append(image)
            images = new_images

            # ==========Too many videos or images may lead to OOM, so we encode them one by one======================
            batch['images'] = images
        else:
            raise ValueError(f'Expected every instance to carry an image/video entry, got: {instances}')

        return batch


def make_dpo_data_module(tokenizer: transformers.PreTrainedTokenizer,
                         data_args) -> Dict:
    """Make dataset and collator for dpo training."""
    train_dataset = LazyDPODataset(tokenizer=tokenizer,
                                   data_path=data_args.data_path,
                                   data_args=data_args)
    data_collator = DataCollatorForDPODataset(tokenizer=tokenizer)
    return dict(train_dataset=train_dataset,
                eval_dataset=None,
                data_collator=data_collator)
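

# ---------------------------------------------------------------------------
# Illustrative usage sketch (editor's addition, not part of the original
# pipeline): build the DPO data module and collate two samples. The tokenizer
# and data_args passed in are hypothetical placeholders supplied by the caller.
# ---------------------------------------------------------------------------
def _dpo_data_module_demo(tokenizer, data_args):
    """Collate the first two DPO samples and report the padded batch shapes."""
    module = make_dpo_data_module(tokenizer=tokenizer, data_args=data_args)
    dataset, collator = module['train_dataset'], module['data_collator']
    batch = collator([dataset[0], dataset[1]])
    # chosen/rejected input_ids and labels are padded to a per-batch length;
    # 'images' stays a flat python list of per-sample image/video tensors.
    print(batch['chosen_input_ids'].shape, batch['rejected_input_ids'].shape)
    print(len(batch['images']))
    return batch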


================================================
FILE: llavamod/eval/__init__.py
================================================


================================================
FILE: llavamod/eval/eval_gpt_mmhal.py
================================================
import argparse
import json
import time
from gpt4_grpc import Chat

template = '''Please act as an impartial and objective judge and evaluate the quality of the response provided by a Large Multimodal Model (LMM) to the user question. Your evaluation should be mainly based on whether the response is informative, and whether the response contains any hallucination. Hallucination, in this context, refers to a situation where the LMM generates a response that includes information not present or implied in the image or previous conversation. A hallucination could be a false claim about an object, action, emotion, or any other detail that is not grounded in the image.

For clarity, consider these examples:

### Example 1:
- Image Contents: Lynx, Cat
- Question: How many cats can you see in this image?
- Standard Human-Generated Answer: There are two cats visible in the image.
- LMM Response to Evaluate: There is one cat in the image.
- Analysis: The LMM's response is false because it claims there is only one cat in the image, when in fact there are two.
- Hallucination: Yes.

### Example 2:
- Image Contents: Tree, Plant, Panda, Animal
- Question: What can you see from this photo?
- Standard Human-Generated Answer: In this photo we can see branches. We can see a panda on a branch. The background is blurry.
- LMM Response to Evaluate: In the photo, I can see a large black and white panda sitting on a tree branch. The image captures the panda's unique appearance and its natural habitat, showcasing the beauty and rarity of these animals.
- Analysis: The LMM's response is not false because it does not make any false claims about the image contents. For example, it does not claim that the panda is eating bamboo.
- Hallucination: No.

### Example 3:
- Image Contents: Fashion accessory, Mammal, Dog, Umbrella
- Question: Describe this image in details.
- Standard Human-Generated Answer: In this picture I can see a dog laying under a yellow umbrella on a beach, and there is a rope on the right side. In the background I can see few persons talking or lying. At the top of the image, I can see the blue and clear sky.
- LMM Response to Evaluate: The image features a brown dog laying on the sand at the beach, underneath a yellow umbrella. The dog is positioned near the water, and there are people in the background. The beach is sandy, and the sun is shining, creating a pleasant atmosphere. The dog is enjoying the shade provided by the umbrella, which is a common way to protect pets from the sun's harmful rays. The presence of people in the background suggests that the beach is a popular spot for leisure activities, such as sunbathing, swimming, and relaxing.
- Analysis: The LMM's response is false because it claims that the dog is positioned near the water, but there is no water in the image.
- Hallucination: Yes.

### Example 4:
- Image Contents: Clothing, Luggage and bags, Man, Building, Woman, Person
- Question: Where is this photo taken?
- Standard Human-Generated Answer: From the slogan "Welcome to Houston, Texas", we can infer that this photo is taken at an airport in Houston, Texas.
- LMM Response to Evaluate: This photo is taken at an airport, specifically at the entrance of a terminal. However, the specific location of this airport cannot be determined from the image.
- Analysis: Although the LMM's response does not explicitly mention the location "Houston, Texas" and does not perfectly answer the question, it makes no false claims about the image contents. Therefore, this response is not considered hallucination. It is OK if the LMM fails to directly answer the question, expresses uncertainty, or cannot include all the objects or details in the human-generated answer.
- Hallucination: No.

### Example 5:
- Image Contents: Tree, Tower
- Question: What environmental concerns are involved in this photo?
- Standard Human-Generated Answer: This photo shows a smokestack emitting smoke. It may cause severe air pollution which threatens human health.
- LMM Response to Evaluate: In the photo, there are several environmental concerns related to the smokestack emitting smoke. The smoke from the smokestack is a byproduct of industrial processes, which can contribute to air pollution, climate change, and negative impacts on human health. The smoke contains harmful pollutants, such as particulate matter, sulfur dioxide, nitrogen oxides, and carbon monoxide, which can lead to respiratory problems, heart disease, and other health issues. Additionally, the smoke contributes to the greenhouse effect, which can lead to global warming and climate change. The smokestack's emissions also affect the environment, as they can harm wildlife, vegetation, and ecosystems. Therefore, it is essential to address these environmental concerns by implementing measures to reduce emissions and promote sustainable practices in industrial processes.
- Analysis: Although the LMM's response is significantly longer than the standard human-generated answer, it does not contain any false claims about the image contents. Instead, it provides additional general information about the environmental concerns, which can be inferred from the smoke emission. Such detailed analysis or reasoning should be considered as a positive aspect, as long as it contains no false claims.
- Hallucination: No.

With these examples in mind, please help me evaluate whether the response by the LMM is informative, and whether hallucination exists in it, based on the comparison between the LMM's response and the factual information provided in the image contents, question, and the standard human-generated answer below.

Please note that the standard human-generated answer may only contain factual information but may not give a detailed analysis. Also, the standard human-generated answer may not be completely comprehensive in describing all the objects and their attributes, so please be a bit more cautious during evaluation. LMM's detailed analysis or reasoning should be encouraged.

To evaluate the LMM responses, first, begin your evaluation by providing a short explanation. Second, after providing your explanation, you must rate the response by choosing from the following options:
- Rating: 6, very informative with good analysis or reasoning, no hallucination
- Rating: 5, very informative, no hallucination
- Rating: 4, somewhat informative, no hallucination
- Rating: 3, not informative, no hallucination
- Rating: 2, very informative, with hallucination
- Rating: 1, somewhat informative, with hallucination
- Rating: 0, not informative, with hallucination

### Image Contents
{}

### Question
{}

### Standard Human-Generated Answer
{}

### LMM Response to Evaluate
{}
'''
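

# The main loop below extracts ratings with a plain substring match
# ("rating: {s}"). As an editor's illustrative sketch (not wired into this
# script), a slightly more tolerant variant anchors on the first
# "Rating: N" mention with a regex:
import re


def extract_rating(review_text):
    """Return the first 0-6 rating mentioned in a review, or None if absent."""
    m = re.search(r'rating:\s*([0-6])', review_text.lower())
    return int(m.group(1)) if m is not None else None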

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--response', type=str, default='responses/idefics_80b.json', help='response file containing images, questions, and model responses')
    parser.add_argument('--evaluation', type=str, default=None, help='GPT-4 evaluation results to be saved')
    parser.add_argument('--api-key', type=str, required=True)
    parser.add_argument('--gpt-model', type=str, default='gpt-4-0314')
    parser.add_argument('--temp_new', action='store_true')
    args = parser.parse_args()

    print(args)

    # load json file
    with open(args.response, 'r') as f:
        records = json.load(f)

    assert len(records) == 96

    chat = Chat(model=args.gpt_model, timeout_sec=100, openai_apikey=args.api_key)

    # ask GPT-4 to evaluate
    responses = []
    for i, record in enumerate(records):
        image_content = ', '.join(record['image_content'])

        input_text = template.format(image_content, record['question'], record['gt_answer'], record['model_answer'])

        response = None
        while response is None:
            try:
                response = chat.chat_completion(
                    messages=[
                        {"role": "user", "content": input_text}
                    ],
                    temperature=0.0,
                )
                try:
                    if response['model'] != chat.model:
                        real_model = response['model']
                        print(f'Expected {chat.model}, but got message from {real_model}', flush=True)

                        response = None
                        continue

                    print(response['model'])
                    # probe the response structure; malformed replies fall through to the except below
                    content = response["choices"][0]["message"]["content"]
                    time.sleep(5)
                except Exception:
                    print(f'Response: {response}')

            except Exception as e:
                print(e)
                print('retrying...')
                time.sleep(10)
                continue

        print(i, response['choices'][0]['message']['content'], flush=True)
        responses.append(response)
        time.sleep(1)

    # save responses
    if args.evaluation is not None:
        with open(args.evaluation, 'w') as f:
            json.dump(responses, f, indent=2)

    # analyze responses
    scores = []
    for i, response in enumerate(responses):
        response = response['choices'][0]['message']['content']
        scores_found = []
        for s in range(7):
            if f'rating: {s}' in response.lower():
                scores_found.append(s)
        if len(scores_found) == 1:
            scores.append(scores_found[0])
        else:
            print('Warning: multiple or zero scores found')
            print(i, response)
            scores.append(0)

    hallucination = []
    for s in scores:
        if s >= 3:
            hallucination.append(0)
        else:
            hallucination.append(1)

    scores_each = [[] for _ in range(8)]
    # assuming order of 96 questions is not changed
    for i in range(96):
        question_type = i % 8
        scores_each[question_type].append(scores[i])

    print('Average score: {:.2f}'.format(sum(scores) / len(scores)))
    print('Hallucination rate: {:.2f}'.format(sum(hallucination) / len(hallucination)))
    print('Average score for each question type:', ','.join([str(round(sum(scores_each[i]) / len(scores_each[i]), 2)) for i in range(8)]), flush=True)

================================================
FILE: llavamod/eval/eval_gpt_objhal.py
================================================
import os
import sys
import ssl
import json
import copy
import glob
import time
import pathlib
import random
import jsonlines

import nltk
import spacy
import argparse
import concurrent.futures

from concurrent.futures import ThreadPoolExecutor
from nltk.stem import *
from gpt4_grpc import Chat
from tqdm import tqdm

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nlp = spacy.load("en_core_web_trf")
lemma = nltk.wordnet.WordNetLemmatizer()


def parse_object_list(content):
    try:
        content = json.loads(content)
    except:
        if '["' in content:
            try:
                content = json.loads(content.strip().split('\n')[-1])
            except:
                raise ValueError('Content is not json interpretable')
        else:
            raise ValueError('Content is not json interpretable')
    return content
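

# Example (illustrative): parse_object_list('["dog", "umbrella"]') returns
# ['dog', 'umbrella']; a multi-line reply containing '["' is parsed from its
# last line instead.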


prompt_template = """You are an expert in image objects extraction according to a question answer pair. We asked an examiner to answer a question about a picture.

[Start of Question]

<image> {question}

[End of Question]

[Start of Examiner's Answer]

{answer}

[End of Examiner's Answer]


Assuming that the answer is correct, please identify all visible objects that are directly shown in the image. Please follow the instructions below:

1. You should only mention objects that are explicitly mentioned in the examiner's answer.
2. You should only extract the object names without the attributes of the objects.
3. You should not include the properties of the object, like the color, material, etc. as part of the object name in your result.
4. Make your answer precise. Present the results in a JSON list format: [\"object_1\", ..., \"object_n\"].
5. You should return an empty JSON list [] if no visible objects can be found.
"""


def preprocess_coh_results(caps):
    new_caps = []
    for cap in caps:
        cap_text = cap["caption"]
        if "The following is a response without hallucination." in cap_text:
            new_cap_text = cap_text.split("The following is a response without hallucination.")[-1].strip()
        elif "The following is a response with hallucination." in cap_text:
            new_cap_text = cap_text.split("The following is a response with hallucination.")[0].strip()
        elif "Generate a response without errors." in cap_text:
            new_cap_text = cap_text.split("Generate a response without errors.")[-1].strip()
        elif "Generate a response with errors." in cap_text:
            new_cap_text = cap_text.split("Generate a response with errors.")[0].strip()
        else:
            new_cap_text = cap_text
        cap['caption'] = new_cap_text
        new_caps.append(cap)

    return new_caps


def combine_coco_captions(annotation_path):
    if not os.path.exists('%s/captions_%s2014.json' % (annotation_path, 'val')):
        raise Exception("Please download MSCOCO caption annotations for val set")
    if not os.path.exists('%s/captions_%s2014.json' % (annotation_path, 'train')):
        raise Exception("Please download MSCOCO caption annotations for train set")

    val_caps = json.load(open('%s/captions_%s2014.json' % (annotation_path, 'val')))
    train_caps = json.load(open('%s/captions_%s2014.json' % (annotation_path, 'train')))
    all_caps = {'info': train_caps['info'],
                'licenses': train_caps['licenses'],
                'images': val_caps['images'] + train_caps['images'],
                'annotations': val_caps['annotations'] + train_caps['annotations']}

    return all_caps


def combine_coco_instances(annotation_path):
    if not os.path.exists('%s/instances_%s2014.json' % (annotation_path, 'val')):
        raise Exception("Please download MSCOCO instance annotations for val set")
    if not os.path.exists('%s/instances_%s2014.json' % (annotation_path, 'train')):
        raise Exception("Please download MSCOCO instance annotations for train set")

    val_instances = json.load(open('%s/instances_%s2014.json' % (annotation_path, 'val')))
    train_instances = json.load(open('%s/instances_%s2014.json' % (annotation_path, 'train')))
    all_instances = {'info': train_instances['info'],
                     'licenses': train_instances['licenses'],
                     'type': train_instances['type'],
                     'categories': train_instances['categories'],
                     'images': train_instances['images'] + val_instances['images'],
                     'annotations': val_instances['annotations'] + train_instances['annotations']}

    return all_instances


class CHAIR(object):

    def __init__(self, imids, coco_path, openai_apikey):

        self.imid_to_objects = {imid: [] for imid in imids}

        self.coco_path = coco_path

        self.chat_model = Chat(model="gpt-3.5-turbo-0613", timeout_sec=100, openai_apikey=openai_apikey)
        self.fail_limit = 100

        # read in synonyms
        synonyms = open('./eval/data/synonyms_refine.txt').readlines()
        synonyms = [s.strip().split(', ') for s in synonyms]
        self.mscoco_objects = []  # mscoco objects and *all* synonyms
        self.inverse_synonym_dict = {}
        for synonym in synonyms:
            new_synonym = [s.strip() for s in synonym]
            self.mscoco_objects.extend(new_synonym)
            for s in new_synonym:
                self.inverse_synonym_dict[s] = new_synonym[0]

        coco_double_words = [word for word in self.inverse_synonym_dict.keys() if len(word.strip().split(' ')) >= 2]
        coco_double_words += ['home plate', 'train track']
        print("double word count:", len(coco_double_words))

        animal_words = ['bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'animal',
                        'cub']
        vehicle_words = ['jet', 'train']

        self.double_word_dict = {}
        for double_word in coco_double_words:
            self.double_word_dict[double_word] = double_word
        for animal_word in animal_words:
            self.double_word_dict['baby %s' % animal_word] = animal_word
            self.double_word_dict['adult %s' % animal_word] = animal_word
        for vehicle_word in vehicle_words:
            self.double_word_dict['passenger %s' % vehicle_word] = vehicle_word
        self.double_word_dict['bow tie'] = 'tie'
        self.double_word_dict['toilet seat'] = 'toilet'
        self.double_word_dict['wine glas'] = 'wine glass'
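
        # e.g. the resulting double_word_dict maps "baby zebra" -> "zebra",
        # "passenger train" -> "train", and "bow tie" -> "tie".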

    def _load_generated_captions_into_evaluator(self, cap_file, sample_num, org_dir=None):

        '''
        Meant to save time so imid_to_objects does not always need to be recomputed.
        '''
        # Read in captions
        self.caps, imids, self.metrics = load_generated_captions(cap_file, org_dir=org_dir)
        self.caps = list(self.caps)
        for index, cap in enumerate(self.caps):
            cap['index'] = index
        if sample_num != -1:
            self.caps = random.sample(self.caps, sample_num)
        print("cal cap num:", len(self.caps))

        assert imids == set(self.imid_to_objects.keys())

    def get_double_words_only(self, word_list):
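        # Example (illustrative): ['baby', 'dog', 'on', 'home', 'plate'] ->
        # ['dog', 'home plate']; tokens that do not form a known double word
        # are dropped here (unlike caption_to_words, which keeps them).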
        i = 0
        double_words = []
        idxs = []
        words = word_list
        while i < len(words):
            idxs.append(i)
            double_word = ' '.join(words[i:i + 2])
            if double_word in self.double_word_dict:
                double_words.append(self.double_word_dict[double_word])
                i += 2
            else:
                #    double_words.append(words[i])
                i += 1
        words = double_words

        return words

    def caption_to_words(self, caption):

        '''
        Input: caption
        Output: MSCOCO words in the caption
        '''
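        # Example (illustrative): "A baby elephant wearing a bow tie" ->
        # the double-word pass maps "baby elephant" -> "elephant" and
        # "bow tie" -> "tie"; after filtering to MSCOCO objects,
        # words == ['elephant', 'tie'].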

        words = nltk.word_tokenize(caption.lower())
        words_2 = [lemma.lemmatize(w) for w in words]
        words = words_2

        # replace double words
        i = 0
        double_words = []
        idxs = []
        while i < len(words):
            idxs.append(i)
            double_word = ' '.join(words[i:i + 2])
            if double_word in self.double_word_dict:
                double_words.append(self.double_word_dict[double_word])
                i += 2
            else:
                double_words.append(words[i])
                i += 1
        words = double_words

        # toilet seat is not chair (sentences like "the seat of the toilet" will fire for "chair" if we do not include this line)
        if ('toilet' in words) and ('seat' in words): words = [word for word in words if word != 'seat']

        # get synonyms for all words in the caption
        idxs = [idxs[idx] for idx, word in enumerate(words) \
                if word in set(self.mscoco_objects)]
        words = [word for word in words if word in set(self.mscoco_objects)]
        node_words = []
        for word in words:
            node_words.append([word, self.inverse_synonym_dict[word]])
        # return all the MSCOCO objects in the caption
        return words, node_words, idxs, double_words

    def caption_objects_to_coco_objects(self, words):
        idxs = list(range(len(words)))
        if ('toilet' in words) and ('seat' in words): words = [word for word in words if word != 'seat']
        # get synonyms for all words in the caption
        idxs = [idxs[idx] for idx, word in enumerate(words) \
                if word in set(self.mscoco_objects)]
        words = [word for word in words if word in set(self.mscoco_objects)]
        node_words = []
        for word in words:
            node_words.append([word, self.inverse_synonym_dict[word]])

        # return all the MSCOCO objects in the caption
        return words, node_words, idxs

    def get_annotations_from_segments(self):
        '''
        Add objects taken from MSCOCO segmentation masks
        '''

        coco_segments = combine_coco_instances(self.coco_path)
        segment_annotations = coco_segments['annotations']

        id_to_name = {}  # dict with id to synsets
        for cat in coco_segments['categories']:
            id_to_name[cat['id']] = cat['name']

        for i, annotation in enumerate(segment_annotations):
            sys.stdout.write("\rGetting annotations for %d/%d segmentation masks"
                             % (i, len(segment_annotations)))
            imid = annotation['image_id']
            if imid in self.imid_to_objects:
                node_word = self.inverse_synonym_dict[id_to_name[annotation['category_id']]]
                self.imid_to_objects[imid].append(node_word)
        print("\n")
        for imid in self.imid_to_objects:
            self.imid_to_objects[imid] = set(self.imid_to_objects[imid])

    def get_annotations_from_captions(self):
        '''
        Add objects taken from MSCOCO ground truth captions
        '''

        coco_caps = combine_coco_captions(self.coco_path)
        caption_annotations = coco_caps['annotations']

        for i, annotation in enumerate(caption_annotations):
            sys.stdout.write('\rGetting annotations for %d/%d ground truth captions'
                             % (i, len(coco_caps['annotations'])))
            imid = annotation['image_id']
            if imid in self.imid_to_objects:
                _, node_words, _, _ = self.caption_to_words(annotation['caption'])
                self.imid_to_objects[imid].update([item[-1] for item in node_words])
        print("\n")

        for imid in self.imid_to_objects:
            self.imid_to_objects[imid] = set(self.imid_to_objects[imid])

    def get_annotations(self):

        '''
        Get annotations from both segmentation and captions.  Need both annotation types for CHAIR metric.
        '''

        self.get_annotations_from_segments()
        self.get_annotations_from_captions()

    def get_gpt_resp(self, data_item):
        prompt = copy.deepcopy(prompt_template)
        prompt = prompt.replace("{question}", data_item["question"])
        prompt = prompt.replace("{answer}", data_item["caption"])

        messages = [
            {"role": "system", "content": prompt}
        ]

        fail_cnt = 0
        used_tokens = {"total": 0, "input": 0, "output": 0}
        while True:
            if len(data_item["caption"].strip().split()) <= 3:
                data_item["extract_objs"] = []
                print(f"**[Short Answer]**@{data_item['caption']}@", data_item["extract_objs"])
                return data_item, used_tokens, {"total": 0, "input": 0, "output": 0}

            if fail_cnt == self.fail_limit:
                data_item["extract_objs"] = f'-1\n<no_response>'
                print("**[Wrong Return]**", data_item["extract_objs"])
                return data_item, used_tokens, {"total": 0, "input": 0, "output": 0}

            resp = None
            try:
                resp = self.chat_model.chat_completion(messages=messages)
                print(resp["model"])

                # Logging consumption
                used_tokens["total"] += resp['usage']["total_tokens"]
                used_tokens["input"] += resp['usage']["prompt_tokens"]
                used_tokens["output"] += resp['usage']["completion_tokens"]

                # Parsing ChatGPT response
                content = resp["choices"][0]["message"]["content"]
                content = parse_object_list(content)

                # rest briefly between API calls (rate limiting)
                time.sleep(5)

                data_item["extract_objs"] = content
                success_tokens = {"total": resp['usage']['total_tokens'],
                                  "input": resp['usage']['prompt_tokens'],
                                  "output": resp['usage']['completion_tokens']}
                return data_item, used_tokens, success_tokens
            except Exception as e:
                fail_cnt += 1
                # print(f'{data_item["index"]} Fail for other reasons')
                # print("message:", messages)
                print("Exception:", e, 'resp is ', resp)

                time.sleep(10 + fail_cnt)

    def gpt_caption_processor(self, max_workers=64):
        data_list = self.caps
        new_data = []
        all_used_tokens = {"total": 0, "input": 0, "output": 0}
        all_success_tokens = {"total": 0, "input": 0, "output": 0}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            print("thread num:", len(data_list))

            futures = [
                executor.submit(self.get_gpt_resp, data_item)
                for data_item in data_list
            ]

            pb = tqdm(total=len(futures))

            for i, future in enumerate(concurrent.futures.as_completed(futures)):
                pb.update(1)

                try:
                    new_data_item, used_tokens, success_tokens = future.result()  # type = List
                    all_used_tokens = {key: all_used_tokens[key] + used_tokens[key] for key in all_used_tokens.keys()}
                    all_success_tokens = {key: all_success_tokens[key] + success_tokens[key] for key in
                                          all_success_tokens.keys()}
                    new_data.append(new_data_item)

                except Exception as e:
                    print(f"@@@ Exception: {e}\n")
        print('Done with loop; waiting for resource finalization', flush=True)

        return new_data, all_used_tokens, all_success_tokens

    def postagging(self, doc):
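        # Groups runs of consecutive noun tokens (NN/NNS/NNP/NNPS) into
        # multi-word object strings. Example (illustrative): the spaCy doc of
        # "a wine glass on the table" -> ['wine glass', 'table'].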
        obj_list = []
        temp_token = ""

        for token in doc:
            if token.tag_ in ["NNP", "NNPS", "NN", "NNS"]:
                temp_token += f" {token.lemma_}"
            else:
                if temp_token != "":
                    obj_list.append(temp_token.strip())
                    temp_token = ""
        if temp_token != "":
            obj_list.append(temp_token.strip())
            temp_token = ""
        return obj_list

    def get_pred_objs_match(self, caps):
        new_caps = []
        for item in caps:
            caps_gpt_objs = item["extract_objs"]
            assert caps_gpt_objs != f'-1\n<no_response>'
            refined_objs = []
            for text in caps_gpt_objs:
                text = f"a {text}"
                doc = nlp(text)
                single_tokens = [token.lemma_ for token in doc]
                double_words_objs = self.get_double_words_only(single_tokens)

                if double_words_objs != []:
                    refined_objs += double_words_objs
                    continue

                postagging_objs = self.postagging(doc)
                refined_objs += postagging_objs

            new_item = copy.deepcopy(item)

            # only append unique word in the list
            new_item["objs"] = []
            for robj in refined_objs:
                if robj not in new_item["objs"]:
                    new_item["objs"].append(robj)

            new_caps.append(new_item)

        return new_caps

    def compute_chair(self, cap_file, sample_num, gpt_process=False, org_dir=None):

        '''
        Given ground truth objects and generated captions, determine which sentences have hallucinated words.
        '''
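
        # Metric definitions, as computed below (illustrative restatement):
        #   CHAIRs        = (# captions with >= 1 hallucinated object) / (# captions)
        #   CHAIRs_refine = same, but divided by # captions mentioning >= 1 COCO object
        #   CHAIRi        = (# hallucinated object mentions) / (# object mentions)
        #   obj_rec       = (# correctly mentioned GT object classes) / (# GT object classes)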

        self._load_generated_captions_into_evaluator(cap_file, sample_num, org_dir=org_dir)

        imid_to_objects = self.imid_to_objects
        caps = self.caps

        if gpt_process:
            caps, all_used_tokens, all_success_tokens = self.gpt_caption_processor()
            caps = self.get_pred_objs_match(caps)
        else:
            all_used_tokens = {}
            all_success_tokens = {}

        num_caps = 0.
        num_coco_caps = 0.
        num_hallucinated_caps = 0.
        hallucinated_word_count = 0.
        coco_word_count = 0.
        gt_word_count = 0.
        coco_obj_cls_count = 0.

        output = {'sentences': []}
        total_cap_word_num = 0
        for i, cap_eval in enumerate(caps):

            cap = cap_eval['caption']
            total_cap_word_num += len(cap.strip().split(" "))
            imid = cap_eval['image_id']

            # get all words in the caption, as well as corresponding node word
            if gpt_process:
                ext_objs = cap_eval["objs"]
                words, node_words, idxs = self.caption_objects_to_coco_objects(ext_objs)
                raw_words = ext_objs
            else:
                words, node_words, idxs, raw_words = self.caption_to_words(cap)

            gt_objects = imid_to_objects[imid]
            gt_word_count += len(gt_objects)
            cap_dict = {'image_id': cap_eval['image_id'],
                        'caption': cap,  # org cap
                        'mscoco_hallucinated_words': [],
                        'mscoco_gt_words': list(gt_objects),  # gt coco objs
                        'mscoco_generated_words': list(node_words),  # gen mapped coco objs
                        'hallucination_idxs': [],
                        'words': raw_words
                        # gpt process -> map double words -> postagging results, or original text words lemmas
                        }

            cap_dict['metrics'] = {'CHAIRs': 0,
                                   'CHAIRi': 0}

            # count hallucinated words, if [word, coco_obj_cls] is unique, count as one prediction
            coco_word_count += len(node_words)
            caption_coco_obj_cls = []

            hallucinated = False
            for word, node_word, idx in zip(words, node_words, idxs):
                if node_word[-1] not in gt_objects:
                    hallucinated_word_count += 1
                    cap_dict['mscoco_hallucinated_words'].append((word, node_word))
                    cap_dict['hallucination_idxs'].append(idx)
                    hallucinated = True
                else:
                    caption_coco_obj_cls.append(node_word[-1])

            caption_coco_obj_cls = set(caption_coco_obj_cls)
            # print(caption_coco_obj_cls)
            coco_obj_cls_count += len(caption_coco_obj_cls)

            # count hallucinated caps
            num_caps += 1
            if hallucinated:
                num_hallucinated_caps += 1

            cap_dict['metrics']['CHAIRs'] = int(hallucinated)
            cap_dict['metrics']['CHAIRi'] = 0.
            if len(words) > 0:
                num_coco_caps += 1
                cap_dict['metrics']['CHAIRi'] = len(cap_dict['mscoco_hallucinated_words']) / float(len(words))

            output['sentences'].append(cap_dict)

        chair_s = (num_hallucinated_caps / num_caps)
        chair_s_refine = (num_hallucinated_caps / num_coco_caps)
        chair_i = (hallucinated_word_count / coco_word_count)
        avg_word_len = float(total_cap_word_num) / num_caps
        obj_rec = coco_obj_cls_count / gt_word_count

        output['overall_metrics'] = {'CHAIRs': chair_s,
                                     'CHAIRs_refine': chair_s_refine,
                                     'CHAIRi': chair_i,
                                     'obj_rec': obj_rec,
                                     'sentence_num': num_caps,
                                     'coco_sentence_num': num_coco_caps,
                                     'coco_word_count': coco_obj_cls_count,  # predict coco object classes
                                     'gt_word_count': gt_word_count,  # ground truth coco object classes
                                     'avg_word_len': avg_word_len,
                                     'all_gpt_used_tokens': all_used_tokens,
                                     'all_gpt_success_tokens': all_success_tokens,
                                     'correct_rate': 1 - chair_s_refine,
                                     'object_correct_rate': 1 - chair_i
                                     }

        return output


def read_jsonl(jsonl_file):
    data = []
    with open(jsonl_file, 'r', encoding='utf-8') as f1:
        for item in jsonlines.Reader(f1):
            data.append(item)
    return data


def load_generated_captions(cap_file, org_dir=None):
    if cap_file.endswith(".json"):
        # Read in captions
        caps = json.load(open(cap_file))
        try:
            metrics = caps['overall']
            caps = caps['imgToEval'].values()
            imids = set([cap['image_id'] for cap in caps])
        except:
            raise Exception(
                "Expect caption file to consist of a dictionary with sentences correspdonding to the key 'imgToEval'")
    elif cap_file.endswith(".jsonl"):

        caps = read_jsonl(cap_file)

        if "image_id" not in caps[0].keys():
            try:
                assert org_dir != None and org_dir.strip() != ""
            except:
                raise Exception("Expect origin test input file directory for .jsonl cap file")
            cap_name = cap_file.split("/")[-1]
            org_name = cap_name.split("__")[0].replace("_answer", ".jsonl")

            if org_dir.endswith(".jsonl"):
                org_data_path = org_dir
            else:
                org_data_path = os.path.join(org_dir, org_name)
            org_data = read_jsonl(org_data_path)

        metrics = {}
        new_caps = []
        imids = []
        for i in range(len(caps)):
            if "image_id" not in caps[i].keys():
                imgid = int(org_data[i]["image_id"])
            else:
                imgid = int(caps[i]["image_id"])

            imids.append(imgid)

            if "prompt" in caps[i].keys():
                question = caps[i]["prompt"]
            elif "question" in caps[i].keys():
                question = caps[i]["question"]
            else:
                raise Exception("Expect 'question' or 'prompt' in generated file")

            if "text" in caps[i].keys():
                answer = caps[i]["text"].replace("Assistant:", "").strip()
            elif "answer" in caps[i].keys():
                answer = caps[i]["answer"].replace("Assistant:", "").strip()
            else:
                raise Exception("Expect 'answer' or 'text' in generated file")
            new_item = {"image_id": imgid, "question": question, "caption": answer}
            new_caps.append(new_item)
        caps = new_caps
        imids = set(imids)

    elif "." not in cap_file:
        caps = json.load(open(cap_file))
        try:
            assert 'raw_question' in caps[0].keys()
        except:
            raise Exception("Expect origin test input file directory for .jsonl cap file")
        imids = set([int(cap['question_id'].replace('.jpg', '')) for cap in caps])
        metrics = {}
        new_caps = []
        for item in caps:
            new_item = {
                "image_id": int(item["question_id"].replace('.jpg')),
                "question": item["raw_question"],
                "caption": item["answer"].replace("Assistant:", "").strip()
            }
            new_caps.append(new_item)
        caps = new_caps

    if "coh" in cap_file:
        caps = preprocess_coh_results(caps)

    return caps, imids, metrics


def save_hallucinated_words(cap_file, cap_dict, save_dir, sample_num):
    tag = cap_file.split('/')[-1].replace(".jsonl", "")
    with open(os.path.join(save_dir, f'hall_{tag}_{sample_num}.json'), 'w') as f:
        json.dump(cap_dict, f, indent=4)


def print_metrics(hallucination_cap_dict, quiet=False):
    sentence_metrics = hallucination_cap_dict['overall_metrics']
    metric_string = "%0.001f\t%0.001f\t%0.001f\t%d\t%d\t%0.01f" % (
        sentence_metrics['CHAIRs'] * 100,
        sentence_metrics['CHAIRs_refine'] * 100,
        sentence_metrics['CHAIRi'] * 100,
        sentence_metrics['sentence_num'],
        sentence_metrics['coco_sentence_num'],
        sentence_metrics['avg_word_len'])

    if not quiet:
        print("CHAIRs\tCHAIRsr\tCHAIRi\tsent_num\tcoco_num\tavg_len")
        print(metric_string)

    else:
        return metric_string


if __name__ == '__main__':
    time_start = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument("--cap_file", type=str, default='')
    parser.add_argument("--cap_folder", type=str, default='')
    parser.add_argument("--org_folder", type=str, default='')
    parser.add_argument("--cap_type", type=str, default='')
    parser.add_argument("--coco_path", type=str, default='./coco2014/annotations')
    parser.add_argument("--sample_num", type=int, default=-1)
    parser.add_argument("--use_gpt", action='store_true')
    parser.add_argument("--openai_key", type=str, default='')
    args = parser.parse_args()

    print("use gpt:", args.use_gpt)
    if args.cap_folder != '':
        patterns = ['*', '*/*', '*/*/*', '*/*/*/*']
        f_list = sum([list(glob.glob(args.cap_folder + p)) for p in patterns], [])
        cap_file_list_path = [x for x in f_list if x.endswith('.jsonl') and args.cap_type in x]
        random.shuffle(cap_file_list_path)
        args.cap_file = cap_file_list_path[0]
    else:
        cap_file_list_path = [args.cap_file]

    print("=======load prediction=======")
    print("load imgids file:", args.cap_file)
    _, imids, _ = load_generated_captions(args.cap_file, org_dir=args.org_folder)
    # assert len(imids) == 300

    print("=======init evaluator=======")
    evaluator = CHAIR(imids, args.coco_path, args.openai_key)
    evaluator.get_annotations()

    print("========compute=========")
    for path in cap_file_list_path:
        print(path)
        tag = path.split('/')[-1].replace(".jsonl", "")

        save_dir = pathlib.Path(path).absolute().parent
        target_save_path_new = save_dir / f'hall_{tag}_{args.sample_num}.json'
        if target_save_path_new.exists():
            print("\teval file already exists!")
            continue
        else:
            print(f'Cannot find {target_save_path_new}')

        # if len(list(open(path))) != 300:
        #     continue

        temp_caps, temp_imids, _ = load_generated_captions(path, org_dir=args.org_folder)

        print("***do process***", flush=True)
        cap_dict = evaluator.compute_chair(path, args.sample_num, gpt_process=args.use_gpt, org_dir=args.org_folder)
        print(f'Done computing')

        save_hallucinated_words(path, cap_dict, save_dir, sample_num=args.sample_num)
        print(f'Done Saving')

        print_metrics(cap_dict)
        time_end = time.time()

        print("eval time:", time_end - time_start)


================================================
FILE: llavamod/eval/eval_gpt_review.py
================================================
import argparse
import json
import os

import openai
import tqdm
import ray
import time

NUM_SECONDS_TO_SLEEP = 3

@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
                }, {
                    'role': 'user',
                    'content': content,
                }],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)

    print('success!')
    return response['choices'][0]['message']['content']


def parse_score(review):
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]
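

# Example (illustrative): a review whose first line is "8 7" (or "8,7")
# parses to [8.0, 7.0]; any other first-line shape falls back to [-1, -1].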


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    # parser.add_argument('-a', '--answer')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    ray.init()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

    review_file = open(f'{args.output}', 'w')

    js_list = []
    handles = []
    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        # if idx == 1:
        #     break

        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        category = json.loads(ques_js)['category']
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            rule = rule_dict['default']
        prompt = rule['prompt']
        role = rule['role']
        content = (f'[Question]\n{ques["text"]}\n\n'
                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                   f'[System]\n{prompt}\n\n')
        js_list.append({
            'id': idx+1,
            'question_id': ques['question_id'],
            'answer1_id': ans1['answer_id'],
            'answer2_id': ans2['answer_id'],
            'category': category})
        idx += 1
        handles.append(get_eval.remote(content, args.max_tokens))
        # To avoid the rate limit set by OpenAI
        time.sleep(NUM_SECONDS_TO_SLEEP)

    reviews = ray.get(handles)
    for idx, review in enumerate(reviews):
        scores = parse_score(review)
        js_list[idx]['content'] = review
        js_list[idx]['tuple'] = scores
        review_file.write(json.dumps(js_list[idx]) + '\n')
    review_file.close()


================================================
FILE: llavamod/eval/eval_gpt_review_bench.py
================================================
import argparse
import json
import os

import openai
import time

NUM_SECONDS_TO_SLEEP = 0.5


def get_eval(content: str, max_tokens: int):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4-0314',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
                }, {
                    'role': 'user',
                    'content': content,
                }],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)

    return response['choices'][0]['message']['content']


def parse_score(review):
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-c', '--context')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

    if os.path.isfile(os.path.expanduser(args.output)):
        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
    else:
        cur_reviews = []

    review_file = open(f'{args.output}', 'a')

    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
    image_to_context = {context['image']: context for context in context_list}

    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        inst = image_to_context[ques['image']]

        if isinstance(inst['caption'], list):
            cap_str = '\n'.join(inst['caption'])
        else:
            cap_str = inst['caption']

        category = 'llava_bench_' + json.loads(ques_js)['category']
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            assert False, f"Visual QA category not found in rule file: {category}."
        prompt = rule['prompt']
        role = rule['role']
        content = (f'[Context]\n{cap_str}\n\n'
                   f'[Question]\n{ques["text"]}\n\n'
                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                   f'[System]\n{prompt}\n\n')
        cur_js = {
            'id': idx+1,
            'question_id': ques['question_id'],
            'answer1_id': ans1.get('answer_id', ans1['question_id']),
            'answer2_id': ans2.get('answer_id', ans2['question_id']),
            'category': category
        }
        if idx >= len(cur_reviews):
            review = get_eval(content, args.max_tokens)
            scores = parse_score(review)
            cur_js['content'] = review
            cur_js['tuple'] = scores
            review_file.write(json.dumps(cur_js) + '\n')
            review_file.flush()
        else:
            print(f'Skipping {idx} as we already have it.')
        idx += 1
        print(idx)
    review_file.close()

================================================
FILE: llavamod/eval/eval_gpt_review_visual.py
================================================
import argparse
import json
import os

import openai
import time

NUM_SECONDS_TO_SLEEP = 0.5


def get_eval(content: str, max_tokens: int):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4-0314',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
                }, {
                    'role': 'user',
                    'content': content,
                }],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)

    return response['choices'][0]['message']['content']


def parse_score(review):
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-c', '--context')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

    if os.path.isfile(os.path.expanduser(args.output)):
        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
    else:
        cur_reviews = []

    review_file = open(f'{args.output}', 'a')

    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
    image_to_context = {context['image']: context for context in context_list}

    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        inst = image_to_context[ques['image']]
        cap_str = '\n'.join(inst['captions'])
        box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])

        category = json.loads(ques_js)['category']
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            assert False, f"Visual QA category not found in rule file: {category}."
        prompt = rule['prompt']
        role = rule['role']
        content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
                   f'[Question]\n{ques["text"]}\n\n'
                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                   f'[System]\n{prompt}\n\n')
        cur_js = {
            'id': idx+1,
            'question_id': ques['question_id'],
            'answer1_id': ans1.get('answer_id', ans1['question_id']),
            'answer2_id': ans2.get('answer_id', ans2['question_id']),
            'category': category
        }
        if idx >= len(cur_reviews):
            review = get_eval(content, args.max_tokens)
            scores = parse_score(review)
            cur_js['content'] = review
            cur_js['tuple'] = scores
            review_file.write(json.dumps(cur_js) + '\n')
            review_file.flush()
        else:
            print(f'Skipping {idx} as we already have it.')
        idx += 1
        print(idx)
    review_file.close()


================================================
FILE: llavamod/eval/eval_gqa.py
================================================
# Evaluation code for GQA.
# Computes a suite of metrics such as accuracy, consistency, plausibility and scores per question type and length.
# Visit https://gqadataset.org/ for all information about the dataset, including examples, visualizations, paper and slides.
#
#
# Metrics:
# - Accuracy: Standard accuracy, computed over the balanced version of the dataset, which is more robust against
#             cheating by making educated guesses. For each question-answer pair (q,a), we give 1 point if the
#             predicted answer p matches a and 0 otherwise, and average over all questions in the dataset.
#
# - Consistency: A metric for the model's consistency across different questions. For each question-answer
#                pair (q,a), we define a set Eq={q1, q2, ..., qn} of entailed questions, the answers to which can
#                be unambiguously inferred given (q,a).
#                Denote Q the set of all questions the model answered correctly. For each question q in Q, we
#                measure the model's accuracy over the entailed questions Eq to get the score sq and finally
#                average these results across all questions in Q.
#
# - Validity: Measures whether the model gives a "valid" answer - one that can theoretically be an answer
#             to the question (e.g. a color to a color question, yes/no to a binary question etc.).
#             We provide a set of valid answers to each question over the final answer vocabulary, in
#             the choices file, and use it to compute average validity across the dataset.
#
# - Plausibility: Measures whether the model answers are plausible, i.e. ones that make sense in the real world,
#                 e.g. not answering "purple" to a question about apple color (unless it's really purple).
#                 We provide a set of all plausible answers to each question, computed by looking at all
#                 attributes and relations that hold for various objects throughout the whole dataset scene graphs,
#                 and use it to compute average model plausibility across the data.
#
# - Grounding: Only for attention models. Measures whether the model looks at the relevant regions in the
#              image when answering a question. Each question in the dataset is annotated with the visual regions
#              they refer to, which are then used to compute the degree to which the model attends to the correct
#              visual regions, allowing us to identify whether it really answers based on the image or by
#              language-based guesses.
#              Supports both spatial features and object-based features.
#
# - Distribution: Measures the overall match between the true answer distribution for different questions,
#                 vs the overall distribution predicted by the model through its answers for all the data.
#                 We use chi-square statistic to measure the degree of similarity between the distributions,
#                 giving an indication of the model's overall world knowledge.
#
# - Accuracy per type: accuracy per question structural type (logic, compare, choose) and semantic type
#                      (questions about attributes, relations, categories, objects or the whole scene).
#
# - Accuracy for length: accuracy as a function of the question length, in terms of (1) word count and (2) semantic
#                        complexity - the number of reasoning steps.
#
# We may support additional metrics (e.g. coverage) in the future.
#
#
# Files format:
# - predictions file format: JSON array: [{"questionId": str, "prediction": str}]
# - attentions file format: JSON array:
#   Spatial attention: [{"questionId": str, "attention": [mapSize x mapSize: float] }].
#   Object-based attention:[{"questionId": str, "attention": [[x0, y0, x1, y1, float] x #regions] }]. 0 < x,y < 1.
# - questions and choices files are provided as part of the dataset.
#   see https://gqadataset.org/download.html for information about their format.
#
#
# If you have any questions or comments, please feel free to send an email,
# at dorarad@cs.stanford.edu. We hope you'll enjoy using the GQA dataset! :)
#
#

from collections import defaultdict
from tqdm import tqdm
import argparse
import os.path
import glob
import json
import math

##### Arguments
##########################################################################################

parser = argparse.ArgumentParser()
parser.add_argument('--tier', default="val", type=str, help="Tier, e.g. train, val")
parser.add_argument('--scenes', default="{tier}_sceneGraphs.json", type=str, help="Scene graphs file name format.")
parser.add_argument('--questions', default="{tier}_all_questions.json", type=str, help="Questions file name format.")
parser.add_argument('--choices', default="{tier}_choices.json", type=str, help="Choices file name format.")
parser.add_argument('--predictions', default="{tier}_predictions.json", type=str, help="Answers file name format.")
parser.add_argument('--attentions', default="{tier}_attentions.json", type=str, help="Attentions file name format.")
parser.add_argument('--consistency', action="store_true",
                    help="True to compute consistency score (Need to provide answers to questions in val_all_questions.json).")
parser.add_argument('--grounding', action="store_true",
                    help="True to compute grounding score (If model uses attention).")
parser.add_argument('--objectFeatures', action="store_true",
                    help="True for object-based attention (False for spatial).")
parser.add_argument('--mapSize', default=7, type=int,
                    help="Optional, only to get attention score. Images features map size, mapSize * mapSize")
args = parser.parse_args()
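
# Example invocation (hypothetical paths, for illustration only; the {tier}
# placeholders in the argument defaults are filled in from --tier):
#   python eval_gqa.py --tier val --predictions val_predictions.json --consistency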

print(
    "Please make sure to use our provided visual features from gqadataset.org for better comparability. We provide both spatial and object-based features trained on the GQA train set.")
print(
    "In particular, please avoid using features from https://github.com/peteanderson80/bottom-up-attention since they were trained on images contained in the GQA validation set and thus may give falsely improved scores.\n")

if not args.consistency:
    print("Please consider using --consistency to compute consistency scores for entailed questions.")
    print("If you do so, please provide answers to all questions in val_all_questions.json.\n")

if not args.grounding:
    print("Please consider using --grounding to compute attention scores.")
    print("If you do so, please provide attention maps through --attentions.\n")


##### Files Loading
##########################################################################################

def loadFile(name):
    # load standard json file
    if os.path.isfile(name):
        with open(name) as file:
            data = json.load(file)
    # load file chunks if too big
    elif os.path.isdir(name.split(".")[0]):
        data = {}
        chunks = glob.glob('{dir}/{dir}_*.{ext}'.format(dir=name.split(".")[0], ext=name.split(".")[1]))
        for chunk in chunks:
            with open(chunk) as file:
                data.update(json.load(file))
    else:
        raise Exception("Can't find {}".format(name))
    return data
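
# For reference (hypothetical layout): loadFile("val_all_questions.json") reads the
# file directly if it exists; otherwise a directory "val_all_questions/" holding
# chunks "val_all_questions_0.json", "val_all_questions_1.json", ... is merged into
# a single dict via the glob pattern above.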


# Load scene graphs
print("Loading scene graphs...")
try:
    scenes = loadFile(args.scenes.format(tier=args.tier))
except Exception:
    print('Failed to load scene graphs -- cannot evaluate grounding')
    scenes = None  # for testdev

# Load questions
print("Loading questions...")
questions = loadFile(args.questions.format(tier=args.tier))

# Load choices
print("Loading choices...")
try:
    choices = loadFile(args.choices.format(tier=args.tier))
except Exception:
    print('Failed to load choices -- cannot evaluate validity or plausibility')
    choices = None  # for testdev

# Load predictions and turn them into a dictionary
print("Loading predictions...")
predictions = loadFile(args.predictions.format(tier=args.tier))
predictions = {p["questionId"]: p["prediction"] for p in predictions}

# Make sure all questions have predictions
for qid in questions:
    if (qid not in predictions) and (args.consistency or questions[qid]["isBalanced"]):
        print("no prediction for question {}. Please add prediction for all questions.".format(qid))
        raise Exception("missing predictions")

# Load attentions and turn them into a dictionary
attentions = None
if args.grounding:
    with open(args.attentions.format(tier=args.tier)) as attentionsFile:
        attentions = json.load(attentionsFile)
        attentions = {a["questionId"]: a["attention"] for a in attentions}


##### Scores data structures initialization
##########################################################################################

# bool to float
def toScore(b):
    return float(1 if b else 0)


# Compute average of a list
def avg(l):
    if len(l) == 0:
        return 0
    return float(sum(l)) / len(l)


def wavg(l, w):
    if sum(w) == 0:
        return None
    return float(sum(l[i] * w[i] for i in range(len(l)))) / sum(w)


# Initialize data structure to track all metrics: e.g. accuracy, validity and plausibility, as well as
# accuracy per question type, length and number of reasoning steps.
scores = {
    "accuracy": [],  # list of accuracies per question (1 if correct else 0). Will be averaged ultimately.
    "binary": [],  # list of accuracies per a binary question (1 if correct else 0). Will be averaged ultimately.
    "open": [],  # list of accuracies per an open question (1 if correct else 0). Will be averaged ultimately.
    "validity": [],  # list of validity per question (1 if valid else 0).
    "plausibility": [],  # list of plausibility per question (1 if plausible else 0).
    "consistency": [],  # list of consistency scores for entailed questions.
    "accuracyPerStructuralType": defaultdict(list),
    # list of question accuracies for each structural type (e.g. compare, logic questions).
    "accuracyPerSemanticType": defaultdict(list),
    # list of question accuracies for each semantic type (e.g. questions about an object, an attribute, a relation).
    "accuracyPerLength": defaultdict(list),  # list of question accuracies per question's word number.
    "accuracyPerSteps": defaultdict(list),
    # list of question accuracies per question's reasoning length (steps number).
    "grounding": []  # list of grounding scores for each question.
}

# Initialize golden and predicted histograms per each question group. Used to compute the distribution metric.
dist = {
    "gold": defaultdict(lambda: defaultdict(int)),
    "predicted": defaultdict(lambda: defaultdict(int))
}


##### Question lengths - word counts and reasoning step counts
##########################################################################################

# Compute question length (word count)
def getWordsNum(question):
    return len(question["question"].split())


# Compute number of reasoning steps (excluding the final "querying" step which doesn't increase effective reasoning length)
def getStepsNum(question):
    return len([c for c in question["semantic"] if not (any([o in "{}: {}".format(c["operation"], c["argument"])
                                                             for o in ["exist", "query: name", "choose name"]]))])


##### Functions for question annotations
##########################################################################################

# Utility function for converting question annotations string keys to slices
def toSlice(strSlice):
    sliceLims = (int(n) for n in strSlice.split(':'))
    return slice(*sliceLims)  # Python 3: apply() was removed


# Utility function for converting question annotations string keys to indexes list:
# "1" => [0]
# "1:3" => [1, 2]
# "4:9:2" => [4, 6, 8]
def intsFromSlice(strSlice):
    slice_obj = toSlice(strSlice)
    return list(range(slice_obj.start or 0, slice_obj.stop or -1, slice_obj.step or 1))


##### Functions for validity and plausibility
##########################################################################################

def belongs(element, group, question):
    # normalization for common-attribute questions
    if "Common" in question["types"]["detailed"]:
        group = ["color", "material", "shape"]

    return element in group


##### Functions for consistency scores (for entailed questions ("inferred"))
##########################################################################################

def updateConsistency(questionId, question, questions):
    inferredQuestions = [eid for eid in question["entailed"] if eid != questionId]

    # note: `correct` is the module-level flag set for the current question in the main loop
    if correct and len(inferredQuestions) > 0:

        consistencyScores = []
        for eid in inferredQuestions:
            gold = questions[eid]["answer"]
            predicted = predictions[eid]
            score = toScore(predicted == gold)
            consistencyScores.append(score)

        scores["consistency"].append(avg(consistencyScores))


##### Functions for grounding score (optional, only for attention models)
##########################################################################################

# Utility functions for working with bounding boxes.
# c = (x0, y0, x1, y1), r = (r0, r1)

def yrange(c):
    return (c[1], c[3])


def xrange(c):
    return (c[0], c[2])


def length(r):
    if r is None:
        return 0
    return float(r[1] - r[0])


def size(c):
    return length(xrange(c)) * length(yrange(c))


def intersection(r1, r2):
    ir = (max(r1[0], r2[0]), min(r1[1], r2[1]))
    if ir[1] > ir[0]:
        return ir
    return None


def intersectionSize(c1, c2):
    return length(intersection(xrange(c1), xrange(c2))) * length(intersection(yrange(c1), yrange(c2)))


def intersectionRate(c1, c2):
    return float(intersectionSize(c1, c2)) / size(c1)
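
# Worked example (hypothetical boxes): c1 = (0.0, 0.0, 0.5, 0.5) and
# c2 = (0.25, 0.25, 0.75, 0.75) overlap on (0.25, 0.5) along both axes, so
# intersectionSize(c1, c2) = 0.25 * 0.25 = 0.0625, size(c1) = 0.25, and
# intersectionRate(c1, c2) = 0.0625 / 0.25 = 0.25.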


# Get spatial cell
def getCell(i, j):
    edge = float(1) / args.mapSize
    return (edge * i, edge * j, edge * (i + 1), edge * (j + 1))


# Get bounding box of objectId in sceneGraph
def getRegion(sceneGraph, objectId):
    obj = sceneGraph["objects"][objectId]
    x0 = float(obj["x"]) / sceneGraph["width"]
    y0 = float(obj["y"]) / sceneGraph["height"]
    x1 = float(obj["x"] + obj["w"]) / sceneGraph["width"]
    y1 = float(obj["y"] + obj["h"]) / sceneGraph["height"]
    return (x0, y0, x1, y1)


# Compute grounding score: the amount of attention (probability) given to each of the regions
# the question and answers refer to.
def computeGroundingScore(question, sceneGraph, attentionMap):
    ## prepare gold regions
    regions = []
    # add question regions
    regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["question"].values()]
    # add answer regions
    regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["fullAnswer"].values()]
    # add all the image if the question refers to the whole scene
    if any(("scene" in c) for c in question["semantic"]):
        regions.append((0, 0, 1, 1))

    # prepare attention map
    if args.objectFeatures:
        cells = [((x0, y0, x1, y1), attention) for x0, y0, x1, y1, attention in attentionMap]
    else:
        cells = [(getCell(i, j), attentionMap[i][j]) for i in range(args.mapSize) for j in range(args.mapSize)]

    # compare attention map to gold regions
    scores = []
    for region in regions:
        for cell, attention in cells:
            scores.append(attention * intersectionRate(cell, region))
    return sum(scores)
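
# The grounding score is therefore the total attention mass assigned to gold regions:
# the sum over regions r and cells c of attention(c) * area(c ∩ r) / area(c).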


##### Functions for distribution score
##########################################################################################

# Compute chi square statistic of gold distribution vs predicted distribution,
# averaged over all question groups
def chiSquare(goldDist, predictedDist):
    sumScore, sumOverall = 0, 0

    for group in goldDist:
        score, overall = 0, 0

        for ans in goldDist[group]:
            e = goldDist[group][ans]
            o = predictedDist[group].get(ans, 0)
            score += ((float(o - e) ** 2) / e)
            overall += goldDist[group][ans]

        sumScore += score * overall
        sumOverall += overall

    avgScore = float(sumScore) / sumOverall

    return avgScore
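
# Worked example (hypothetical counts): for a single group with gold answers
# {"yes": 60, "no": 40} and predicted answers {"yes": 70, "no": 30}:
#   score = (70 - 60)**2 / 60 + (30 - 40)**2 / 40 ~= 4.17
# With only this group, chiSquare returns ~4.17; the final "distribution" metric
# divides this by 100, giving ~0.042 (lower is better).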


##### Main score computation
##########################################################################################

# Loop over the questions and compute metrics
for qid, question in tqdm(questions.items()):

    # Compute scores over the balanced dataset (more robust against cheating by making educated guesses)
    if question["isBalanced"]:
        gold = question["answer"]
        predicted = predictions[qid]

        correct = (predicted == gold)
        score = toScore(correct)

        wordsNum = getWordsNum(question)
        stepsNum = getStepsNum(question)

        # Update accuracy
        scores["accuracy"].append(score)
        scores["accuracyPerLength"][wordsNum].append(score)
        scores["accuracyPerSteps"][stepsNum].append(score)
        scores["accuracyPerStructuralType"][question["types"]["structural"]].append(score)
        scores["accuracyPerSemanticType"][question["types"]["semantic"]].append(score)
        answerType = "open" if question["types"]["structural"] == "query" else "binary"
        scores[answerType].append(score)

        # Update validity score
        valid = (
            belongs(predicted, choices[qid]["valid"], question) if choices
            else False)
        scores["validity"].append(toScore(valid))

        # Update plausibility score
        plausible = (
            belongs(predicted, choices[qid]["plausible"], question) if choices
            else False)
        scores["plausibility"].append(toScore(plausible))

        # Optionally compute grounding (attention) score
        if attentions is not None:
            groundingScore = computeGroundingScore(question, scenes[question["imageId"]], attentions[qid])
            if groundingScore is not None:
                scores["grounding"].append(groundingScore)

        # Update histograms for gold and predicted answers
        globalGroup = question["groups"]["global"]
        if globalGroup is not None:
            dist["gold"][globalGroup][gold] += 1
            dist["predicted"][globalGroup][predicted] += 1

        if args.consistency:
            # Compute consistency (for entailed questions)
            updateConsistency(qid, question, questions)

# Compute distribution score
scores["distribution"] = chiSquare(dist["gold"], dist["predicted"]) / 100

# Average scores over all questions (in the balanced dataset) and print scores

metrics = [
    "binary",
    "open",
    "accuracy",
    "consistency",
    "validity",
    "plausibility",
    "grounding",
    "distribution"
]

detailedMetrics = [
    ("accuracyPerStructuralType", "Accuracy / structural type"),
    ("accuracyPerSemanticType", "Accuracy / semantic type"),
    ("accuracyPerSteps", "Accuracy / steps number"),
    ("accuracyPerLength", "Accuracy / words number")
]

subMetrics = {
    "attr": "attribute",
    "cat": "category",
    "global": "scene",
    "obj": "object",
    "rel": "relation"
}
# average
for k in metrics:
    if isinstance(scores[k], list):
        scores[k] = avg(scores[k]) * 100

for k, _ in detailedMetrics:
    for t in scores[k]:
        scores[k][t] = avg(scores[k][t]) * 100, len(scores[k][t])

# print
print("")
for m in metrics:
    # skip grounding and consistency scores if not requested
    if m == "grounding" and not args.grounding:
        continue
    if m == "consistency" and not args.consistency:
        continue

    # print score
    print("{title}: {score:.2f}{suffix}".format(title=m.capitalize(), score=scores[m],
                                                suffix=" (lower is better)" if m == "distribution" else "%"))

for m, mPrintName in detailedMetrics:
    print("")
    # print metric title
    print("{}:".format(mPrintName))

    for t in sorted(list(scores[m].keys())):
        # set sub-metric title
        tName = t
        if m == "accuracyPerSemanticType":  # map abbreviated semantic types to full names
            tName = subMetrics.get(t, t).capitalize()

        # print score
        print("  {title}: {score:.2f}{suffix} ({amount} questions)".format(title=tName,
                                                                           score=scores[m][t][0], suffix="%",
                                                                           amount=scores[m][t][1]))

================================================
FILE: llavamod/eval/eval_gqa_1.py
================================================
# Evaluation code for GQA.
# Computes a suite of metrics such as accuracy, consistency, plausibility and scores per question type and length.
# Visit https://gqadataset.org/ for all information about the dataset, including examples, visualizations, paper and slides.
#
#
# Metrics:
# - Accuracy: Standard accuracy, computed over the balanced version of the dataset, which is more robust against
#             cheating by making educated guesses. For each question-answer pair (q,a), we give 1 point if the
#             predicted answer p matches a and 0 otherwise, and average over all questions in the dataset.
#
# - Consistency: A metric for the model's consistency across different questions. For each question-answer
#                pair (q,a), we define a set Eq={q1, q2, ..., qn} of entailed questions, the answers to which can
#                be unambiguously inferred given (q,a).
#                Denote Q the set of all questions the model answered correctly. For each question q in Q, we
#                measure the model's accuracy over the entailed questions Eq to get the score sq and finally
#                average these results across all questions in Q.
#
# - Validity: Measures whether the model gives a "valid" answer - one that can theoretically be an answer
#             to the question (e.g. a color to a color question, yes/no to a binary question etc.).
#             We provide a set of valid answers to each question over the final answer vocabulary, in
#             the choices file, and use it to compute average validity across the dataset.
#
# - Plausibility: Measures whether the model answers are plausible, i.e. ones that make sense in the real world,
#                 e.g. not answering "purple" to a question about apple color (unless it's really purple).
#                 We provide a set of all plausible answers to each question, computed by looking at all
#                 attributes and relations that hold for various objects throughout the whole dataset scene graphs,
#                 and use it to compute average model plausibility across the data.
#
# - Grounding: Only for attention models. Measures whether the model looks at the relevant regions in the
#              image when answering a question. Each question in the dataset is annotated with the visual regions
#              they refer to, which are then used to compute the degree to which the model attends to the correct
#              visual regions, allowing us to identify whether it really answers based on the image or by
#              language-based guesses.
#              Supports both spatial features and object-based features.
#
# - Distribution: Measures the overall match between the true answer distribution for different questions,
#                 vs the overall distribution predicted by the model through its answers for all the data.
#                 We use chi-square statistic to measure the degree of similarity between the distributions,
#                 giving an indication of the model's overall world knowledge.
#
# - Accuracy per type: accuracy per question structural type (logic, compare, choose) and semantic type
#                      (questions about attributes, relations, categories, objects or the whole scene).
#
# - Accuracy for length: accuracy as a function of the question length, in terms of (1) word count and (2) semantic
#                        complexity - the number of reasoning steps.
#
# We may support additional metrics (e.g. coverage) in the future.
#
#
# Files format:
# - predictions file format: JSON array: [{"questionId": str, "prediction": str}]
# - attentions file format: JSON array:
#   Spatial attention: [{"questionId": str, "attention": [mapSize x mapSize: float] }].
#   Object-based attention:[{"questionId": str, "attention": [[x0, y0, x1, y1, float] x #regions] }]. 0 < x,y < 1.
# - questions and choices files are provided as part of the dataset.
#   see https://gqadataset.org/download.html for information about their format.
#
#
# If you have any questions or comments, please feel free to send an email,
# at dorarad@cs.stanford.edu. We hope you'll enjoy using the GQA dataset! :)
#
#

from collections import defaultdict
from tqdm import tqdm
import argparse
import os.path
import glob
import json
import math

##### Arguments
##########################################################################################

parser = argparse.ArgumentParser()
parser.add_argument('--tier', default="val", type=str, help="Tier, e.g. train, val")
parser.add_argument('--scenes', default="{tier}_sceneGraphs.json", type=str, help="Scene graphs file name format.")
parser.add_argument('--questions', default="{tier}_questions.json", type=str, help="Questions file name format.")
parser.add_argument('--choices', default="{tier}_choices.json", type=str, help="Choices file name format.")
parser.add_argument('--predictions', default="{tier}_predictions.json", type=str, help="Answers file name format.")
parser.add_argument('--attentions', default="{tier}_attentions.json", type=str, help="Attentions file name format.")
parser.add_argument('--consistency', action="store_true",
                    help="True to compute consistency score (Need to provide answers to questions in val_all_questions.json).")
parser.add_argument('--grounding', action="store_true",
                    help="True to compute grounding score (If model uses attention).")
parser.add_argument('--objectFeatures', action="store_true",
                    help="True for object-based attention (False for spatial).")
parser.add_argument('--mapSize', default=7, type=int,
                    help="Optional, only to get attention score. Images features map size, mapSize * mapSize")
args = parser.parse_args()

print(
    "Please make sure to use our provided visual features from gqadataset.org for better comparability. We provide both spatial and object-based features trained on the GQA train set.")
print(
    "In particular, please avoid using features from https://github.com/peteanderson80/bottom-up-attention since they were trained on images contained in the GQA validation set and thus may give falsely improved scores.\n")

if not args.consistency:
    print("Please consider using --consistency to compute consistency scores for entailed questions.")
    print("If you do so, please provide answers to all questions in val_all_questions.json.\n")

if not args.grounding:
    print("Please consider using --grounding to compute attention scores.")
    print("If you do so, please provide attention maps through --attentions.\n")


##### Files Loading
##########################################################################################

def loadFile(name):
    # load standard json file
    if os.path.isfile(name):
        with open(name) as file:
            data = json.load(file)
    # load file chunks if too big
    elif os.path.isdir(name.split(".")[0]):
        data = {}
        chunks = glob.glob('{dir}/{dir}_*.{ext}'.format(dir=name.split(".")[0], ext=name.split(".")[1]))
        for chunk in chunks:
            with open(chunk) as file:
                data.update(json.load(file))
    else:
        raise Exception("Can't find {}".format(name))
    return data


# Load scene graphs
# print("Loading scene graphs...")
# scenes = loadFile(args.scenes.format(tier = args.tier))

# Load questions
print("Loading questions...")
questions = loadFile(args.questions.format(tier=args.tier))

# # Load choices
# print("Loading choices...")
# choices = loadFile(args.choices.format(tier = args.tier))

# Load predictions and turn them into a dictionary
print("Loading predictions...")
predictions = loadFile(args.predictions.format(tier=args.tier))
predictions = {p["questionId"]: p["prediction"] for p in predictions}

miss_count = 0
# Make sure all questions have predictions
for qid in questions:
    if (qid not in predictions) and (args.consistency or questions[qid]["isBalanced"]):
        # print("no prediction for question {}. Please add prediction for all questions.".format(qid))
        # raise Exception("missing predictions")
        miss_count += 1
print("questions without predictions: {}".format(miss_count))
# Load attentions and turn them into a dictionary
attentions = None
if args.grounding:
    with open(args.attentions.format(tier=args.tier)) as attentionsFile:
        attentions = json.load(attentionsFile)
        attentions = {a["questionId"]: a["attention"] for a in attentions}


##### Scores data structures initialization
##########################################################################################

# bool to float
def toScore(b):
    return float(1 if b else 0)


# Compute average of a list
def avg(l):
    if len(l) == 0:
        return 0
    return float(sum(l)) / len(l)


def wavg(l, w):
    if sum(w) == 0:
        return None
    return float(sum(l[i] * w[i] for i in range(len(l)))) / sum(w)


# Initialize data structure to track all metrics: e.g. accuracy, validity and plausibility, as well as
# accuracy per question type, length and number of reasoning steps.
scores = {
    "accuracy": [],  # list of accuracies per question (1 if correct else 0). Will be averaged ultimately.
    "binary": [],  # list of accuracies per a binary question (1 if correct else 0). Will be averaged ultimately.
    "open": [],  # list of accuracies per an open question (1 if correct else 0). Will be averaged ultimately.
    "validity": [],  # list of validity per question (1 if valid else 0).
    "plausibility": [],  # list of plausibility per question (1 if plausible else 0).
    "consistency": [],  # list of consistency scores for entailed questions.
    "accuracyPerStructuralType": defaultdict(list),
    # list of question accuracies for each structural type (e.g. compare, logic questions).
    "accuracyPerSemanticType": defaultdict(list),
    # list of question accuracies for each semantic type (e.g. questions about an object, an attribute, a relation).
    "accuracyPerLength": defaultdict(list),  # list of question accuracies per question's word number.
    "accuracyPerSteps": defaultdict(list),
    # list of question accuracies per question's reasoning length (steps number).
    "grounding": []  # list of grounding scores for each question.
}

# Initialize golden and predicted histograms per each question group. Used to compute the distribution metric.
dist = {
    "gold": defaultdict(lambda: defaultdict(int)),
    "predicted": defaultdict(lambda: defaultdict(int))
}


##### Question lengths - word counts and reasoning step counts
##########################################################################################

# Compute question length (word count)
def getWordsNum(question):
    return len(question["question"].split())


# Compute number of reasoning steps (excluding the final "querying" step which doesn't increase effective reasoning length)
def getStepsNum(question):
    return len([c for c in question["semantic"] if not (any([o in "{}: {}".format(c["operation"], c["argument"])
                                                             for o in ["exist", "query: name", "choose name"]]))])


##### Functions for question annotations
##########################################################################################

# Utility function for converting question annotations string keys to slices
def toSlice(strSlice):
    sliceLims = (int(n) for n in strSlice.split(':'))
    return slice(*sliceLims)  # Python 3: apply() was removed


# Utility function for converting question annotations string keys to indexes list:
# "1" => [0]
# "1:3" => [1, 2]
# "4:9:2" => [4, 6, 8]
def intsFromSlice(strSlice):
    slice_obj = toSlice(strSlice)
    return list(range(slice_obj.start or 0, slice_obj.stop or -1, slice_obj.step or 1))


##### Functions for validity and plausibility
##########################################################################################

def belongs(element, group, question):
    # normalization for common-attribute questions
    if "Common" in question["types"]["detailed"]:
        group = ["color", "material", "shape"]

    return element in group


##### Functions for consistency scores (for entailed questions ("inferred"))
##########################################################################################

def updateConsistency(questionId, question, questions):
    inferredQuestions = [eid for eid in question["entailed"] if eid != questionId]

    # note: `correct` is the module-level flag set for the current question in the main loop
    if correct and len(inferredQuestions) > 0:

        consistencyScores = []
        for eid in inferredQuestions:
            gold = questions[eid]["answer"]
            predicted = predictions[eid]
            score = toScore(predicted == gold)
            consistencyScores.append(score)

        scores["consistency"].append(avg(consistencyScores))


##### Functions for grounding score (optional, only for attention models)
##########################################################################################

# Utility functions for working with bounding boxes.
# c = (x0, y0, x1, y1), r = (r0, r1)

def yrange(c):
    return (c[1], c[3])


def xrange(c):
    return (c[0], c[2])


def length(r):
    if r is None:
        return 0
    return float(r[1] - r[0])


def size(c):
    return length(xrange(c)) * length(yrange(c))


def intersection(r1, r2):
    ir = (max(r1[0], r2[0]), min(r1[1], r2[1]))
    if ir[1] > ir[0]:
        return ir
    return None


def intersectionSize(c1, c2):
    return length(intersection(xrange(c1), xrange(c2))) * length(intersection(yrange(c1), yrange(c2)))


def intersectionRate(c1, c2):
    return float(intersectionSize(c1, c2)) / size(c1)


# Get spatial cell
def getCell(i, j):
    edge = float(1) / args.mapSize
    return (edge * i, edge * j, edge * (i + 1), edge * (j + 1))


# Get bounding box of objectId in sceneGraph
def getRegion(sceneGraph, objectId):
    obj = sceneGraph["objects"][objectId]
    x0 = float(obj["x"]) / sceneGraph["width"]
    y0 = float(obj["y"]) / sceneGraph["height"]
    x1 = float(obj["x"] + obj["w"]) / sceneGraph["width"]
    y1 = float(obj["y"] + obj["h"]) / sceneGraph["height"]
    return (x0, y0, x1, y1)


# Compute grounding score: the amount of attention (probability) given to each of the regions
# the question and answers refer to.
def computeGroundingScore(question, sceneGraph, attentionMap):
    ## prepare gold regions
    regions = []
    # add question regions
    regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["question"].values()]
    # add answer regions
    regions += [getRegion(sceneGraph, pointer) for pointer in question["annotations"]["fullAnswer"].values()]
    # add all the image if the question refers to the whole scene
    if any(("scene" in c) for c in question["semantic"]):
        regions.append((0, 0, 1, 1))

    # prepare attention map
    if args.objectFeatures:
        cells = [((x0, y0, x1, y1), attention) for x0, y0, x1, y1, attention in attentionMap]
    else:
        cells = [(getCell(i, j), attentionMap[i][j]) for i in range(args.mapSize) for j in range(args.mapSize)]

    # compare attention map to gold regions
    scores = []
    for region in regions:
        for cell, attention in cells:
            scores.append(attention * intersectionRate(cell, region))
    return sum(scores)


##### Functions for distribution score
##########################################################################################

# Compute chi square statistic of gold distribution vs predicted distribution,
# averaged over all question groups
def chiSquare(goldDist, predictedDist):
    sumScore, sumOverall = 0, 0

    for group in goldDist:
        score, overall = 0, 0

        for ans in goldDist[group]:
            e = goldDist[group][ans]
            o = predictedDist[group].get(ans, 0)
            score += ((float(o - e) ** 2) / e)
            overall += goldDist[group][ans]

        sumScore += score * overall
        sumOverall += overall

    avgScore = float(sumScore) / sumOverall

    return avgScore


##### Main score computation
##########################################################################################

# Loop over the questions and compute metrics
for qid, question in tqdm(questions.items()):
SYMBOL INDEX (1352 symbols across 120 files)

FILE: llavamod/config/args.py
  class ModelArguments (line 9) | class ModelArguments:
  class DataArguments (line 63) | class DataArguments:
  class TrainingArguments (line 76) | class TrainingArguments(transformers.TrainingArguments):
  class AlignArguments (line 115) | class AlignArguments:
  class DPOArguments (line 127) | class DPOArguments:

FILE: llavamod/conversation.py
  class SeparatorStyle (line 6) | class SeparatorStyle(Enum):
  class Conversation (line 18) | class Conversation:
    method get_prompt (line 31) | def get_prompt(self):
    method append_message (line 125) | def append_message(self, role, message):
    method get_images (line 128) | def get_images(self, return_pil=False):
    method to_gradio_chatbot (line 178) | def to_gradio_chatbot(self):
    method copy (line 209) | def copy(self):
    method dict (line 220) | def dict(self):

FILE: llavamod/data/data_utils.py
  function smart_tokenizer_and_embedding_resize (line 18) | def smart_tokenizer_and_embedding_resize(
  function _tokenize_fn (line 43) | def _tokenize_fn(strings: Sequence[str],
  function _mask_targets (line 70) | def _mask_targets(target, tokenized_lens, speakers):
  function _add_speaker_and_signal (line 81) | def _add_speaker_and_signal(header, source, get_conversation=True):
  function preprocess_multimodal (line 102) | def preprocess_multimodal(
  function preprocess_llama_2 (line 154) | def preprocess_llama_2(
  function preprocess_v1 (line 236) | def preprocess_v1(
  function preprocess_phi (line 318) | def preprocess_phi(
  function preprocess_openchat (line 397) | def preprocess_openchat(
  function preprocess_mpt (line 478) | def preprocess_mpt(
  function preprocess_gemma_2 (line 545) | def preprocess_gemma_2(
  function preprocess_plain (line 627) | def preprocess_plain(
  function preprocess (line 653) | def preprocess(
  function expand2square (line 714) | def expand2square(pil_img, background_color):

FILE: llavamod/data/dataset.py
  function rank0_print (line 14) | def rank0_print(*args):
  class LazySupervisedDataset (line 25) | class LazySupervisedDataset(Dataset):
    method __init__ (line 28) | def __init__(self, data_path: str,
    method __len__ (line 49) | def __len__(self):
    method modality_lengths (line 53) | def modality_lengths(self):
    method __getitem__ (line 63) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
  class DataCollatorForSupervisedDataset (line 168) | class DataCollatorForSupervisedDataset(object):
    method __call__ (line 173) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
  function make_supervised_data_module (line 235) | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokeni...
  class LazyDPODataset (line 253) | class LazyDPODataset(Dataset):
    method __init__ (line 256) | def __init__(self, data_path: str,
    method __len__ (line 277) | def __len__(self):
    method modality_lengths (line 281) | def modality_lengths(self):
    method __getitem__ (line 291) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
  class DataCollatorForDPODataset (line 435) | class DataCollatorForDPODataset(object):
    method __call__ (line 440) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
  function make_dpo_data_module (line 508) | def make_dpo_data_module(tokenizer: transformers.PreTrainedTokenizer,

FILE: llavamod/eval/eval_gpt_objhal.py
  function parse_object_list (line 33) | def parse_object_list(content):
  function preprocess_coh_results (line 72) | def preprocess_coh_results(caps):
  function combine_coco_captions (line 92) | def combine_coco_captions(annotation_path):
  function combine_coco_instances (line 108) | def combine_coco_instances(annotation_path):
  class CHAIR (line 126) | class CHAIR(object):
    method __init__ (line 128) | def __init__(self, imids, coco_path, openai_apikey):
    method _load_generated_captions_into_evaluator (line 168) | def _load_generated_captions_into_evaluator(self, cap_file, sample_num...
    method get_double_words_only (line 184) | def get_double_words_only(self, word_list):
    method caption_to_words (line 202) | def caption_to_words(self, caption):
    method caption_objects_to_coco_objects (line 241) | def caption_objects_to_coco_objects(self, words):
    method get_annotations_from_segments (line 255) | def get_annotations_from_segments(self):
    method get_annotations_from_captions (line 278) | def get_annotations_from_captions(self):
    method get_annotations (line 298) | def get_annotations(self):
    method get_gpt_resp (line 307) | def get_gpt_resp(self, data_item):
    method gpt_caption_processor (line 359) | def gpt_caption_processor(self, max_workers=64):
    method postagging (line 390) | def postagging(self, doc):
    method get_pred_objs_match (line 406) | def get_pred_objs_match(self, caps):
    method compute_chair (line 437) | def compute_chair(self, cap_file, sample_num, gpt_process=False, org_d...
  function read_jsonl (line 549) | def read_jsonl(jsonl_file):
  function load_generated_captions (line 557) | def load_generated_captions(cap_file, org_dir=None):
  function save_hallucinated_words (line 639) | def save_hallucinated_words(cap_file, cap_dict, save_dir, sample_num):
  function print_metrics (line 645) | def print_metrics(hallucination_cap_dict, quiet=False):

FILE: llavamod/eval/eval_gpt_review.py
  function get_eval (line 13) | def get_eval(content: str, max_tokens: int):
  function parse_score (line 39) | def parse_score(review):

FILE: llavamod/eval/eval_gpt_review_bench.py
  function get_eval (line 11) | def get_eval(content: str, max_tokens: int):
  function parse_score (line 36) | def parse_score(review):

FILE: llavamod/eval/eval_gpt_review_visual.py
  function get_eval (line 11) | def get_eval(content: str, max_tokens: int):
  function parse_score (line 36) | def parse_score(review):

FILE: llavamod/eval/eval_gqa.py
  function loadFile (line 108) | def loadFile(name):
  function toScore (line 168) | def toScore(b):
  function avg (line 173) | def avg(l):
  function wavg (line 179) | def wavg(l, w):
  function getWordsNum (line 215) | def getWordsNum(question):
  function getStepsNum (line 220) | def getStepsNum(question):
  function toSlice (line 229) | def toSlice(strSlice):
  function intsFromSlice (line 238) | def intsFromSlice(strSlice):
  function belongs (line 246) | def belongs(element, group, question):
  function updateConsistency (line 257) | def updateConsistency(questionId, question, questions):
  function yrange (line 278) | def yrange(c):
  function xrange (line 282) | def xrange(c):
  function length (line 286) | def length(r):
  function size (line 292) | def size(c):
  function intersection (line 296) | def intersection(r1, r2):
  function intersectionSize (line 303) | def intersectionSize(c1, c2):
  function intersectionRate (line 307) | def intersectionRate(c1, c2):
  function getCell (line 312) | def getCell(i, j):
  function getRegion (line 318) | def getRegion(sceneGraph, objectId):
  function computeGroundingScore (line 329) | def computeGroundingScore(question, sceneGraph, attentionMap):
  function chiSquare (line 359) | def chiSquare(goldDist, predictedDist):

FILE: llavamod/eval/eval_gqa_1.py
  function loadFile (line 108) | def loadFile(name):
  function toScore (line 162) | def toScore(b):
  function avg (line 167) | def avg(l):
  function wavg (line 173) | def wavg(l, w):
  function getWordsNum (line 209) | def getWordsNum(question):
  function getStepsNum (line 214) | def getStepsNum(question):
  function toSlice (line 223) | def toSlice(strSlice):
  function intsFromSlice (line 232) | def intsFromSlice(strSlice):
  function belongs (line 240) | def belongs(element, group, question):
  function updateConsistency (line 251) | def updateConsistency(questionId, question, questions):
  function yrange (line 272) | def yrange(c):
  function xrange (line 276) | def xrange(c):
  function length (line 280) | def length(r):
  function size (line 286) | def size(c):
  function intersection (line 290) | def intersection(r1, r2):
  function intersectionSize (line 297) | def intersectionSize(c1, c2):
  function intersectionRate (line 301) | def intersectionRate(c1, c2):
  function getCell (line 306) | def getCell(i, j):
  function getRegion (line 312) | def getRegion(sceneGraph, objectId):
  function computeGroundingScore (line 323) | def computeGroundingScore(question, sceneGraph, attentionMap):
  function chiSquare (line 353) | def chiSquare(goldDist, predictedDist):

FILE: llavamod/eval/eval_pope.py
  function eval_pope (line 6) | def eval_pope(answers, label_file):

FILE: llavamod/eval/eval_science_qa.py
  function get_args (line 8) | def get_args():
  function convert_caps (line 19) | def convert_caps(results):
  function get_pred_idx (line 28) | def get_pred_idx(prediction, choices, options):

FILE: llavamod/eval/eval_science_qa_gpt4.py
  function get_args (line 9) | def get_args():
  function convert_caps (line 19) | def convert_caps(results):
  function get_pred_idx (line 28) | def get_pred_idx(prediction, choices, options):

FILE: llavamod/eval/eval_science_qa_gpt4_requery.py
  function get_args (line 9) | def get_args():
  function convert_caps (line 21) | def convert_caps(results):
  function get_pred_idx (line 30) | def get_pred_idx(prediction, choices, options):

FILE: llavamod/eval/eval_textvqa.py
  function get_args (line 9) | def get_args():
  function prompt_processor (line 17) | def prompt_processor(prompt):
  function eval_single (line 35) | def eval_single(annotation_file, result_file):

FILE: llavamod/eval/generate_webpage_data_from_table.py
  function read_jsonl (line 10) | def read_jsonl(path: str, key: str=None):
  function trim_hanging_lines (line 23) | def trim_hanging_lines(s: str, n: int) -> str:

FILE: llavamod/eval/gpt4_grpc.py
  class Chat (line 6) | class Chat:
    method __init__ (line 7) | def __init__(self, model="", timeout_sec=20, openai_apikey=''):
    method chat_completion (line 12) | def chat_completion(self, messages, temperature=0.2, top_p=1, max_toke...
  function get_eval (line 29) | def get_eval(model, content: str,

FILE: llavamod/eval/m4c_evaluator.py
  class EvalAIAnswerProcessor (line 7) | class EvalAIAnswerProcessor:
    method __init__ (line 178) | def __init__(self, *args, **kwargs):
    method word_tokenize (line 181) | def word_tokenize(self, word):
    method process_punctuation (line 186) | def process_punctuation(self, in_text):
    method process_digit_article (line 198) | def process_digit_article(self, in_text):
    method __call__ (line 213) | def __call__(self, item):
  class TextVQAAccuracyEvaluator (line 221) | class TextVQAAccuracyEvaluator:
    method __init__ (line 222) | def __init__(self):
    method _compute_answer_scores (line 225) | def _compute_answer_scores(self, raw_answers):
    method eval_pred_list (line 248) | def eval_pred_list(self, pred_list):
  class STVQAAccuracyEvaluator (line 260) | class STVQAAccuracyEvaluator:
    method __init__ (line 261) | def __init__(self):
    method eval_pred_list (line 264) | def eval_pred_list(self, pred_list):
  class STVQAANLSEvaluator (line 276) | class STVQAANLSEvaluator:
    method __init__ (line 277) | def __init__(self):
    method get_anls (line 282) | def get_anls(self, s1, s2):
    method eval_pred_list (line 289) | def eval_pred_list(self, pred_list):
  class TextCapsBleu4Evaluator (line 301) | class TextCapsBleu4Evaluator:
    method __init__ (line 302) | def __init__(self):
    method eval_pred_list (line 321) | def eval_pred_list(self, pred_list):

FILE: llavamod/eval/model_qa.py
  class KeywordsStoppingCriteria (line 14) | class KeywordsStoppingCriteria(StoppingCriteria):
    method __init__ (line 15) | def __init__(self, keywords, tokenizer, input_ids):
    method __call__ (line 21) | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTe...
  function eval_model (line 33) | def eval_model(model_name, questions_file, answers_file):

FILE: llavamod/eval/model_vqa.py
  function split_list (line 19) | def split_list(lst, n):
  function get_chunk (line 25) | def get_chunk(lst, n, k):
  function eval_model (line 30) | def eval_model(args):

FILE: llavamod/eval/model_vqa_loader.py
  function split_list (line 20) | def split_list(lst, n):
  function get_chunk (line 26) | def get_chunk(lst, n, k):
  class CustomDataset (line 32) | class CustomDataset(Dataset):
    method __init__ (line 33) | def __init__(self, questions, image_folder, tokenizer, image_processor...
    method __getitem__ (line 40) | def __getitem__(self, index):
    method __len__ (line 61) | def __len__(self):
  function create_data_loader (line 66) | def create_data_loader(questions, image_folder, tokenizer, image_process...
  function eval_model (line 73) | def eval_model(args):

FILE: llavamod/eval/model_vqa_mmbench.py
  function split_list (line 22) | def split_list(lst, n):
  function get_chunk (line 28) | def get_chunk(lst, n, k):
  function is_none (line 33) | def is_none(value):
  function get_options (line 45) | def get_options(row, options):
  function eval_model (line 55) | def eval_model(args):

FILE: llavamod/eval/model_vqa_mmhal.py
  function split_list (line 20) | def split_list(lst, n):
  function get_chunk (line 26) | def get_chunk(lst, n, k):
  function eval_model (line 31) | def eval_model(args):

FILE: llavamod/eval/model_vqa_objhal.py
  function split_list (line 21) | def split_list(lst, n):
  function get_chunk (line 27) | def get_chunk(lst, n, k):
  function eval_model (line 32) | def eval_model(args):

FILE: llavamod/eval/model_vqa_qbench.py
  function load_image (line 18) | def load_image(image_file):
  function eval_model (line 27) | def eval_model(args):

FILE: llavamod/eval/model_vqa_science.py
  function split_list (line 19) | def split_list(lst, n):
  function get_chunk (line 25) | def get_chunk(lst, n, k):
  function eval_model (line 30) | def eval_model(args):

FILE: llavamod/eval/qa_baseline_gpt35.py
  function get_answer (line 16) | def get_answer(question_id: int, question: str, max_tokens: int):

FILE: llavamod/eval/run_llava.py
  function image_parser (line 29) | def image_parser(args):
  function load_image (line 34) | def load_image(image_file):
  function load_images (line 43) | def load_images(image_files):
  function eval_model (line 51) | def eval_model(args):

FILE: llavamod/eval/summarize_gpt_review.py
  function parse_args (line 10) | def parse_args():

FILE: llavamod/eval/webpage/script.js
  function text2Markdown (line 35) | function text2Markdown(text) {
  function capitalizeFirstChar (line 41) | function capitalizeFirstChar(str) {
  function updateQuestionSelect (line 48) | function updateQuestionSelect(question_id) {
  function updateModelSelect (line 64) | function updateModelSelect() {
  function populateModels (line 70) | function populateModels(models) {
  function populateQuestions (line 81) | function populateQuestions(questions) {
  function displayQuestion (line 110) | function displayQuestion(index) {
  function displayAnswers (line 116) | function displayAnswers(index) {
  function switchQuestionAndCategory (line 203) | function switchQuestionAndCategory() {
  function updateExpandButtonVisibility (line 226) | function updateExpandButtonVisibility(card) {

FILE: llavamod/mm_utils.py
  function load_image_from_base64 (line 10) | def load_image_from_base64(image):
  function expand2square (line 14) | def expand2square(pil_img, background_color):
  function process_images (line 28) | def process_images(images, image_processor, model_cfg):
  function tokenizer_image_token (line 43) | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOK...
  function get_model_name_from_path (line 65) | def get_model_name_from_path(model_path):
  class KeywordsStoppingCriteria (line 74) | class KeywordsStoppingCriteria(StoppingCriteria):
    method __init__ (line 75) | def __init__(self, keywords, tokenizer, input_ids):
    method call_for_batch (line 89) | def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.F...
    method __call__ (line 101) | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTe...
  function is_gemma_tokenizer (line 108) | def is_gemma_tokenizer(tokenizer):
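
  Example: tokenizer_image_token splices a sentinel image-token id into the text
  token stream wherever '<image>' occurs, so vision features can later be
  substituted at those positions. A simplified sketch; IMAGE_TOKEN_INDEX = -200
  follows the LLaVA convention (the real constant lives in llavamod/constants.py).

    IMAGE_TOKEN_INDEX = -200   # sentinel, never a real vocab id

    def tokenizer_image_token(prompt, tokenizer,
                              image_token_index=IMAGE_TOKEN_INDEX):
        chunks = [tokenizer(c).input_ids for c in prompt.split("<image>")]
        input_ids = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                input_ids.append(image_token_index)
                # keep a single BOS: strip it from every chunk but the first
                if chunk and chunk[0] == tokenizer.bos_token_id:
                    chunk = chunk[1:]
            input_ids.extend(chunk)
        return input_ids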

FILE: llavamod/model/apply_delta.py
  function apply_delta (line 13) | def apply_delta(base_model_path, target_model_path, delta_path):

FILE: llavamod/model/builder.py
  function load_pretrained_model (line 57) | def load_pretrained_model(model_path, model_base, model_name, load_8bit=...

FILE: llavamod/model/cache_utils.py
  class Cache (line 18) | class Cache(torch.nn.Module):
    method __init__ (line 23) | def __init__(self):
    method update (line 26) | def update(
    method get_seq_length (line 52) | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
    method get_max_length (line 57) | def get_max_length(self) -> Optional[int]:
    method get_usable_length (line 61) | def get_usable_length(self, new_seq_length: int, layer_idx: Optional[i...
    method reorder_cache (line 72) | def reorder_cache(self, beam_idx: torch.LongTensor):
    method seen_tokens (line 81) | def seen_tokens(self):
  class CacheConfig (line 93) | class CacheConfig:
    method from_dict (line 101) | def from_dict(cls, config_dict, **kwargs):
    method to_json_file (line 122) | def to_json_file(self, json_file_path: Union[str, os.PathLike]):
    method to_dict (line 140) | def to_dict(self) -> Dict[str, Any]:
    method __iter__ (line 148) | def __iter__(self):
    method __repr__ (line 154) | def __repr__(self):
    method to_json_string (line 157) | def to_json_string(self):
    method update (line 166) | def update(self, **kwargs):
  class DynamicCache (line 189) | class DynamicCache(Cache):
    method __init__ (line 197) | def __init__(self) -> None:
    method __getitem__ (line 203) | def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
    method __iter__ (line 213) | def __iter__(self):
    method __len__ (line 221) | def __len__(self):
    method update (line 228) | def update(
    method get_seq_length (line 265) | def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
    method get_max_length (line 272) | def get_max_length(self) -> Optional[int]:
    method to_legacy_cache (line 276) | def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Te...
    method from_legacy_cache (line 285) | def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch...
    method crop (line 295) | def crop(self, max_length: int):
    method batch_split (line 310) | def batch_split(self, full_batch_size: int, split_size: int) -> List["...
    method from_batch_splits (line 323) | def from_batch_splits(cls, splits: List["DynamicCache"]) -> "DynamicCa...
    method batch_repeat_interleave (line 333) | def batch_repeat_interleave(self, repeats: int):
    method batch_select_indices (line 339) | def batch_select_indices(self, indices: torch.Tensor):
  class OffloadedCache (line 346) | class OffloadedCache(DynamicCache):
    method __init__ (line 359) | def __init__(self) -> None:
    method prefetch_layer (line 367) | def prefetch_layer(self, layer_idx: int):
    method evict_previous_layer (line 376) | def evict_previous_layer(self, layer_idx: int):
    method __getitem__ (line 384) | def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
    method reorder_cache (line 406) | def reorder_cache(self, beam_idx: torch.LongTensor):
    method update (line 413) | def update(
  class HybridCache (line 458) | class HybridCache(Cache):
    method __init__ (line 479) | def __init__(
    method _sliding_update (line 534) | def _sliding_update(self, cache_position, layer_idx, key_states, value...
    method _static_update (line 562) | def _static_update(self, cache_position, layer_idx, key_states, value_...
    method update (line 570) | def update(
    method get_max_length (line 598) | def get_max_length(self) -> Optional[int]:
    method get_seq_length (line 603) | def get_seq_length(self, layer_idx: Optional[int] = 0):
    method reset (line 606) | def reset(self):
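
  Example: DynamicCache (and the Offloaded/Hybrid variants) grow the per-layer
  KV cache by concatenating new key/value states along the sequence axis. A
  minimal sketch of the update step, mirroring the transformers pattern this
  file adapts:

    import torch

    class TinyDynamicCache:
        """Per-layer key/value tensors, grown along the seq dim each step."""
        def __init__(self):
            self.key_cache, self.value_cache = [], []

        def update(self, key_states, value_states, layer_idx):
            if len(self.key_cache) <= layer_idx:      # first tokens for layer
                self.key_cache.append(key_states)
                self.value_cache.append(value_states)
            else:                                     # append on seq dim (-2)
                self.key_cache[layer_idx] = torch.cat(
                    [self.key_cache[layer_idx], key_states], dim=-2)
                self.value_cache[layer_idx] = torch.cat(
                    [self.value_cache[layer_idx], value_states], dim=-2)
            return self.key_cache[layer_idx], self.value_cache[layer_idx]

        def get_seq_length(self, layer_idx=0):
            if len(self.key_cache) <= layer_idx:
                return 0
            return self.key_cache[layer_idx].shape[-2]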

FILE: llavamod/model/consolidate.py
  function consolidate_ckpt (line 13) | def consolidate_ckpt(src_path, dst_path):

FILE: llavamod/model/import_utils.py
  function _is_package_available (line 16) | def _is_package_available(pkg_name: str, return_version: bool = False) -...
  function is_flash_attn_greater_or_equal (line 50) | def is_flash_attn_greater_or_equal(library_version: str):
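
  Example: these two helpers probe optional dependencies. A sketch with
  importlib plus packaging, assuming the usual HF-style pattern:

    import importlib.metadata
    import importlib.util
    from packaging import version

    def _is_package_available(pkg_name: str, return_version: bool = False):
        exists = importlib.util.find_spec(pkg_name) is not None
        pkg_version = "N/A"
        if exists:
            try:
                pkg_version = importlib.metadata.version(pkg_name)
            except importlib.metadata.PackageNotFoundError:
                exists = False
        return (exists, pkg_version) if return_version else exists

    def is_flash_attn_greater_or_equal(library_version: str) -> bool:
        ok, ver = _is_package_available("flash_attn", return_version=True)
        return ok and version.parse(ver) >= version.parse(library_version)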

FILE: llavamod/model/language_model/gemma2/configuration_gemma2.py
  class Gemma2Config (line 9) | class Gemma2Config(PretrainedConfig):
    method __init__ (line 82) | def __init__(

FILE: llavamod/model/language_model/gemma2/modeling_gemma2.py
  function _prepare_4d_causal_attention_mask_with_cache_position (line 45) | def _prepare_4d_causal_attention_mask_with_cache_position(
  class Gemma2RMSNorm (line 97) | class Gemma2RMSNorm(nn.Module):
    method __init__ (line 98) | def __init__(self, dim: int, eps: float = 1e-6):
    method _norm (line 103) | def _norm(self, x):
    method forward (line 106) | def forward(self, x):
    method extra_repr (line 113) | def extra_repr(self):
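
  Example: Gemma2RMSNorm normalizes by the root-mean-square of the features. A
  sketch of the computation; note Gemma's (1 + weight) affine convention, with
  weight initialized to zeros, unlike Llama's plain multiplicative weight.

    import torch
    import torch.nn as nn

    class RMSNorm(nn.Module):
        def __init__(self, dim: int, eps: float = 1e-6):
            super().__init__()
            self.eps = eps
            self.weight = nn.Parameter(torch.zeros(dim))   # stores weight - 1

        def forward(self, x):
            norm = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
            return norm * (1.0 + self.weight)              # Gemma-style affine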
  class Gemma2RotaryEmbedding (line 117) | class Gemma2RotaryEmbedding(nn.Module):
    method __init__ (line 118) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method forward (line 128) | def forward(self, x, position_ids, seq_len=None):
  function rotate_half (line 145) | def rotate_half(x):
  function apply_rotary_pos_emb (line 152) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di...
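
  Example: rotate_half/apply_rotary_pos_emb are the standard rotary-position-
  embedding pair shared by the Llama/Gemma-family files here. Sketch:

    import torch

    def rotate_half(x):
        # Negate-and-swap the two halves of the last dimension.
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
        # cos/sin: [batch, seq, head_dim]; unsqueeze to broadcast over heads.
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed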
  class Gemma2MLP (line 179) | class Gemma2MLP(nn.Module):
    method __init__ (line 180) | def __init__(self, config):
    method forward (line 190) | def forward(self, x):
  function repeat_kv (line 194) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
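
  Example: repeat_kv expands grouped-query KV heads so every query head has a
  matching key/value head; this mirrors the expand-then-reshape trick in the
  transformers implementations these files vendor.

    import torch

    def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
        """(b, num_kv_heads, s, d) -> (b, num_kv_heads * n_rep, s, d)."""
        b, kv, s, d = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        hidden_states = hidden_states[:, :, None, :, :].expand(b, kv, n_rep, s, d)
        return hidden_states.reshape(b, kv * n_rep, s, d)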
  class Gemma2Attention (line 206) | class Gemma2Attention(nn.Module):
    method __init__ (line 209) | def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = No...
    method forward (line 248) | def forward(
  class Gemma2FlashAttention2 (line 316) | class Gemma2FlashAttention2(Gemma2Attention):
    method __init__ (line 323) | def __init__(self, *args, **kwargs):
    method forward (line 331) | def forward(
  class Gemma2SdpaAttention (line 431) | class Gemma2SdpaAttention(Gemma2Attention):
    method forward (line 439) | def forward(
  class Gemma2DecoderLayer (line 528) | class Gemma2DecoderLayer(nn.Module):
    method __init__ (line 529) | def __init__(self, config: Gemma2Config, layer_idx: int):
    method forward (line 545) | def forward(
  class Gemma2PreTrainedModel (line 634) | class Gemma2PreTrainedModel(PreTrainedModel):
    method _init_weights (line 646) | def _init_weights(self, module):
  class Gemma2Model (line 739) | class Gemma2Model(Gemma2PreTrainedModel):
    method __init__ (line 747) | def __init__(self, config: Gemma2Config):
    method get_input_embeddings (line 762) | def get_input_embeddings(self):
    method set_input_embeddings (line 765) | def set_input_embeddings(self, value):
    method forward (line 769) | def forward(
    method _update_causal_mask (line 895) | def _update_causal_mask(
  class Gemma2ForCausalLM (line 932) | class Gemma2ForCausalLM(Gemma2PreTrainedModel):
    method __init__ (line 935) | def __init__(self, config):
    method get_input_embeddings (line 944) | def get_input_embeddings(self):
    method set_input_embeddings (line 947) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 950) | def get_output_embeddings(self):
    method set_output_embeddings (line 953) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 956) | def set_decoder(self, decoder):
    method get_decoder (line 959) | def get_decoder(self):
    method forward (line 964) | def forward(
    method prepare_inputs_for_generation (line 1048) | def prepare_inputs_for_generation(
  class Gemma2ForSequenceClassification (line 1137) | class Gemma2ForSequenceClassification(Gemma2PreTrainedModel):
    method __init__ (line 1138) | def __init__(self, config):
    method get_input_embeddings (line 1147) | def get_input_embeddings(self):
    method set_input_embeddings (line 1150) | def set_input_embeddings(self, value):
    method forward (line 1154) | def forward(
  class Gemma2ForTokenClassification (line 1252) | class Gemma2ForTokenClassification(Gemma2PreTrainedModel):
    method __init__ (line 1253) | def __init__(self, config):
    method get_input_embeddings (line 1269) | def get_input_embeddings(self):
    method set_input_embeddings (line 1272) | def set_input_embeddings(self, value):
    method forward (line 1276) | def forward(

FILE: llavamod/model/language_model/gemma2/tokenization_gemma2.py
  class GemmaTokenizer (line 38) | class GemmaTokenizer(PreTrainedTokenizer):
    method __init__ (line 88) | def __init__(
    method __getstate__ (line 132) | def __getstate__(self):
    method __setstate__ (line 139) | def __setstate__(self, d):
    method vocab_size (line 146) | def vocab_size(self):
    method get_vocab (line 151) | def get_vocab(self):
    method _tokenize (line 157) | def _tokenize(self, text, **kwargs):
    method _convert_token_to_id (line 164) | def _convert_token_to_id(self, token):
    method _convert_id_to_token (line 169) | def _convert_id_to_token(self, index):
    method _decode (line 174) | def _decode(
    method convert_tokens_to_string (line 203) | def convert_tokens_to_string(self, tokens):
    method save_vocabulary (line 218) | def save_vocabulary(self, save_directory, filename_prefix: Optional[st...
    method build_inputs_with_special_tokens (line 246) | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=No...
    method get_special_tokens_mask (line 258) | def get_special_tokens_mask(
    method create_token_type_ids_from_sequences (line 296) | def create_token_type_ids_from_sequences(

FILE: llavamod/model/language_model/gemma2/tokenization_gemma2_fast.py
  class GemmaTokenizerFast (line 37) | class GemmaTokenizerFast(PreTrainedTokenizerFast):
    method __init__ (line 90) | def __init__(
    method can_save_slow_tokenizer (line 121) | def can_save_slow_tokenizer(self) -> bool:
    method update_post_processor (line 125) | def update_post_processor(self):
    method add_eos_token (line 152) | def add_eos_token(self):
    method add_bos_token (line 156) | def add_bos_token(self):
    method add_eos_token (line 160) | def add_eos_token(self, value):
    method add_bos_token (line 165) | def add_bos_token(self, value):
    method save_vocabulary (line 170) | def save_vocabulary(self, save_directory: str, filename_prefix: Option...
    method build_inputs_with_special_tokens (line 190) | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=No...

FILE: llavamod/model/language_model/llama/configuration_llama.py
  class LlamaConfig (line 31) | class LlamaConfig(PretrainedConfig):
    method __init__ (line 117) | def __init__(
    method _rope_scaling_validation (line 172) | def _rope_scaling_validation(self):

FILE: llavamod/model/language_model/llama/modeling_llama.py
  function _get_unpad_data (line 77) | def _get_unpad_data(attention_mask):
  function _expand_mask (line 89) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
  function _make_causal_mask (line 96) | def _make_causal_mask(
  class LlamaRMSNorm (line 107) | class LlamaRMSNorm(nn.Module):
    method __init__ (line 108) | def __init__(self, hidden_size, eps=1e-6):
    method forward (line 116) | def forward(self, hidden_states):
  class LlamaRotaryEmbedding (line 127) | class LlamaRotaryEmbedding(nn.Module):
    method __init__ (line 128) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 142) | def _set_cos_sin_cache(self, seq_len, device, dtype):
    method forward (line 152) | def forward(self, x, seq_len=None):
  class LlamaLinearScalingRotaryEmbedding (line 163) | class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
    method __init__ (line 166) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 170) | def _set_cos_sin_cache(self, seq_len, device, dtype):
  class LlamaDynamicNTKScalingRotaryEmbedding (line 182) | class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    method __init__ (line 185) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 189) | def _set_cos_sin_cache(self, seq_len, device, dtype):
  function rotate_half (line 208) | def rotate_half(x):
  function apply_rotary_pos_emb (line 215) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
  class LlamaMLP (line 243) | class LlamaMLP(nn.Module):
    method __init__ (line 244) | def __init__(self, config):
    method forward (line 254) | def forward(self, x):
  function repeat_kv (line 277) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class LlamaAttention (line 289) | class LlamaAttention(nn.Module):
    method __init__ (line 292) | def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
    method _init_rope (line 325) | def _init_rope(self):
    method _shape (line 352) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 355) | def forward(
  class LlamaFlashAttention2 (line 460) | class LlamaFlashAttention2(LlamaAttention):
    method __init__ (line 467) | def __init__(self, *args, **kwargs):
    method forward (line 475) | def forward(
    method _flash_attention_forward (line 565) | def _flash_attention_forward(
    method _upad_input (line 624) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
  class LlamaSdpaAttention (line 663) | class LlamaSdpaAttention(LlamaAttention):
    method forward (line 671) | def forward(
  class LlamaDecoderLayer (line 757) | class LlamaDecoderLayer(nn.Module):
    method __init__ (line 758) | def __init__(self, config: LlamaConfig, layer_idx: int):
    method forward (line 768) | def forward(
  class LlamaPreTrainedModel (line 851) | class LlamaPreTrainedModel(PreTrainedModel):
    method _init_weights (line 861) | def _init_weights(self, module):
  class LlamaModel (line 947) | class LlamaModel(LlamaPreTrainedModel):
    method __init__ (line 955) | def __init__(self, config: LlamaConfig):
    method get_input_embeddings (line 972) | def get_input_embeddings(self):
    method set_input_embeddings (line 975) | def set_input_embeddings(self, value):
    method forward (line 979) | def forward(
  class LlamaForCausalLM (line 1110) | class LlamaForCausalLM(LlamaPreTrainedModel):
    method __init__ (line 1113) | def __init__(self, config):
    method get_input_embeddings (line 1122) | def get_input_embeddings(self):
    method set_input_embeddings (line 1125) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 1128) | def get_output_embeddings(self):
    method set_output_embeddings (line 1131) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 1134) | def set_decoder(self, decoder):
    method get_decoder (line 1137) | def get_decoder(self):
    method forward (line 1142) | def forward(
    method prepare_inputs_for_generation (line 1234) | def prepare_inputs_for_generation(
    method _reorder_cache (line 1291) | def _reorder_cache(past_key_values, beam_idx):
  class LlamaForSequenceClassification (line 1315) | class LlamaForSequenceClassification(LlamaPreTrainedModel):
    method __init__ (line 1316) | def __init__(self, config):
    method get_input_embeddings (line 1325) | def get_input_embeddings(self):
    method set_input_embeddings (line 1328) | def set_input_embeddings(self, value):
    method forward (line 1332) | def forward(

FILE: llavamod/model/language_model/llava_gemma2.py
  class LlavaGemma2Config (line 33) | class LlavaGemma2Config(Gemma2Config):
  class LlavaGemma2Model (line 37) | class LlavaGemma2Model(LlavaMetaModel, Gemma2Model):
    method __init__ (line 40) | def __init__(self, config: Gemma2Config):
  class LlavaGemma2ForCausalLM (line 44) | class LlavaGemma2ForCausalLM(Gemma2ForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 47) | def __init__(self, config):
    method get_model (line 56) | def get_model(self):
    method forward (line 59) | def forward(
    method prepare_inputs_for_generation (line 112) | def prepare_inputs_for_generation(

FILE: llavamod/model/language_model/llava_gemma2_moe.py
  function rank0_print (line 48) | def rank0_print(*args):
  class LLaVAMoDGemma2Config (line 53) | class LLaVAMoDGemma2Config(Gemma2Config):
    method __init__ (line 56) | def __init__(self,
  class LLaVAMoDGemma2Model (line 89) | class LLaVAMoDGemma2Model(LlavaMetaModel, Gemma2Model):
    method __init__ (line 92) | def __init__(self, config: Gemma2Config):
  class MoEBaseModelOutputWithPast (line 97) | class MoEBaseModelOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPast (line 106) | class MoECausalLMOutputWithPast(ModelOutput):
  function MoEGemma2DecoderLayer_forward (line 117) | def MoEGemma2DecoderLayer_forward(self):
  function MoEGemma2Model_forward (line 196) | def MoEGemma2Model_forward(self):
  class LLaVAMoDGemma2ForCausalLM (line 333) | class LLaVAMoDGemma2ForCausalLM(Gemma2ForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 336) | def __init__(self, config):
    method get_model (line 345) | def get_model(self):
    method forward (line 348) | def forward(
    method prepare_inputs_for_generation (line 450) | def prepare_inputs_for_generation(
    method initialize_moe_modules (line 472) | def initialize_moe_modules(self, model_args):
  class LLaVAMoDGemma2ForCausalLMFineTune (line 561) | class LLaVAMoDGemma2ForCausalLMFineTune(LLaVAMoDGemma2ForCausalLM):
    method __init__ (line 564) | def __init__(self, config):
    method initialize_moe_modules (line 616) | def initialize_moe_modules(self, model_args):
  class EvalLLaVAMoDGemma2ForCausalLM (line 626) | class EvalLLaVAMoDGemma2ForCausalLM(LLaVAMoDGemma2ForCausalLM):
    method __init__ (line 629) | def __init__(self, config):
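
  Example: the *_moe variants patch selected decoder layers' MLPs with a sparse
  mixture of experts and surface router logits for an auxiliary load-balancing
  loss (hence the MoEBaseModelOutputWithPast / MoECausalLMOutputWithPast
  outputs). A minimal top-k router sketch; the expert count, top_k, and layer
  wiring here are illustrative, not the repo's config fields.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class TopKMoE(nn.Module):
        """Route each token to its top-k experts; return router logits too."""
        def __init__(self, hidden, ffn, num_experts=4, top_k=2):
            super().__init__()
            self.top_k = top_k
            self.gate = nn.Linear(hidden, num_experts, bias=False)
            self.experts = nn.ModuleList(
                nn.Sequential(nn.Linear(hidden, ffn), nn.GELU(),
                              nn.Linear(ffn, hidden))
                for _ in range(num_experts))

        def forward(self, x):                          # x: [tokens, hidden]
            logits = self.gate(x)
            weights = F.softmax(logits, dim=-1)
            topw, topi = weights.topk(self.top_k, dim=-1)
            topw = topw / topw.sum(-1, keepdim=True)   # renormalize over top-k
            out = torch.zeros_like(x)
            for slot in range(self.top_k):
                for e, expert in enumerate(self.experts):
                    mask = topi[:, slot] == e
                    if mask.any():
                        out[mask] += topw[mask, slot, None] * expert(x[mask])
            return out, logits                         # logits feed the aux loss

  A Switch-style auxiliary loss then penalizes imbalance, e.g. num_experts times
  the sum over experts of (mean router probability) * (fraction of tokens routed
  to that expert).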

FILE: llavamod/model/language_model/llava_llama.py
  class LlavaLlamaConfig (line 33) | class LlavaLlamaConfig(LlamaConfig):
  class LlavaLlamaModel (line 37) | class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
    method __init__ (line 40) | def __init__(self, config: LlamaConfig):
  class LlavaLlamaForCausalLM (line 44) | class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 47) | def __init__(self, config):
    method get_model (line 57) | def get_model(self):
    method forward (line 60) | def forward(
    method prepare_inputs_for_generation (line 112) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...

FILE: llavamod/model/language_model/llava_llama_moe.py
  function rank0_print (line 47) | def rank0_print(*args):
  class LLaVAMoDLlamaConfig (line 52) | class LLaVAMoDLlamaConfig(LlamaConfig):
    method __init__ (line 55) | def __init__(self,
  class LLaVAMoDLlamaModel (line 87) | class LLaVAMoDLlamaModel(LlavaMetaModel, LlamaModel):
    method __init__ (line 90) | def __init__(self, config: LlamaConfig):
  class MoEBaseModelOutputWithPast (line 95) | class MoEBaseModelOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPast (line 104) | class MoECausalLMOutputWithPast(ModelOutput):
  function MoELlamaDecoderLayer_forward_before (line 115) | def MoELlamaDecoderLayer_forward_before(self):
  function MoELlamaModel_forward_before (line 170) | def MoELlamaModel_forward_before(self):
  function MoELlamaDecoderLayer_forward (line 316) | def MoELlamaDecoderLayer_forward(self):
  function MoELlamaModel_forward (line 388) | def MoELlamaModel_forward(self):
  class LLaVAMoDLlamaForCausalLM (line 535) | class LLaVAMoDLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 538) | def __init__(self, config):
    method get_model (line 548) | def get_model(self):
    method forward (line 551) | def forward(
    method prepare_inputs_for_generation (line 653) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...
    method initialize_moe_modules (line 662) | def initialize_moe_modules(self, model_args):
  class EvalLLaVAMoDLlamaForCausalLM (line 742) | class EvalLLaVAMoDLlamaForCausalLM(LLaVAMoDLlamaForCausalLM):
    method __init__ (line 745) | def __init__(self, config):

FILE: llavamod/model/language_model/llava_minicpm.py
  class LlavaMiniCPMConfig (line 32) | class LlavaMiniCPMConfig(MiniCPMConfig):
  class LlavaMiniCPMModel (line 36) | class LlavaMiniCPMModel(LlavaMetaModel, MiniCPMModel):
    method __init__ (line 39) | def __init__(self, config: MiniCPMConfig):
  class LlavaMiniCPMForCausalLM (line 43) | class LlavaMiniCPMForCausalLM(MiniCPMForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 46) | def __init__(self, config):
    method get_model (line 55) | def get_model(self):
    method forward (line 58) | def forward(
    method prepare_inputs_for_generation (line 112) | def prepare_inputs_for_generation(

FILE: llavamod/model/language_model/llava_minicpm_moe.py
  function rank0_print (line 46) | def rank0_print(*args):
  class LLaVAMoDMiniCPMConfig (line 51) | class LLaVAMoDMiniCPMConfig(MiniCPMConfig):
    method __init__ (line 54) | def __init__(self,
  class LLaVAMoDMiniCPMModel (line 87) | class LLaVAMoDMiniCPMModel(LlavaMetaModel, MiniCPMModel):
    method __init__ (line 90) | def __init__(self, config: MiniCPMConfig):
  class MoEBaseModelOutputWithPast (line 95) | class MoEBaseModelOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPast (line 104) | class MoECausalLMOutputWithPast(ModelOutput):
  function MoEMiniCPMDecoderLayer_forward (line 114) | def MoEMiniCPMDecoderLayer_forward(self):
  function MoEMiniCPMModel_forward (line 172) | def MoEMiniCPMModel_forward(self):
  class LLaVAMoDMiniCPMForCausalLM (line 316) | class LLaVAMoDMiniCPMForCausalLM(MiniCPMForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 319) | def __init__(self, config):
    method get_model (line 328) | def get_model(self):
    method forward (line 331) | def forward(
    method prepare_inputs_for_generation (line 426) | def prepare_inputs_for_generation(
    method initialize_moe_modules (line 448) | def initialize_moe_modules(self, model_args):
  class EvalLLaVAMoDMiniCPMForCausalLM (line 542) | class EvalLLaVAMoDMiniCPMForCausalLM(LLaVAMoDMiniCPMForCausalLM):
    method __init__ (line 545) | def __init__(self, config):

FILE: llavamod/model/language_model/llava_mistral.py
  class LlavaMistralConfig (line 31) | class LlavaMistralConfig(MistralConfig):
  class LlavaMistralModel (line 35) | class LlavaMistralModel(LlavaMetaModel, MistralModel):
    method __init__ (line 38) | def __init__(self, config: MistralConfig):
  class LlavaMistralForCausalLM (line 42) | class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 45) | def __init__(self, config):
    method get_model (line 55) | def get_model(self):
    method forward (line 58) | def forward(
    method prepare_inputs_for_generation (line 111) | def prepare_inputs_for_generation(

FILE: llavamod/model/language_model/llava_mistral_moe.py
  function rank0_print (line 42) | def rank0_print(*args):
  class LLaVAMoDMistralConfig (line 47) | class LLaVAMoDMistralConfig(MistralConfig):
    method __init__ (line 50) | def __init__(self,
  class LLaVAMoDMistralModel (line 82) | class LLaVAMoDMistralModel(LlavaMetaModel, MistralModel):
    method __init__ (line 85) | def __init__(self, config: MistralConfig):
  class MoEBaseModelOutputWithPast (line 90) | class MoEBaseModelOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPast (line 99) | class MoECausalLMOutputWithPast(ModelOutput):
  function MoEMistralDecoderLayer_forward (line 109) | def MoEMistralDecoderLayer_forward(self):
  function MoEMistralModel_forward (line 164) | def MoEMistralModel_forward(self):
  class LLaVAMoDMistralForCausalLM (line 323) | class LLaVAMoDMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 326) | def __init__(self, config):
    method get_model (line 336) | def get_model(self):
    method forward (line 339) | def forward(
    method prepare_inputs_for_generation (line 440) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...
    method initialize_moe_modules (line 449) | def initialize_moe_modules(self, model_args):
  class EvalLLaVAMoDMistralForCausalLM (line 532) | class EvalLLaVAMoDMistralForCausalLM(LLaVAMoDMistralForCausalLM):
    method __init__ (line 535) | def __init__(self, config):

FILE: llavamod/model/language_model/llava_mpt.py
  class LlavaMPTConfig (line 31) | class LlavaMPTConfig(MPTConfig):
  class LlavaMPTModel (line 35) | class LlavaMPTModel(LlavaMetaModel, MPTModel):
    method __init__ (line 38) | def __init__(self, config: MPTConfig):
    method embed_tokens (line 42) | def embed_tokens(self, x):
  class LlavaMPTForCausalLM (line 46) | class LlavaMPTForCausalLM(MPTForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 50) | def __init__(self, config):
    method get_model (line 66) | def get_model(self):
    method _set_gradient_checkpointing (line 69) | def _set_gradient_checkpointing(self, module, value=False):
    method forward (line 73) | def forward(self, input_ids: torch.LongTensor, past_key_values: Option...
    method prepare_inputs_for_generation (line 92) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...

FILE: llavamod/model/language_model/llava_phi.py
  class LlavaPhiConfig (line 32) | class LlavaPhiConfig(PhiConfig):
  class LlavaPhiModel (line 36) | class LlavaPhiModel(LlavaMetaModel, PhiModel):
    method __init__ (line 39) | def __init__(self, config: PhiConfig):
  class LlavaPhiForCausalLM (line 43) | class LlavaPhiForCausalLM(PhiForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 46) | def __init__(self, config):
    method get_model (line 55) | def get_model(self):
    method forward (line 58) | def forward(
    method prepare_inputs_for_generation (line 112) | def prepare_inputs_for_generation(

FILE: llavamod/model/language_model/llava_phi_moe.py
  function rank0_print (line 44) | def rank0_print(*args):
  class LLaVAMoDPhiConfig (line 49) | class LLaVAMoDPhiConfig(PhiConfig):
    method __init__ (line 52) | def __init__(self,
  class LLaVAMoDPhiModel (line 84) | class LLaVAMoDPhiModel(LlavaMetaModel, PhiModel):
    method __init__ (line 87) | def __init__(self, config: PhiConfig):
  class MoEBaseModelOutputWithPast (line 92) | class MoEBaseModelOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPast (line 101) | class MoECausalLMOutputWithPast(ModelOutput):
  function MoEPhiDecoderLayer_forward (line 111) | def MoEPhiDecoderLayer_forward(self):
  function MoEPhiModel_forward (line 158) | def MoEPhiModel_forward(self):
  class LLaVAMoDPhiForCausalLM (line 297) | class LLaVAMoDPhiForCausalLM(PhiForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 300) | def __init__(self, config):
    method get_model (line 309) | def get_model(self):
    method forward (line 312) | def forward(
    method prepare_inputs_for_generation (line 407) | def prepare_inputs_for_generation(
    method initialize_moe_modules (line 429) | def initialize_moe_modules(self, model_args):
  class EvalLLaVAMoDPhiForCausalLM (line 512) | class EvalLLaVAMoDPhiForCausalLM(LLaVAMoDPhiForCausalLM):
    method __init__ (line 515) | def __init__(self, config):

FILE: llavamod/model/language_model/llava_qwen.py
  class LlavaQWenConfig (line 35) | class LlavaQWenConfig(QWenConfig):
  class LlavaQWenModel (line 39) | class LlavaQWenModel(LlavaMetaModel, QWenModel):
    method __init__ (line 42) | def __init__(self, config: QWenConfig):
    method embed_tokens (line 45) | def embed_tokens(self, input_ids):
  class LlavaQWenForCausalLM (line 49) | class LlavaQWenForCausalLM(QWenLMHeadModel, LlavaQWenMetaForCausalLM):
    method __init__ (line 52) | def __init__(self, config):
    method get_model (line 118) | def get_model(self):
    method forward (line 121) | def forward(
    method prepare_inputs_for_generation (line 190) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...

FILE: llavamod/model/language_model/llava_qwen1_5.py
  function rank0_print (line 33) | def rank0_print(*args, **kwargs):
  class LlavaQwen1_5Config (line 45) | class LlavaQwen1_5Config(Qwen2Config):
  class LlavaQwen1_5Model (line 49) | class LlavaQwen1_5Model(LlavaMetaModel, Qwen2Model):
    method __init__ (line 52) | def __init__(self, config: Qwen2Config):
  class LlavaQwen1_5ForCausalLM (line 56) | class LlavaQwen1_5ForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 59) | def __init__(self, config):
    method get_model (line 68) | def get_model(self):
    method forward (line 71) | def forward(
    method prepare_inputs_for_generation (line 147) | def prepare_inputs_for_generation(

FILE: llavamod/model/language_model/llava_qwen1_5_moe.py
  function rank0_print (line 43) | def rank0_print(*args):
  class LLaVAMoDQwen1_5Config (line 48) | class LLaVAMoDQwen1_5Config(Qwen2Config):
    method __init__ (line 51) | def __init__(self,
  class LLaVAMoDQwen1_5Model (line 84) | class LLaVAMoDQwen1_5Model(LlavaMetaModel, Qwen2Model):
    method __init__ (line 87) | def __init__(self, config: Qwen2Config):
  class MoEBaseModelOutputWithPast (line 92) | class MoEBaseModelOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPast (line 101) | class MoECausalLMOutputWithPast(ModelOutput):
  function MoEQwen1_5DecoderLayer_forward (line 112) | def MoEQwen1_5DecoderLayer_forward(self):
  function MoEQwen1_5Model_forward (line 184) | def MoEQwen1_5Model_forward(self):
  class LLaVAMoDQwen1_5ForCausalLM (line 342) | class LLaVAMoDQwen1_5ForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 345) | def __init__(self, config):
    method get_model (line 354) | def get_model(self):
    method forward (line 357) | def forward(
    method prepare_inputs_for_generation (line 453) | def prepare_inputs_for_generation(
    method initialize_moe_modules (line 475) | def initialize_moe_modules(self, model_args):
  class LLaVAMoDQwen1_5ForCausalLMFineTune (line 564) | class LLaVAMoDQwen1_5ForCausalLMFineTune(LLaVAMoDQwen1_5ForCausalLM):
    method __init__ (line 567) | def __init__(self, config):
    method initialize_moe_modules (line 619) | def initialize_moe_modules(self, model_args):
  class EvalLLaVAMoDQwen1_5ForCausalLM (line 629) | class EvalLLaVAMoDQwen1_5ForCausalLM(LLaVAMoDQwen1_5ForCausalLM):
    method __init__ (line 632) | def __init__(self, config):

FILE: llavamod/model/language_model/llava_qwen2.py
  class LlavaQwen2Config (line 31) | class LlavaQwen2Config(Qwen2Config):
  class LlavaQwen2Model (line 35) | class LlavaQwen2Model(LlavaMetaModel, Qwen2Model):
    method __init__ (line 38) | def __init__(self, config: Qwen2Config):
  class LlavaQwen2ForCausalLM (line 42) | class LlavaQwen2ForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 45) | def __init__(self, config):
    method get_model (line 54) | def get_model(self):
    method forward (line 57) | def forward(
    method prepare_inputs_for_generation (line 110) | def prepare_inputs_for_generation(

FILE: llavamod/model/language_model/llava_qwen2_moe.py
  function rank0_print (line 43) | def rank0_print(*args):
  class LLaVAMoDQwen2Config (line 48) | class LLaVAMoDQwen2Config(Qwen2Config):
    method __init__ (line 51) | def __init__(self,
  class LLaVAMoDQwen2Model (line 84) | class LLaVAMoDQwen2Model(LlavaMetaModel, Qwen2Model):
    method __init__ (line 87) | def __init__(self, config: Qwen2Config):
  class MoEBaseModelOutputWithPast (line 92) | class MoEBaseModelOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPast (line 101) | class MoECausalLMOutputWithPast(ModelOutput):
  function MoEQwen2DecoderLayer_forward (line 112) | def MoEQwen2DecoderLayer_forward(self):
  function MoEQwen2Model_forward (line 184) | def MoEQwen2Model_forward(self):
  class LLaVAMoDQwen2ForCausalLM (line 342) | class LLaVAMoDQwen2ForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 345) | def __init__(self, config):
    method get_model (line 354) | def get_model(self):
    method forward (line 357) | def forward(
    method prepare_inputs_for_generation (line 453) | def prepare_inputs_for_generation(
    method initialize_moe_modules (line 475) | def initialize_moe_modules(self, model_args):
  class LLaVAMoDQwen2ForCausalLMFineTune (line 564) | class LLaVAMoDQwen2ForCausalLMFineTune(LLaVAMoDQwen2ForCausalLM):
    method __init__ (line 567) | def __init__(self, config):
    method initialize_moe_modules (line 619) | def initialize_moe_modules(self, model_args):
  class EvalLLaVAMoDQwen2ForCausalLM (line 629) | class EvalLLaVAMoDQwen2ForCausalLM(LLaVAMoDQwen2ForCausalLM):
    method __init__ (line 632) | def __init__(self, config):

FILE: llavamod/model/language_model/llava_qwen_moe.py
  function rank0_print (line 38) | def rank0_print(*args):
  class LLaVAMoDQWenConfig (line 43) | class LLaVAMoDQWenConfig(QWenConfig):
    method __init__ (line 46) | def __init__(self,
  class LLaVAMoDQWenModel (line 78) | class LLaVAMoDQWenModel(LlavaMetaModel, QWenModel):
    method __init__ (line 81) | def __init__(self, config: QWenConfig):
    method embed_tokens (line 84) | def embed_tokens(self, input_ids):
  class MoEBaseModelOutputWithPast (line 89) | class MoEBaseModelOutputWithPast(BaseModelOutputWithPast):
  class MoECausalLMOutputWithPast (line 98) | class MoECausalLMOutputWithPast(CausalLMOutputWithPast):
  function MoEQWenBlock_forward (line 109) | def MoEQWenBlock_forward(self):
  function MoEQWenModel_forward (line 162) | def MoEQWenModel_forward(self):
  class LLaVAMoDQWenForCausalLM (line 360) | class LLaVAMoDQWenForCausalLM(QWenLMHeadModel, LlavaQWenMetaForCausalLM):
    method __init__ (line 363) | def __init__(self, config):
    method get_model (line 429) | def get_model(self):
    method forward (line 432) | def forward(
    method prepare_inputs_for_generation (line 537) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...
    method initialize_moe_modules (line 546) | def initialize_moe_modules(self, model_args):
  class LLaVAMoDQWenForCausalLMFineTune (line 626) | class LLaVAMoDQWenForCausalLMFineTune(LLaVAMoDQWenForCausalLM):
    method __init__ (line 629) | def __init__(self, config):
    method initialize_moe_modules (line 658) | def initialize_moe_modules(self, model_args):
  class EvalLLaVAMoDQWenForCausalLM (line 668) | class EvalLLaVAMoDQWenForCausalLM(LLaVAMoDQWenForCausalLM):
    method __init__ (line 671) | def __init__(self, config):

FILE: llavamod/model/language_model/llava_stablelm.py
  class LlavaStablelmConfig (line 32) | class LlavaStablelmConfig(StableLMEpochConfig):
  class LlavaStablelmModel (line 36) | class LlavaStablelmModel(LlavaMetaModel, StableLMEpochModel):
    method __init__ (line 39) | def __init__(self, config: StableLMEpochConfig):
  class LlavaStablelmForCausalLM (line 43) | class LlavaStablelmForCausalLM(StableLMEpochForCausalLM, LlavaMetaForCau...
    method __init__ (line 46) | def __init__(self, config):
    method get_model (line 55) | def get_model(self):
    method forward (line 58) | def forward(
    method prepare_inputs_for_generation (line 111) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...

FILE: llavamod/model/language_model/llava_stablelm_moe.py
  function rank0_print (line 44) | def rank0_print(*args):
  class LLaVAMoDStablelmConfig (line 49) | class LLaVAMoDStablelmConfig(StableLMEpochConfig):
    method __init__ (line 52) | def __init__(self,
  class LLaVAMoDStablelmModel (line 84) | class LLaVAMoDStablelmModel(LlavaMetaModel, StableLMEpochModel):
    method __init__ (line 87) | def __init__(self, config: StableLMEpochConfig):
  class MoEBaseModelOutputWithPast (line 92) | class MoEBaseModelOutputWithPast(ModelOutput):
  class MoECausalLMOutputWithPast (line 101) | class MoECausalLMOutputWithPast(ModelOutput):
  function MoEStablelmDecoderLayer_forward (line 111) | def MoEStablelmDecoderLayer_forward(self):
  function MoEStablelmModel_forward (line 166) | def MoEStablelmModel_forward(self):
  class LLaVAMoDStablelmForCausalLM (line 311) | class LLaVAMoDStablelmForCausalLM(StableLMEpochForCausalLM, LlavaMetaFor...
    method __init__ (line 314) | def __init__(self, config):
    method get_model (line 323) | def get_model(self):
    method forward (line 326) | def forward(
    method prepare_inputs_for_generation (line 421) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...
    method initialize_moe_modules (line 430) | def initialize_moe_modules(self, model_args):
  class EvalLLaVAMoDStablelmForCausalLM (line 513) | class EvalLLaVAMoDStablelmForCausalLM(LLaVAMoDStablelmForCausalLM):
    method __init__ (line 516) | def __init__(self, config):

FILE: llavamod/model/language_model/minicpm/configuration_minicpm.py
  class MiniCPMConfig (line 31) | class MiniCPMConfig(PretrainedConfig):
    method __init__ (line 117) | def __init__(
    method _rope_scaling_validation (line 178) | def _rope_scaling_validation(self):

FILE: llavamod/model/language_model/minicpm/modeling_minicpm.py
  function _get_unpad_data (line 74) | def _get_unpad_data(attention_mask):
  function _expand_mask (line 86) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
  function _make_causal_mask (line 93) | def _make_causal_mask(
  function rms_layernorm (line 104) | def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
  class MiniCPMRMSNorm (line 111) | class MiniCPMRMSNorm(nn.Module):
    method __init__ (line 112) | def __init__(self, hidden_size, eps=1e-6):
    method forward (line 120) | def forward(self, hidden_states):
  class MiniCPMRotaryEmbedding (line 127) | class MiniCPMRotaryEmbedding(nn.Module):
    method __init__ (line 128) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 143) | def _set_cos_sin_cache(self, seq_len, device, dtype):
    method forward (line 153) | def forward(self, x, seq_len=None):
  class MiniCPMLinearScalingRotaryEmbedding (line 164) | class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
    method __init__ (line 167) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 171) | def _set_cos_sin_cache(self, seq_len, device, dtype):
  class MiniCPMDynamicNTKScalingRotaryEmbedding (line 183) | class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
    method __init__ (line 186) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 190) | def _set_cos_sin_cache(self, seq_len, device, dtype):
  function rotate_half (line 210) | def rotate_half(x):
  function apply_rotary_pos_emb (line 217) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
  class MiniCPMMLP (line 251) | class MiniCPMMLP(nn.Module):
    method __init__ (line 252) | def __init__(self, config):
    method forward (line 262) | def forward(self, x):
  function repeat_kv (line 285) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class MiniCPMAttention (line 298) | class MiniCPMAttention(nn.Module):
    method __init__ (line 301) | def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = N...
    method _init_rope (line 334) | def _init_rope(self):
    method _shape (line 361) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 364) | def forward(
  class MiniCPMFlashAttention2 (line 469) | class MiniCPMFlashAttention2(MiniCPMAttention):
    method __init__ (line 476) | def __init__(self, *args, **kwargs):
    method forward (line 484) | def forward(
    method _flash_attention_forward (line 572) | def _flash_attention_forward(
    method _upad_input (line 629) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
  class MiniCPMSdpaAttention (line 668) | class MiniCPMSdpaAttention(MiniCPMAttention):
    method forward (line 676) | def forward(
  class MiniCPMDecoderLayer (line 762) | class MiniCPMDecoderLayer(nn.Module):
    method __init__ (line 763) | def __init__(self, config: MiniCPMConfig, layer_idx: int):
    method forward (line 776) | def forward(
  class MiniCPMPreTrainedModel (line 859) | class MiniCPMPreTrainedModel(PreTrainedModel):
    method _init_weights (line 869) | def _init_weights(self, module):
  class MiniCPMModel (line 955) | class MiniCPMModel(MiniCPMPreTrainedModel):
    method __init__ (line 963) | def __init__(self, config: MiniCPMConfig):
    method get_input_embeddings (line 981) | def get_input_embeddings(self):
    method set_input_embeddings (line 984) | def set_input_embeddings(self, value):
    method forward (line 988) | def forward(
  class MiniCPMForCausalLM (line 1119) | class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
    method __init__ (line 1122) | def __init__(self, config):
    method get_input_embeddings (line 1131) | def get_input_embeddings(self):
    method set_input_embeddings (line 1134) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 1137) | def get_output_embeddings(self):
    method set_output_embeddings (line 1140) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 1143) | def set_decoder(self, decoder):
    method get_decoder (line 1146) | def get_decoder(self):
    method forward (line 1151) | def forward(
    method prepare_inputs_for_generation (line 1242) | def prepare_inputs_for_generation(
    method _reorder_cache (line 1299) | def _reorder_cache(past_key_values, beam_idx):
    method chat (line 1308) | def chat(self, tokenizer, query: str, history: List[Dict] = None, role...
  class MiniCPMForSequenceClassification (line 1349) | class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
    method __init__ (line 1350) | def __init__(self, config):
    method get_input_embeddings (line 1359) | def get_input_embeddings(self):
    method set_input_embeddings (line 1362) | def set_input_embeddings(self, value):
    method forward (line 1366) | def forward(

FILE: llavamod/model/language_model/mpt/adapt_tokenizer.py
  function adapt_tokenizer_for_denoising (line 6) | def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
  class AutoTokenizerForMOD (line 25) | class AutoTokenizerForMOD(AutoTokenizer):
    method from_pretrained (line 37) | def from_pretrained(cls, *args, **kwargs):

FILE: llavamod/model/language_model/mpt/attention.py
  function _reset_is_causal (line 12) | def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, origina...
  function scaled_multihead_dot_product_attention (line 20) | def scaled_multihead_dot_product_attention(query, key, value, n_heads, p...
  function check_valid_inputs (line 64) | def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bflo...
  function flash_attn_fn (line 71) | def flash_attn_fn(query, key, value, n_heads, past_key_value=None, softm...
  function triton_flash_attn_fn (line 107) | def triton_flash_attn_fn(query, key, value, n_heads, past_key_value=None...
  class MultiheadAttention (line 151) | class MultiheadAttention(nn.Module):
    method __init__ (line 158) | def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton'...
    method forward (line 191) | def forward(self, x, past_key_value=None, attn_bias=None, attention_ma...
  class MultiQueryAttention (line 204) | class MultiQueryAttention(nn.Module):
    method __init__ (line 211) | def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton'...
    method forward (line 245) | def forward(self, x, past_key_value=None, attn_bias=None, attention_ma...
  function attn_bias_shape (line 258) | def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causa...
  function build_attn_bias (line 272) | def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False...
  function gen_slopes (line 283) | def gen_slopes(n_heads, alibi_bias_max=8, device=None):
  function build_alibi_bias (line 292) | def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, dev...
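
  Example: gen_slopes/build_alibi_bias implement ALiBi: each head gets a slope
  from a geometric series, and attention scores are penalized linearly with key
  distance. Sketch of the standard MPT formulation:

    import math
    import torch

    def gen_slopes(n_heads, alibi_bias_max=8):
        _n = 2 ** math.ceil(math.log2(n_heads))     # round up to a power of 2
        m = torch.arange(1, _n + 1).float()
        slopes = 1.0 / torch.pow(2, m * alibi_bias_max / _n)
        if _n != n_heads:                           # interleave when not 2**k
            slopes = torch.cat([slopes[1::2], slopes[0::2]])[:n_heads]
        return slopes.view(1, n_heads, 1, 1)

    def build_alibi_bias(n_heads, seq_len, alibi_bias_max=8):
        # More negative the farther a key sits behind the last query position.
        distance = torch.arange(1 - seq_len, 1).view(1, 1, 1, seq_len)
        return distance * gen_slopes(n_heads, alibi_bias_max)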

FILE: llavamod/model/language_model/mpt/blocks.py
  class MPTMLP (line 8) | class MPTMLP(nn.Module):
    method __init__ (line 10) | def __init__(self, d_model: int, expansion_ratio: int, device: Optiona...
    method forward (line 17) | def forward(self, x):
  class MPTBlock (line 20) | class MPTBlock(nn.Module):
    method __init__ (line 22) | def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, a...
    method forward (line 34) | def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torc...

FILE: llavamod/model/language_model/mpt/configuration_mpt.py
  class MPTConfig (line 7) | class MPTConfig(PretrainedConfig):
    method __init__ (line 10) | def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=2...
    method _set_config_defaults (line 90) | def _set_config_defaults(self, config, config_defaults):
    method _validate_config (line 96) | def _validate_config(self):

FILE: llavamod/model/language_model/mpt/custom_embedding.py
  class SharedEmbedding (line 6) | class SharedEmbedding(nn.Embedding):
    method forward (line 8) | def forward(self, input: Tensor, unembed: bool=False) -> Tensor:

FILE: llavamod/model/language_model/mpt/flash_attn_triton.py
  function _fwd_kernel (line 51) | def _fwd_kernel(Q, K, V, Bias, Out, Lse, TMP, softmax_scale, stride_qb, ...
  function _bwd_preprocess_do_o_dot (line 155) | def _bwd_preprocess_do_o_dot(Out, DO, Delta, stride_ob, stride_oh, strid...
  function _bwd_store_dk_dv (line 168) | def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k,...
  function _bwd_kernel_one_col_block (line 184) | def _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LS...
  function init_to_zero (line 300) | def init_to_zero(name):
  function _bwd_kernel (line 306) | def _bwd_kernel(Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, st...
  function _flash_attn_forward (line 329) | def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=...
  function _flash_attn_backward (line 366) | def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, cau...
  class FlashAttnQKVPackedFunc (line 401) | class FlashAttnQKVPackedFunc(torch.autograd.Function):
    method forward (line 404) | def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
    method backward (line 419) | def backward(ctx, do):
  class FlashAttnKVPackedFunc (line 428) | class FlashAttnKVPackedFunc(torch.autograd.Function):
    method forward (line 431) | def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
    method backward (line 446) | def backward(ctx, do):
  class FlashAttnFunc (line 457) | class FlashAttnFunc(torch.autograd.Function):
    method forward (line 460) | def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
    method backward (line 475) | def backward(ctx, do):

FILE: llavamod/model/language_model/mpt/hf_prefixlm_converter.py
  function _convert_gpt_causal_lm_to_prefix_lm (line 29) | def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUS...
  function _convert_bloom_causal_lm_to_prefix_lm (line 113) | def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> Bl...
  function _convert_opt_causal_lm_to_prefix_lm (line 269) | def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTFor...
  function convert_hf_causal_lm_to_prefix_lm (line 335) | def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_...
  function add_bidirectional_mask_if_missing (line 401) | def add_bidirectional_mask_if_missing(batch: Dict[str, Any]):
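
  Example: these converters rewire causal LMs so the prompt (prefix) attends
  bidirectionally while generated tokens stay causal. A sketch of the combined
  mask (True = may attend):

    import torch

    def prefix_lm_mask(seq_len: int, prefix_len: int) -> torch.Tensor:
        i = torch.arange(seq_len).unsqueeze(1)   # query positions
        j = torch.arange(seq_len).unsqueeze(0)   # key positions
        causal = j <= i
        bidirectional_prefix = (i < prefix_len) & (j < prefix_len)
        return causal | bidirectional_prefix

  prefix_lm_mask(5, 3) lets tokens 0-2 see each other fully while tokens 3-4
  remain strictly causal.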

FILE: llavamod/model/language_model/mpt/meta_init_context.py
  function init_empty_weights (line 6) | def init_empty_weights(include_buffers: bool=False):
  function init_on_device (line 37) | def init_on_device(device: torch.device, include_buffers: bool=False):
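
  Example: init_empty_weights builds a model skeleton without allocating real
  storage by registering parameters on the 'meta' device (the accelerate-style
  trick). A simplified sketch:

    from contextlib import contextmanager
    import torch.nn as nn

    @contextmanager
    def init_empty_weights():
        old_register = nn.Module.register_parameter

        def register_on_meta(module, name, param):
            old_register(module, name, param)
            if param is not None:
                module._parameters[name] = nn.Parameter(
                    module._parameters[name].to("meta"),
                    requires_grad=param.requires_grad)

        nn.Module.register_parameter = register_on_meta
        try:
            yield
        finally:
            nn.Module.register_parameter = old_register

    with init_empty_weights():
        skeleton = nn.Linear(4096, 4096)   # shapes only, no memory allocated
    print(skeleton.weight.device)          # meta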

FILE: llavamod/model/language_model/mpt/modeling_mpt.py
  class MPTPreTrainedModel (line 28) | class MPTPreTrainedModel(PreTrainedModel):
  class MPTModel (line 33) | class MPTModel(MPTPreTrainedModel):
    method __init__ (line 35) | def __init__(self, config: MPTConfig):
    method get_input_embeddings (line 81) | def get_input_embeddings(self):
    method set_input_embeddings (line 84) | def set_input_embeddings(self, value):
    method _attn_bias (line 88) | def _attn_bias(self, device, dtype, attention_mask: Optional[torch.Byt...
    method _apply_prefix_mask (line 119) | def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: tor...
    method _apply_sequence_id (line 134) | def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: tor...
    method forward (line 144) | def forward(self, input_ids: torch.LongTensor, past_key_values: Option...
    method param_init_fn (line 222) | def param_init_fn(self, module):
    method fsdp_wrap_fn (line 226) | def fsdp_wrap_fn(self, module):
    method activation_checkpointing_fn (line 229) | def activation_checkpointing_fn(self, module):
  class MPTForCausalLM (line 232) | class MPTForCausalLM(MPTPreTrainedModel):
    method __init__ (line 234) | def __init__(self, config: MPTConfig):
    method get_input_embeddings (line 255) | def get_input_embeddings(self):
    method set_input_embeddings (line 258) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 261) | def get_output_embeddings(self):
    method set_output_embeddings (line 264) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 267) | def set_decoder(self, decoder):
    method get_decoder (line 270) | def get_decoder(self):
    method forward (line 273) | def forward(self, input_ids: torch.LongTensor, past_key_values: Option...
    method param_init_fn (line 291) | def param_init_fn(self, module):
    method fsdp_wrap_fn (line 295) | def fsdp_wrap_fn(self, module):
    method activation_checkpointing_fn (line 298) | def activation_checkpointing_fn(self, module):
    method prepare_inputs_for_generation (line 301) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...
    method _reorder_cache (line 322) | def _reorder_cache(past_key_values, beam_idx):

FILE: llavamod/model/language_model/mpt/norm.py
  function _cast_if_autocast_enabled (line 3) | def _cast_if_autocast_enabled(tensor):
  class LPLayerNorm (line 14) | class LPLayerNorm(torch.nn.LayerNorm):
    method __init__ (line 16) | def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=Tru...
    method forward (line 19) | def forward(self, x):
  function rms_norm (line 27) | def rms_norm(x, weight=None, eps=1e-05):
  class RMSNorm (line 33) | class RMSNorm(torch.nn.Module):
    method __init__ (line 35) | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=Non...
    method forward (line 43) | def forward(self, x):
  class LPRMSNorm (line 46) | class LPRMSNorm(RMSNorm):
    method __init__ (line 48) | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=Non...
    method forward (line 51) | def forward(self, x):

FILE: llavamod/model/language_model/mpt/param_init_fns.py
  function torch_default_param_init_fn_ (line 10) | def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kw...
  function fused_init_helper_ (line 17) | def fused_init_helper_(module: nn.Module, init_fn_):
  function generic_param_init_fn_ (line 28) | def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d...
  function _normal_init_ (line 121) | def _normal_init_(std, mean=0.0):
  function _normal_param_init_fn_ (line 124) | def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int,...
  function baseline_param_init_fn_ (line 131) | def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers...
  function small_param_init_fn_ (line 137) | def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int,...
  function neox_param_init_fn_ (line 142) | def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, ...
  function kaiming_uniform_param_init_fn_ (line 155) | def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_m...
  function kaiming_normal_param_init_fn_ (line 162) | def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_mo...
  function xavier_uniform_param_init_fn_ (line 169) | def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_mo...
  function xavier_normal_param_init_fn_ (line 176) | def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_mod...

FILE: llavamod/model/language_model/phi/configuration_phi.py
  class PhiConfig (line 30) | class PhiConfig(PretrainedConfig):
    method __init__ (line 117) | def __init__(
    method _rope_scaling_validation (line 174) | def _rope_scaling_validation(self):

FILE: llavamod/model/language_model/phi/modeling_phi.py
  function _get_unpad_data (line 69) | def _get_unpad_data(attention_mask):
  class PhiRotaryEmbedding (line 82) | class PhiRotaryEmbedding(nn.Module):
    method __init__ (line 83) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 97) | def _set_cos_sin_cache(self, seq_len, device, dtype):
    method forward (line 107) | def forward(self, x, seq_len=None):
  class PhiLinearScalingRotaryEmbedding (line 119) | class PhiLinearScalingRotaryEmbedding(PhiRotaryEmbedding):
    method __init__ (line 122) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 126) | def _set_cos_sin_cache(self, seq_len, device, dtype):
  class PhiDynamicNTKScalingRotaryEmbedding (line 139) | class PhiDynamicNTKScalingRotaryEmbedding(PhiRotaryEmbedding):
    method __init__ (line 142) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 146) | def _set_cos_sin_cache(self, seq_len, device, dtype):
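
  Example: the DynamicNTKScaling variants (here and in the Llama/MiniCPM files)
  enlarge the RoPE base once the sequence outgrows max_position_embeddings, so
  the rotary frequencies stretch instead of wrapping. The standard base formula:

    import torch

    def dynamic_ntk_inv_freq(base, dim, seq_len, max_pos, scaling_factor):
        if seq_len > max_pos:
            base = base * (
                (scaling_factor * seq_len / max_pos) - (scaling_factor - 1)
            ) ** (dim / (dim - 2))
        return 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))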
  function rotate_half (line 166) | def rotate_half(x):
  function apply_rotary_pos_emb (line 174) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
  class PhiMLP (line 203) | class PhiMLP(nn.Module):
    method __init__ (line 204) | def __init__(self, config):
    method forward (line 211) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  function repeat_kv (line 219) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class PhiAttention (line 231) | class PhiAttention(nn.Module):
    method __init__ (line 234) | def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None):
    method _init_rope (line 278) | def _init_rope(self):
    method forward (line 308) | def forward(
  class PhiFlashAttention2 (line 406) | class PhiFlashAttention2(PhiAttention):
    method __init__ (line 414) | def __init__(self, *args, **kwargs):
    method forward (line 422) | def forward(
    method _flash_attention_forward (line 524) | def _flash_attention_forward(
    method _upad_input (line 584) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
  class PhiDecoderLayer (line 629) | class PhiDecoderLayer(nn.Module):
    method __init__ (line 630) | def __init__(self, config: PhiConfig, layer_idx: int):
    method forward (line 637) | def forward(
  class PhiPreTrainedModel (line 713) | class PhiPreTrainedModel(PreTrainedModel):
    method _init_weights (line 722) | def _init_weights(self, module):
  class PhiModel (line 808) | class PhiModel(PhiPreTrainedModel):
    method __init__ (line 816) | def __init__(self, config: PhiConfig):
    method get_input_embeddings (line 833) | def get_input_embeddings(self):
    method set_input_embeddings (line 836) | def set_input_embeddings(self, value):
    method forward (line 840) | def forward(
  class PhiForCausalLM (line 964) | class PhiForCausalLM(PhiPreTrainedModel):
    method __init__ (line 968) | def __init__(self, config):
    method get_input_embeddings (line 978) | def get_input_embeddings(self):
    method set_input_embeddings (line 982) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 986) | def get_output_embeddings(self):
    method set_output_embeddings (line 990) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 994) | def set_decoder(self, decoder):
    method get_decoder (line 998) | def get_decoder(self):
    method forward (line 1003) | def forward(
    method prepare_inputs_for_generation (line 1091) | def prepare_inputs_for_generation(
    method _reorder_cache (line 1149) | def _reorder_cache(past_key_values, beam_idx):
  class PhiForSequenceClassification (line 1174) | class PhiForSequenceClassification(PhiPreTrainedModel):
    method __init__ (line 1175) | def __init__(self, config):
    method get_input_embeddings (line 1184) | def get_input_embeddings(self):
    method set_input_embeddings (line 1187) | def set_input_embeddings(self, value):
    method forward (line 1191) | def forward(
  class PhiForTokenClassification (line 1290) | class PhiForTokenClassification(PhiPreTrainedModel):
    method __init__ (line 1291) | def __init__(self, config: PhiConfig):
    method forward (line 1314) | def forward(

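The `rotate_half` / `apply_rotary_pos_emb` pair above is the standard rotary-position-embedding helper. A minimal sketch matching the usual HuggingFace implementation of this generation (indexing cached cos/sin tables by `position_ids`):

```python
import torch

def rotate_half(x):
    # Rotate half the hidden dims: (x1, x2) -> (-x2, x1).
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    # Gather per-position cos/sin and rotate queries and keys in place of
    # additive position embeddings.
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```
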
FILE: llavamod/model/language_model/qwen/configuration_qwen.py
  class QWenConfig (line 9) | class QWenConfig(PretrainedConfig):
    method __init__ (line 13) | def __init__(

FILE: llavamod/model/language_model/qwen/cpp_kernels.py
  function _get_cuda_bare_metal_version (line 6) | def _get_cuda_bare_metal_version(cuda_dir):
  function _create_build_dir (line 17) | def _create_build_dir(buildpath):
  function _cpp_extention_load_helper (line 39) | def _cpp_extention_load_helper(name, sources, extra_cuda_flags):

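`cpp_kernels.py` builds Qwen's optional CUDA extensions at import time. A sketch of the version probe, assuming it follows the Megatron-LM helper this code is commonly adapted from:

```python
import os
import subprocess

def _get_cuda_bare_metal_version(cuda_dir):
    # Parse `nvcc -V` output, e.g. "... release 11.8, V11.8.89".
    raw_output = subprocess.check_output(
        [os.path.join(cuda_dir, "bin", "nvcc"), "-V"], universal_newlines=True
    )
    output = raw_output.split()
    release = output[output.index("release") + 1].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]
    return raw_output, bare_metal_major, bare_metal_minor
```
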
FILE: llavamod/model/language_model/qwen/modeling_qwen.py
  function _import_flash_attn (line 83) | def _import_flash_attn():
  function quantize_cache_v (line 123) | def quantize_cache_v(fdata, bits, qmax, qmin):
  function dequantize_cache_torch (line 145) | def dequantize_cache_torch(qdata, scale, zero):
  class FlashSelfAttention (line 149) | class FlashSelfAttention(torch.nn.Module):
    method __init__ (line 150) | def __init__(
    method unpad_input (line 167) | def unpad_input(self, hidden_states, attention_mask):
    method pad_input (line 176) | def pad_input(self, hidden_states, indices, batch, seqlen):
    method forward (line 182) | def forward(self, q, k, v, attention_mask=None):
  class QWenAttention (line 247) | class QWenAttention(nn.Module):
    method __init__ (line 248) | def __init__(self, config):
    method _attn (line 323) | def _attn(self, query, key, value, causal_mask=None, attention_mask=No...
    method _split_heads (line 394) | def _split_heads(self, tensor, num_heads, attn_head_size):
    method _merge_heads (line 399) | def _merge_heads(self, tensor, num_heads, attn_head_size):
    method forward (line 404) | def forward(
  class QWenMLP (line 558) | class QWenMLP(nn.Module):
    method __init__ (line 559) | def __init__(self, config):
    method forward (line 570) | def forward(self, hidden_states):
  class QWenBlock (line 578) | class QWenBlock(nn.Module):
    method __init__ (line 579) | def __init__(self, config):
    method forward (line 596) | def forward(
  class QWenPreTrainedModel (line 640) | class QWenPreTrainedModel(PreTrainedModel):
    method __init__ (line 648) | def __init__(self, *inputs, **kwargs):
    method _init_weights (line 651) | def _init_weights(self, module):
    method _set_gradient_checkpointing (line 674) | def _set_gradient_checkpointing(self, module, value=False):
  class QWenModel (line 679) | class QWenModel(QWenPreTrainedModel):
    method __init__ (line 682) | def __init__(self, config):
    method get_input_embeddings (line 729) | def get_input_embeddings(self):
    method set_input_embeddings (line 732) | def set_input_embeddings(self, new_embeddings):
    method get_ntk_alpha (line 735) | def get_ntk_alpha(self, true_seq_len):
    method forward (line 741) | def forward(
  class QWenLMHeadModel (line 929) | class QWenLMHeadModel(QWenPreTrainedModel):
    method __init__ (line 933) | def __init__(self, config):
    method get_output_embeddings (line 990) | def get_output_embeddings(self):
    method set_output_embeddings (line 993) | def set_output_embeddings(self, new_embeddings):
    method prepare_inputs_for_generation (line 996) | def prepare_inputs_for_generation(
    method forward (line 1022) | def forward(
    method _reorder_cache (line 1093) | def _reorder_cache(
    method chat (line 1105) | def chat(
    method chat_stream (line 1171) | def chat_stream(
    method generate (line 1234) | def generate(
  class RotaryEmbedding (line 1280) | class RotaryEmbedding(torch.nn.Module):
    method __init__ (line 1281) | def __init__(self, dim, base=10000):
    method update_rotary_pos_emb_cache (line 1295) | def update_rotary_pos_emb_cache(self, seqlen, ntk_alpha=1.0):
    method forward (line 1318) | def forward(self, max_seq_len, ntk_alpha=1.0):
  function _rotate_half (line 1324) | def _rotate_half(x):
  function apply_rotary_pos_emb (line 1332) | def apply_rotary_pos_emb(t, freqs):
  class RMSNorm (line 1357) | class RMSNorm(torch.nn.Module):
    method __init__ (line 1358) | def __init__(self, dim: int, eps: float = 1e-6):
    method _norm (line 1363) | def _norm(self, x):
    method forward (line 1366) | def forward(self, x):

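The `RMSNorm` at the bottom of this file is the Llama-style normalization Qwen uses in place of LayerNorm: no mean subtraction, just a learned gain over the RMS-normalized activations. A minimal sketch of the standard implementation:

```python
import torch
from torch import nn

class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # Scale by the reciprocal root-mean-square over the last dimension.
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        # Normalize in fp32 for stability, cast back, then apply the gain.
        output = self._norm(x.float()).type_as(x)
        return output * self.weight
```
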
FILE: llavamod/model/language_model/qwen/qwen_generation_utils.py
  function pad_batch (line 25) | def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> B...
  function get_ltor_masks_and_position_ids (line 33) | def get_ltor_masks_and_position_ids(
  function get_batch (line 94) | def get_batch(context_tokens: torch.LongTensor, eod_id: int):
  function get_stop_words_ids (line 109) | def get_stop_words_ids(chat_format, tokenizer):
  function make_context (line 119) | def make_context(
  function _decode_default (line 192) | def _decode_default(
  function _decode_chatml (line 225) | def _decode_chatml(
  function decode_tokens (line 261) | def decode_tokens(
  class StopWordsLogitsProcessor (line 301) | class StopWordsLogitsProcessor(LogitsProcessor):
    method __init__ (line 314) | def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_...
    method __call__ (line 348) | def __call__(
    method _tokens_match (line 357) | def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[in...
    method _calc_stopped_samples (line 370) | def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iter...
  function top_k_logits (line 384) | def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
  function switch (line 414) | def switch(val1, val2, boolean):

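`top_k_logits` implements the classic combined top-k / nucleus (top-p) filter used during sampling. A sketch of the standard algorithm (details such as in-place vs. out-of-place masking may differ from the repo's exact code):

```python
import torch
import torch.nn.functional as F

def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
    if top_k > 0:
        # Mask everything below the k-th largest logit.
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits = logits.masked_fill(indices_to_remove, filter_value)
    if top_p > 0.0:
        # Nucleus filtering: keep the smallest prefix whose mass exceeds top_p.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift right so the first token that crosses the threshold is kept.
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices_to_remove.scatter(
            -1, sorted_indices, sorted_indices_to_remove
        )
        logits = logits.masked_fill(indices_to_remove, filter_value)
    return logits
```
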
FILE: llavamod/model/language_model/qwen/tokenization_qwen.py
  function _load_tiktoken_bpe (line 48) | def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
  class QWenTokenizer (line 57) | class QWenTokenizer(PreTrainedTokenizer):
    method __init__ (line 62) | def __init__(
    method __getstate__ (line 116) | def __getstate__(self):
    method __setstate__ (line 122) | def __setstate__(self, state):
    method __len__ (line 133) | def __len__(self) -> int:
    method get_vocab (line 136) | def get_vocab(self) -> Dict[bytes, int]:
    method convert_tokens_to_ids (line 139) | def convert_tokens_to_ids(
    method _add_tokens (line 155) | def _add_tokens(
    method save_vocabulary (line 168) | def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
    method tokenize (line 181) | def tokenize(
    method convert_tokens_to_string (line 214) | def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) ->...
    method vocab_size (line 235) | def vocab_size(self):
    method _convert_id_to_token (line 238) | def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
    method _convert_token_to_id (line 244) | def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
    method _tokenize (line 252) | def _tokenize(self, text: str, **kwargs):
    method _decode (line 260) | def _decode(

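`_load_tiktoken_bpe` reads Qwen's tiktoken vocabulary, where each line is a base64-encoded token followed by its merge rank. A minimal sketch of that loader:

```python
import base64
from typing import Dict

def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    # Each non-empty line is "<base64-token> <rank>".
    with open(tiktoken_bpe_file, "rb") as f:
        contents = f.read()
    return {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }
```
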
FILE: llavamod/model/language_model/qwen1_5/configuration_qwen2.py
  class Qwen2Config (line 28) | class Qwen2Config(PretrainedConfig):
    method __init__ (line 99) | def __init__(

FILE: llavamod/model/language_model/qwen1_5/modeling_qwen2.py
  function rank0_print (line 59) | def rank0_print(*args, **kwargs):
  function _get_unpad_data (line 83) | def _get_unpad_data(attention_mask):
  class Qwen2RMSNorm (line 96) | class Qwen2RMSNorm(nn.Module):
    method __init__ (line 97) | def __init__(self, hidden_size, eps=1e-6):
    method forward (line 105) | def forward(self, hidden_states):
  class Qwen2RotaryEmbedding (line 114) | class Qwen2RotaryEmbedding(nn.Module):
    method __init__ (line 115) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 129) | def _set_cos_sin_cache(self, seq_len, device, dtype):
    method forward (line 139) | def forward(self, x, seq_len=None):
  function rotate_half (line 151) | def rotate_half(x):
  function apply_rotary_pos_emb (line 159) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
  class Qwen2MLP (line 188) | class Qwen2MLP(nn.Module):
    method __init__ (line 189) | def __init__(self, config):
    method forward (line 199) | def forward(self, x):
  function repeat_kv (line 204) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class Qwen2Attention (line 216) | class Qwen2Attention(nn.Module):
    method __init__ (line 222) | def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
    method forward (line 259) | def forward(
  class Qwen2FlashAttention2 (line 341) | class Qwen2FlashAttention2(Qwen2Attention):
    method __init__ (line 351) | def __init__(self, *args, **kwargs):
    method forward (line 359) | def forward(
    method _flash_attention_forward (line 495) | def _flash_attention_forward(
    method _upad_input (line 600) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
  class Qwen2SdpaAttention (line 644) | class Qwen2SdpaAttention(Qwen2Attention):
    method forward (line 652) | def forward(
  class Qwen2DecoderLayer (line 738) | class Qwen2DecoderLayer(nn.Module):
    method __init__ (line 739) | def __init__(self, config: Qwen2Config, layer_idx: int):
    method forward (line 754) | def forward(
  class Qwen2PreTrainedModel (line 836) | class Qwen2PreTrainedModel(PreTrainedModel):
    method _init_weights (line 846) | def _init_weights(self, module):
  class Qwen2Model (line 932) | class Qwen2Model(Qwen2PreTrainedModel):
    method __init__ (line 940) | def __init__(self, config: Qwen2Config):
    method get_input_embeddings (line 956) | def get_input_embeddings(self):
    method set_input_embeddings (line 959) | def set_input_embeddings(self, value):
    method forward (line 963) | def forward(
  class Qwen2ForCausalLM (line 1110) | class Qwen2ForCausalLM(Qwen2PreTrainedModel):
    method __init__ (line 1113) | def __init__(self, config):
    method get_input_embeddings (line 1122) | def get_input_embeddings(self):
    method set_input_embeddings (line 1125) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 1128) | def get_output_embeddings(self):
    method set_output_embeddings (line 1131) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 1134) | def set_decoder(self, decoder):
    method get_decoder (line 1137) | def get_decoder(self):
    method forward (line 1142) | def forward(
    method prepare_inputs_for_generation (line 1219) | def prepare_inputs_for_generation(
    method _reorder_cache (line 1277) | def _reorder_cache(past_key_values, beam_idx):
  class Qwen2ForSequenceClassification (line 1301) | class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
    method __init__ (line 1302) | def __init__(self, config):
    method get_input_embeddings (line 1311) | def get_input_embeddings(self):
    method set_input_embeddings (line 1314) | def set_input_embeddings(self, value):
    method forward (line 1318) | def forward(

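`repeat_kv` is the grouped-query-attention helper: key/value heads are shared across query heads and must be expanded before the attention product. A sketch matching the standard HuggingFace implementation:

```python
import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Expand (batch, num_kv_heads, seqlen, head_dim) to
    # (batch, num_kv_heads * n_rep, seqlen, head_dim) without copying
    # until the final reshape.
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
```
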
FILE: llavamod/model/language_model/qwen2/configuration_qwen2.py
  class Qwen2Config (line 28) | class Qwen2Config(PretrainedConfig):
    method __init__ (line 99) | def __init__(

FILE: llavamod/model/language_model/qwen2/modeling_qwen2.py
  function _get_unpad_data (line 70) | def _get_unpad_data(attention_mask):
  class Qwen2RMSNorm (line 83) | class Qwen2RMSNorm(nn.Module):
    method __init__ (line 84) | def __init__(self, hidden_size, eps=1e-6):
    method forward (line 92) | def forward(self, hidden_states):
  class Qwen2RotaryEmbedding (line 101) | class Qwen2RotaryEmbedding(nn.Module):
    method __init__ (line 102) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 116) | def _set_cos_sin_cache(self, seq_len, device, dtype):
    method forward (line 126) | def forward(self, x, seq_len=None):
  function rotate_half (line 138) | def rotate_half(x):
  function apply_rotary_pos_emb (line 146) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
  class Qwen2MLP (line 175) | class Qwen2MLP(nn.Module):
    method __init__ (line 176) | def __init__(self, config):
    method forward (line 186) | def forward(self, x):
  function repeat_kv (line 191) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class Qwen2Attention (line 203) | class Qwen2Attention(nn.Module):
    method __init__ (line 209) | def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
    method forward (line 246) | def forward(
  class Qwen2FlashAttention2 (line 328) | class Qwen2FlashAttention2(Qwen2Attention):
    method __init__ (line 338) | def __init__(self, *args, **kwargs):
    method forward (line 346) | def forward(
    method _flash_attention_forward (line 482) | def _flash_attention_forward(
    method _upad_input (line 587) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
  class Qwen2SdpaAttention (line 631) | class Qwen2SdpaAttention(Qwen2Attention):
    method forward (line 639) | def forward(
  class Qwen2DecoderLayer (line 725) | class Qwen2DecoderLayer(nn.Module):
    method __init__ (line 726) | def __init__(self, config: Qwen2Config, layer_idx: int):
    method forward (line 741) | def forward(
  class Qwen2PreTrainedModel (line 823) | class Qwen2PreTrainedModel(PreTrainedModel):
    method _init_weights (line 833) | def _init_weights(self, module):
  class Qwen2Model (line 919) | class Qwen2Model(Qwen2PreTrainedModel):
    method __init__ (line 927) | def __init__(self, config: Qwen2Config):
    method get_input_embeddings (line 943) | def get_input_embeddings(self):
    method set_input_embeddings (line 946) | def set_input_embeddings(self, value):
    method forward (line 950) | def forward(
  class Qwen2ForCausalLM (line 1097) | class Qwen2ForCausalLM(Qwen2PreTrainedModel):
    method __init__ (line 1100) | def __init__(self, config):
    method get_input_embeddings (line 1109) | def get_input_embeddings(self):
    method set_input_embeddings (line 1112) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 1115) | def get_output_embeddings(self):
    method set_output_embeddings (line 1118) | def set_output_embeddings(self, new_embeddings):
    method set_decoder (line 1121) | def set_decoder(self, decoder):
    method get_decoder (line 1124) | def get_decoder(self):
    method forward (line 1129) | def forward(
    method prepare_inputs_for_generation (line 1192) | def prepare_inputs_for_generation(
    method _reorder_cache (line 1250) | def _reorder_cache(past_key_values, beam_idx):
  class Qwen2ForSequenceClassification (line 1274) | class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
    method __init__ (line 1275) | def __init__(self, config):
    method get_input_embeddings (line 1284) | def get_input_embeddings(self):
    method set_input_embeddings (line 1287) | def set_input_embeddings(self, value):
    method forward (line 1291) | def forward(

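This `qwen2/modeling_qwen2.py` largely duplicates the `qwen1_5` variant above (minus the `rank0_print` debug hook). The rotary cache it builds in `_set_cos_sin_cache` can be sketched as follows, assuming the usual HF pattern (`RotaryCacheSketch` is a hypothetical minimal module, not the repo's class):

```python
import torch
from torch import nn

class RotaryCacheSketch(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        super().__init__()
        # Inverse frequencies over even dimensions, as in RoPE.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._set_cos_sin_cache(max_position_embeddings, inv_freq.device, torch.float32)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        # Precompute cos/sin tables up to seq_len; re-run lazily if exceeded.
        self.max_seq_len_cached = seq_len
        t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.outer(t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)  # duplicated to cover head_dim
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
```
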
FILE: llavamod/model/language_model/stablelm/configuration_stablelm_epoch.py
  class StableLMEpochConfig (line 22) | class StableLMEpochConfig(PretrainedConfig):
    method __init__ (line 72) | def __init__(

FILE: llavamod/model/language_model/stablelm/modeling_stablelm_epoch.py
  function _get_unpad_data (line 52) | def _get_unpad_data(attention_mask):
  function _make_causal_mask (line 65) | def _make_causal_mask(
  function _expand_mask (line 83) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
  class RotaryEmbedding (line 96) | class RotaryEmbedding(nn.Module):
    method __init__ (line 97) | def __init__(
    method _set_cos_sin_cache (line 117) | def _set_cos_sin_cache(self, seq_len: int, device: torch.device, dtype...
    method forward (line 129) | def forward(self, x: torch.Tensor, seq_len: Optional[int] = None):
  function rotate_half (line 139) | def rotate_half(x: torch.Tensor):
  function apply_rotary_pos_emb (line 145) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
  class MLP (line 156) | class MLP(nn.Module):
    method __init__ (line 157) | def __init__(self, config: StableLMEpochConfig):
    method forward (line 167) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  function repeat_kv (line 171) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class Attention (line 183) | class Attention(nn.Module):
    method __init__ (line 184) | def __init__(self, config: StableLMEpochConfig):
    method _init_rope (line 208) | def _init_rope(self):
    method forward (line 216) | def forward(
  class FlashAttention2 (line 299) | class FlashAttention2(Attention):
    method __init__ (line 304) | def __init__(self, *args, **kwargs):
    method forward (line 312) | def forward(
    method _flash_attention_forward (line 387) | def _flash_attention_forward(
    method _upad_input (line 446) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
  class DecoderLayer (line 491) | class DecoderLayer(nn.Module):
    method __init__ (line 492) | def __init__(self, config: StableLMEpochConfig):
    method forward (line 499) | def forward(
  class StableLMEpochPreTrainedModel (line 540) | class StableLMEpochPreTrainedModel(PreTrainedModel):
    method _init_weights (line 552) | def _init_weights(self, module: nn.Module):
    method _set_gradient_checkpointing (line 566) | def _set_gradient_checkpointing(self, module: nn.Module, value=False):
  class StableLMEpochModel (line 571) | class StableLMEpochModel(StableLMEpochPreTrainedModel):
    method __init__ (line 572) | def __init__(self, config: StableLMEpochConfig):
    method get_input_embeddings (line 583) | def get_input_embeddings(self):
    method set_input_embeddings (line 586) | def set_input_embeddings(self, value: nn.Module):
    method _prepare_decoder_attention_mask (line 590) | def _prepare_decoder_attention_mask(
    method forward (line 617) | def forward(
  class StableLMEpochForCausalLM (line 760) | class StableLMEpochForCausalLM(StableLMEpochPreTrainedModel):
    method __init__ (line 763) | def __init__(self, config: StableLMEpochConfig):
    method get_input_embeddings (line 772) | def get_input_embeddings(self):
    method set_input_embeddings (line 775) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 778) | def get_output_embeddings(self):
    method set_output_embeddings (line 781) | def set_output_embeddings(self, new_embeddings: nn.Module):
    method get_decoder (line 784) | def get_decoder(self):
    method set_decoder (line 787) | def set_decoder(self, decoder):
    method forward (line 790) | def forward(
    method prepare_inputs_for_generation (line 858) | def prepare_inputs_for_generation(
    method _reorder_cache (line 904) | def _reorder_cache(past_key_values, beam_idx):

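`_make_causal_mask` builds the additive attention mask used by the eager path in this StableLM port. A sketch of the standard implementation: `-inf` above the diagonal, zero elsewhere, with past key/value positions always visible:

```python
import torch

def _make_causal_mask(input_ids_shape, dtype, device, past_key_values_length=0):
    # Returns a (bsz, 1, tgt_len, tgt_len + past) additive mask.
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)
    if past_key_values_length > 0:
        mask = torch.cat(
            [torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask],
            dim=-1,
        )
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
```
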
FILE: llavamod/model/language_model/stablelm/tokenization_arcade100k.py
  function _load_tiktoken_bpe (line 23) | def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
  function _arcade100k (line 79) | def _arcade100k(vocab_file: str):
  class Arcade100kTokenizer (line 90) | class Arcade100kTokenizer(PreTrainedTokenizer):
    method __init__ (line 107) | def __init__(
    method __len__ (line 133) | def __len__(self):
    method vocab_size (line 137) | def vocab_size(self):
    method get_vocab (line 140) | def get_vocab(self) -> Dict[bytes, int]:
    method convert_tokens_to_ids (line 143) | def convert_tokens_to_ids(
    method _add_tokens (line 159) | def _add_tokens(
    method save_vocabulary (line 172) | def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
    method tokenize (line 186) | def tokenize(
    method convert_tokens_to_string (line 222) | def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) ->...
    method _convert_id_to_token (line 242) | def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
    method _convert_token_to_id (line 248) | def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
    method _tokenize (line 256) | def _tokenize(self, text: str, **kwargs):
    method _decode (line 265) | def _decode(

FILE: llavamod/model/llava_arch.py
  class LlavaMetaModel (line 27) | class LlavaMetaModel:
    method __init__ (line 29) | def __init__(self, config):
    method get_image_tower (line 38) | def get_image_tower(self):
    method get_video_tower (line 44) | def get_video_tower(self):
    method initialize_vision_modules (line 50) | def initialize_vision_modules(self, model_args, fsdp=None):
  class LlavaMetaForCausalLM (line 131) | class LlavaMetaForCausalLM(ABC):
    method get_model (line 134) | def get_model(self):
    method get_image_tower (line 137) | def get_image_tower(self):
    method get_video_tower (line 140) | def get_video_tower(self):
    method encode_images (line 143) | def encode_images(self, images):
    method encode_videos (line 150) | def encode_videos(self, videos):  # [mini_b, c, t, h, w]
    method prepare_inputs_labels_for_multimodal (line 155) | def prepare_inputs_labels_for_multimodal(
    method initialize_vision_tokenizer (line 336) | def initialize_vision_tokenizer(self, model_args, tokenizer):
  class LlavaQWenMetaForCausalLM (line 382) | class LlavaQWenMetaForCausalLM(LlavaMetaForCausalLM):
    method prepare_inputs_labels_for_multimodal (line 384) | def prepare_inputs_labels_for_multimodal(

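`llava_arch.py` is the multimodal glue: `encode_images` runs the vision tower and projects its features into the LLM embedding space before `prepare_inputs_labels_for_multimodal` splices them into the token sequence. A hypothetical free-function sketch of that flow (the real method lives on `LlavaMetaForCausalLM` and pulls the tower/projector from `self.get_model()`):

```python
def encode_images_sketch(image_tower, projector, images):
    # Vision-tower features -> multimodal projector -> LLM embedding space.
    image_features = image_tower(images)
    return projector(image_features)
```
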
FILE: llavamod/model/make_delta.py
  function make_delta (line 13) | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_...

FILE: llavamod/model/modeling_flash_attention_utils.py
  function _get_unpad_data (line 34) | def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor,...
  function _upad_input (line 61) | def _upad_input(
  function prepare_fa2_from_position_ids (line 134) | def prepare_fa2_from_position_ids(query, key, value, position_ids):
  function _flash_attention_forward (line 184) | def _flash_attention_forward(

FILE: llavamod/model/multimodal_encoder/builder.py
  function build_image_tower (line 15) | def build_image_tower(image_tower_cfg, **kwargs):
  function build_video_tower (line 39) | def build_video_tower(video_tower_cfg, **kwargs):

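The encoder builder dispatches on the tower name from the config. A hypothetical sketch of the dispatch (the real builder also knows the S2 variant in `clips2_encoder.py` and a `delay_load` path; config attribute names here follow the upstream LLaVA convention and are assumptions):

```python
from llavamod.model.multimodal_encoder.clip_encoder import CLIPVisionTower
from llavamod.model.multimodal_encoder.siglip_encoder import SiglipVisionTower

def build_image_tower_sketch(image_tower_cfg, **kwargs):
    image_tower = getattr(image_tower_cfg, "mm_image_tower",
                          getattr(image_tower_cfg, "image_tower", None))
    name = image_tower.lower()
    if "siglip" in name:
        return SiglipVisionTower(image_tower, args=image_tower_cfg, **kwargs)
    if "clip" in name:
        return CLIPVisionTower(image_tower, args=image_tower_cfg, **kwargs)
    raise ValueError(f"Unknown image tower: {image_tower}")
```
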
FILE: llavamod/model/multimodal_encoder/clip_encoder.py
  class CLIPVisionTower (line 7) | class CLIPVisionTower(nn.Module):
    method __init__ (line 8) | def __init__(self, image_tower, args, delay_load=False, cache_dir='./c...
    method load_model (line 24) | def load_model(self):
    method feature_select (line 35) | def feature_select(self, image_forward_outs):
    method forward (line 46) | def forward(self, images):
    method dummy_feature (line 60) | def dummy_feature(self):
    method dtype (line 64) | def dtype(self):
    method device (line 68) | def device(self):
    method config (line 72) | def config(self):
    method hidden_size (line 79) | def hidden_size(self):
    method num_patches (line 83) | def num_patches(self):

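`feature_select` picks which vision-tower layer and which tokens feed the projector. A standalone sketch of the standard LLaVA behavior (option names `patch` / `cls_patch` follow upstream LLaVA and are assumptions here):

```python
def feature_select_sketch(image_forward_outs, select_layer=-2, select_feature="patch"):
    # Take hidden states from one tower layer (often the penultimate),
    # then drop the CLS token ('patch') or keep it ('cls_patch').
    image_features = image_forward_outs.hidden_states[select_layer]
    if select_feature == "patch":
        image_features = image_features[:, 1:]   # drop CLS
    elif select_feature == "cls_patch":
        pass                                      # keep CLS + patches
    else:
        raise ValueError(f"Unexpected select feature: {select_feature}")
    return image_features
```
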
FILE: llavamod/model/multimodal_encoder/clips2_encoder.py
  class CLIPVisionTowerS2 (line 8) | class CLIPVisionTowerS2(CLIPVisionTower):
    method __init__ (line 9) | def __init__(self, image_tower, args, delay_load=False):
    method load_model (line 27) | def load_model(self):
    method forward_feature (line 42) | def forward_feature(self, images):
    method forward (line 49) | def forward(self, images):
    method hidden_size (line 65) | def hidden_size(self):

FILE: llavamod/model/multimodal_encoder/siglip_encoder.py
  class SiglipVisionTower (line 8) | class SiglipVisionTower(nn.Module):
    method __init__ (line 9) | def __init__(self, image_tower, args, delay_load=False, cache_dir='./c...
    method load_model (line 25) | def load_model(self):
    method feature_select (line 32) | def feature_select(self, image_forward_outs):
    method forward (line 43) | def forward(self, images):
    method dummy_feature (line 57) | def dummy_feature(self):
    method dtype (line 61) | def dtype(self):
    method device (line 65) | def device(self):
    method config (line 69) | def config(self):
    method hidden_size (line 76) | def hidden_size(self):
    method num_patches (line 80) | def num_patches(self):

FILE: llavamod/model/multimodal_projector/builder.py
  class IdentityMap (line 14) | class IdentityMap(nn.Module):
    method __init__ (line 15) | def __init__(self):
    method forward (line 18) | def forward(self, x, *args, **kwargs):
    method config (line 22) | def config(self):
  function build_image_projector (line 26) | def build_image_projector(config, delay_load=False, **kwargs):
  function build_video_projector (line 69) | def build_video_projector(config, delay_load=False, **kwargs):
  class MLP (line 112) | class MLP(nn.Module):
    method __init__ (line 113) | def __init__(self, mm_hidden_size, hidden_size):
    method forward (line 121) | def forward(self, x):
  class build_projector (line 125) | class build_projector(nn.Module):
    method __init__ (line 126) | def __init__(self, config, delay_load=False, **kwargs):
    method forward_image (line 148) | def forward_image(self, image_feature):
    method forward_video (line 151) | def forward_video(self, video_feature):

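The `MLP` in this builder is the projector that maps vision features (`mm_hidden_size`) into the LLM hidden size. A minimal sketch of the usual two-layer GELU form (layer names are illustrative, not necessarily the repo's):

```python
from torch import nn

class MLPSketch(nn.Module):
    def __init__(self, mm_hidden_size, hidden_size):
        super().__init__()
        self.linear_1 = nn.Linear(mm_hidden_size, hidden_size)
        self.act = nn.GELU()
        self.linear_2 = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        # Vision features -> LLM embedding space.
        return self.linear_2(self.act(self.linear_1(x)))
```
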
FILE: llavamod/model/multimodal_projector/pool_block.py
  class Pool_Block (line 8) | class Pool_Block(nn.Module):
    method __init__ (line 9) | def __init__(self, projector_type, config):
    method forward (line 20) | def forward(self, x):

FILE: llavamod/model/multimodal_projector/qformer.py
  class Blip2Model (line 9) | class Blip2Model(Blip2PreTrainedModel):
    method __init__ (line 10) | def __init__(self, config: Blip2Config):
    method forward (line 26) | def forward(
  function qformer_config_template (line 62) | def qformer_config_template(config, projector_type):
  class Cheap_Blip2Model (line 159) | class Cheap_Blip2Model(Blip2PreTrainedModel):
    method __init__ (line 160) | def __init__(self, config: Blip2Config):
    method forward (line 176) | def forward(
  function cheap_qformer_config_template (line 208) | def cheap_qformer_config_template(config, projector_type):

FILE: llavamod/model/multimodal_projector/simple_block.py
  class SimpleResBlock (line 9) | class SimpleResBlock(nn.Module):
    method __init__ (line 10) | def __init__(self, channels):
    method forward (line 19) | def forward(self, x):
  class BaseConv2D (line 23) | class BaseConv2D(nn.Module):
    method __init__ (line 24) | def __init__(self, channels, groups=1, eps=1e-6):
    method forward (line 31) | def forward(self, x):
  class SimpleBlock (line 39) | class SimpleBlock(nn.Module):
    method __init__ (line 40) | def __init__(self, in_channels, out_channels, num_in_block, num_out_bl...
    method forward (line 60) | def forward(self, x):
  class Cheap_SimpleBlock (line 81) | class Cheap_SimpleBlock(nn.Module):
    method __init__ (line 82) | def __init__(self, in_channels, out_channels, num_in_block, num_out_bl...
    method forward (line 102) | def forward(self, x):

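`SimpleResBlock` is the residual projector block inherited from upstream LLaVA: pre-LayerNorm followed by a two-layer GELU MLP with a skip connection. A sketch assuming it matches that upstream form:

```python
from torch import nn

class SimpleResBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)
        self.proj = nn.Sequential(
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels),
        )

    def forward(self, x):
        # Pre-norm residual: x + MLP(LN(x)).
        x = self.pre_norm(x)
        return x + self.proj(x)
```
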
FILE: llavamod/model/utils.py
  function auto_upgrade (line 7) | def auto_upgrade(config):
  function create_reference_model (line 34) | def create_reference_model(model, num_shared_layers: int = None, pattern...
  function disable_dropout_in_model (line 109) | def disable_dropout_in_model(model: torch.nn.Module) -> None:
  class CausalLMOutputWithPast (line 121) | class CausalLMOutputWithPast(ModelOutput):

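`disable_dropout_in_model` follows the TRL convention of zeroing dropout so the frozen reference model produces deterministic log-probs during preference training. The whole helper is a short sweep:

```python
import torch

def disable_dropout_in_model(model: torch.nn.Module) -> None:
    # Zero every Dropout probability in the module tree.
    for module in model.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = 0
```
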
FILE: llavamod/serve/cli.py
  function load_image (line 17) | def load_image(image_file):
  function main (line 26) | def main(args):

FILE: llavamod/serve/utils.py
  function load_image (line 7) | def load_image(image_file):

FILE: llavamod/train/align_train.py
  function rank0_print (line 15) | def rank0_print(*args):
  function create_model_tokenizer (line 20) | def create_model_tokenizer(
  function train (line 515) | def train():

FILE: llavamod/train/align_trainer.py
  function rank0_print (line 43) | def rank0_print(*args):
  function maybe_zero_3 (line 48) | def maybe_zero_3(param, ignore_status=False, name=None):
  function get_mm_adapter_state_maybe_zero_3 (line 62) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
  function split_to_even_chunks (line 68) | def split_to_even_chunks(indices, lengths, num_chunks):
  function get_modality_length_grouped_indices (line 90) | def get_modality_length_grouped_indices(lengths, batch_size, world_size,...
  function get_length_grouped_indices (line 119) | def get_length_grouped_indices(lengths, batch_size, world_size, generato...
  class LengthGroupedSampler (line 130) | class LengthGroupedSampler(Sampler):
    method __init__ (line 136) | def __init__(
    method __len__ (line 153) | def __len__(self):
    method __iter__ (line 156) | def __iter__(self):
  function unwrap_model (line 166) | def unwrap_model(model: nn.Module) -> nn.Module:
  class AlignTrainer (line 180) | class AlignTrainer(Trainer):
    method __init__ (line 217) | def __init__(
    method _get_train_sampler (line 311) | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
    method create_optimizer (line 326) | def create_optimizer(self):
    method _prepare_deepspeed (line 436) | def _prepare_deepspeed(self, model):
    method get_p (line 455) | def get_p(self, model, inputs):
    method get_logp (line 479) | def get_logp(self, model, inputs):
    method compute_align_loss (line 503) | def compute_align_loss(self, policy_logprobs, reference_probs, labels):
    method compute_loss (line 530) | def compute_loss(
    method store_metrics (line 596) | def store_metrics(self, metrics: Dict[str, float], train_eval: Literal...
    method log (line 600) | def log(self, logs: Dict[str, float]) -> None:
    method _save_checkpoint (line 616) | def _save_checkpoint(self, model, trial, metrics=None):
    method _save (line 638) | def _save(self, output_dir: Optional[str] = None, state_dict=None):

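`maybe_zero_3` handles the ZeRO-3 case where a parameter is partitioned across ranks and must be gathered before it can be saved. A sketch following the well-known LLaVA helper (the warning text is paraphrased):

```python
from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

def maybe_zero_3(param, ignore_status=False, name=None):
    if hasattr(param, "ds_id"):
        # ZeRO-3 partitioned parameter: gather the shards, then copy to CPU.
        if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
            print(f"{name}: param is NOT_AVAILABLE; gathering anyway")
        with zero.GatheredParameters([param]):
            param = param.data.detach().cpu().clone()
    else:
        param = param.detach().cpu().clone()
    return param
```
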
FILE: llavamod/train/dpo_train.py
  function rank0_print (line 15) | def rank0_print(*args):
  function create_model_tokenizer (line 20) | def create_model_tokenizer(
  function train (line 471) | def train():

FILE: llavamod/train/dpo_trainer.py
  function rank0_print (line 43) | def rank0_print(*args):
  function maybe_zero_3 (line 48) | def maybe_zero_3(param, ignore_status=False, name=None):
  function get_mm_adapter_state_maybe_zero_3 (line 62) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
  function split_to_even_chunks (line 68) | def split_to_even_chunks(indices, lengths, num_chunks):
  function get_modality_length_grouped_indices (line 90) | def get_modality_length_grouped_indices(lengths, batch_size, world_size,...
  function get_length_grouped_indices (line 119) | def get_length_grouped_indices(lengths, batch_size, world_size, generato...
  class LengthGroupedSampler (line 130) | class LengthGroupedSampler(Sampler):
    method __init__ (line 136) | def __init__(
    method __len__ (line 153) | def __len__(self):
    method __iter__ (line 156) | def __iter__(self):
  function unwrap_model (line 166) | def unwrap_model(model: nn.Module) -> nn.Module:
  class DPOTrainer (line 180) | class DPOTrainer(Trainer):
    method __init__ (line 217) | def __init__(
    method _get_train_sampler (line 318) | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
    method create_optimizer (line 333) | def create_optimizer(self):
    method _prepare_deepspeed (line 443) | def _prepare_deepspeed(self, model):
    method get_logp (line 462) | def get_logp(self, model, inputs, average_log_prob: bool = False):
    method dpo_loss (line 497) | def dpo_loss(
    method compute_loss (line 564) | def compute_loss(
    method store_metrics (line 643) | def store_metrics(self, metrics: Dict[str, float], train_eval: Literal...
    method log (line 647) | def log(self, logs: Dict[str, float]) -> None:
    method _save_checkpoint (line 663) | def _save_checkpoint(self, model, trial, metrics=None):
    method _save (line 685) | def _save(self, output_dir: Optional[str] = None, state_dict=None):

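The core of `DPOTrainer.dpo_loss` is the standard sigmoid DPO objective: push the policy's chosen/rejected log-ratio above the reference model's. A sketch of that objective (`dpo_loss_sketch` is a standalone illustration; the trainer's method also handles label smoothing and other variants):

```python
import torch
import torch.nn.functional as F

def dpo_loss_sketch(policy_chosen_logps, policy_rejected_logps,
                    reference_chosen_logps, reference_rejected_logps,
                    beta=0.1):
    # L = -log sigmoid(beta * ((pi_c - pi_r) - (ref_c - ref_r)))
    pi_logratios = policy_chosen_logps - policy_rejected_logps
    ref_logratios = reference_chosen_logps - reference_rejected_logps
    logits = pi_logratios - ref_logratios
    losses = -F.logsigmoid(beta * logits)
    # Implied rewards, detached for logging only.
    chosen_rewards = beta * (policy_chosen_logps - reference_chosen_logps).detach()
    rejected_rewards = beta * (policy_rejected_logps - reference_rejected_logps).detach()
    return losses, chosen_rewards, rejected_rewards
```
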
FILE: llavamod/train/llava_trainer.py
  function maybe_zero_3 (line 20) | def maybe_zero_3(param, ignore_status=False, name=None):
  function get_mm_adapter_state_maybe_zero_3 (line 34) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
  function split_to_even_chunks (line 40) | def split_to_even_chunks(indices, lengths, num_chunks):
  function get_modality_length_grouped_indices (line 62) | def get_modality_length_grouped_indices(lengths, batch_size, world_size,...
  function get_length_grouped_indices (line 90) | def get_length_grouped_indices(lengths, batch_size, world_size, generato...
  class LengthGroupedSampler (line 101) | class LengthGroupedSampler(Sampler):
    method __init__ (line 107) | def __init__(
    method __len__ (line 124) | def __len__(self):
    method __iter__ (line 127) | def __iter__(self):
  class LLaVATrainer (line 135) | class LLaVATrainer(Trainer):
    method _get_train_sampler (line 137) | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
    method create_optimizer (line 152) | def create_optimizer(self):
    method _save_checkpoint (line 256) | def _save_checkpoint(self, model, trial, metrics=None):
    method _save (line 277) | def _save(self, output_dir: Optional[str] = None, state_dict=None):

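The length-grouped sampler machinery shared by these trainers reduces padding waste: shuffle once, carve into `world_size * batch_size` megabatches, and sort each megabatch by length. A sketch of the core index routine (the repo's version adds modality-aware grouping on top):

```python
import torch

def get_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    indices = torch.randperm(len(lengths), generator=generator)
    megabatch_size = world_size * batch_size
    megabatches = [indices[i : i + megabatch_size].tolist()
                   for i in range(0, len(lengths), megabatch_size)]
    # Sort within each megabatch so co-scheduled samples have similar lengths.
    megabatches = [sorted(mb, key=lambda i: lengths[i], reverse=True)
                   for mb in megabatches]
    return [i for mb in megabatches for i in mb]
```
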
FILE: llavamod/train/train.py
  function rank0_print (line 14) | def rank0_print(*args):
  function train (line 19) | def train():

FILE: llavamod/train/train_utils.py
  function maybe_zero_3 (line 10) | def maybe_zero_3(param, ignore_status=False, name=None):
  function get_peft_state_maybe_zero_3 (line 25) | def get_peft_state_maybe_zero_3(named_params, bias):
  function get_peft_state_non_lora_maybe_zero_3 (line 50) | def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only...
  function get_mm_adapter_state_maybe_zero_3 (line 58) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
  function find_all_linear_names (line 64) | def find_all_linear_names(model, add_keywords=None):
  function safe_save_model_for_hf_trainer (line 81) | def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
  function is_accelerate_greater_20_0 (line 127) | def is_accelerate_greater_20_0() -> bool:
  function is_peft_available (line 139) | def is_peft_available() -> bool:
  function is_npu_available (line 143) | def is_npu_available() -> bool:
  function is_xpu_available (line 154) | def is_xpu_available() -> bool:
  function is_transformers_greater_than (line 170) | def is_transformers_greater_than(current_version: str) -> bool:
  function is_wandb_available (line 182) | def is_wandb_available() -> bool:

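`find_all_linear_names` collects LoRA target modules while skipping the multimodal components, following the upstream LLaVA helper. A sketch assuming that form (the exact keyword list is an assumption):

```python
import torch

def find_all_linear_names(model, add_keywords=None):
    # Leaf names of every nn.Linear, excluding vision/projector modules,
    # suitable as peft LoraConfig.target_modules.
    lora_module_names = set()
    multimodal_keywords = ["mm_projector", "image_tower", "video_tower"]
    if add_keywords:
        multimodal_keywords += add_keywords
    for name, module in model.named_modules():
        if any(k in name for k in multimodal_keywords):
            continue
        if isinstance(module, torch.nn.Linear):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    lora_module_names.discard("lm_head")  # excluded for 16-bit training
    return sorted(lora_module_names)
```
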
FILE: llavamod/utils.py
  function order_pick_k (line 17) | def order_pick_k(lst, k):
  class HookTool (line 32) | class HookTool:
    method __init__ (line 33) | def __init__(self):
    method hook_fun (line 35) | def hook_fun(self, module, fea_in, fea_out):
  function get_gating_logit_by_hook (line 38) | def get_gating_logit_by_hook(model):
  function build_logger (line 50) | def build_logger(logger_name, logger_filename):
  class StreamToLogger (line 93) | class StreamToLogger(object):
    method __init__ (line 97) | def __init__(self, logger, log_level=logging.INFO):
    method __getattr__ (line 103) | def __getattr__(self, attr):
    method write (line 106) | def write(self, buf):
    method flush (line 120) | def flush(self):
  function disable_torch_init (line 126) | def disable_torch_init():
  function violates_moderation (line 135) | def violates_moderation(text):
  function pretty_print_semaphore (line 156) | def pretty_print_semaphore(semaphore):

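`HookTool` / `get_gating_logit_by_hook` expose MoE router logits for inspection. A hypothetical sketch of the pattern (matching gate modules by the substring "gate" in their names is an assumption about this repo's naming):

```python
class HookTool:
    # Captures a module's forward output via a registered hook.
    def __init__(self):
        self.fea = None

    def hook_fun(self, module, fea_in, fea_out):
        self.fea = fea_out

def get_gating_logit_by_hook(model):
    # Attach a HookTool to every module that looks like an MoE gate, so
    # router logits can be read off after a forward pass.
    hooks = []
    for name, module in model.named_modules():
        if "gate" in name.lower():
            tool = HookTool()
            module.register_forward_hook(tool.hook_fun)
            hooks.append(tool)
    return hooks
```
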
FILE: scripts/activated_params.py
  function num_param (line 1) | def num_param(vocab_size, hidden_size, num_hidden_layers, intermediate_s...

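`activated_params.py` estimates how many parameters a sparse MoE model actually activates per token, which is the figure LLaVA-MoD reports for its small models. A back-of-envelope sketch of the arithmetic (weights only, tied embeddings, SwiGLU-style 3-matrix MLP; for MoE layers only the routed experts' MLPs would be counted; the repo's exact formula may differ):

```python
def num_param_sketch(vocab_size, hidden_size, num_hidden_layers, intermediate_size):
    embedding = vocab_size * hidden_size
    attention = 4 * hidden_size * hidden_size   # q, k, v, o projections
    mlp = 3 * hidden_size * intermediate_size   # gate, up, down
    return embedding + num_hidden_layers * (attention + mlp)
```
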
FILE: scripts/convert_mmbench_for_submission.py
  function get_args (line 6) | def get_args():

FILE: scripts/convert_seed_for_submission.py
  function get_args (line 6) | def get_args():
  function eval_single (line 14) | def eval_single(result_file, eval_only_type=None):

FILE: scripts/convert_sqa_to_llava.py
  function convert_to_llava (line 8) | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"):
  function convert_to_jsonl (line 49) | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"):
  function main (line 83) | def main(task, **kwargs):

FILE: scripts/convert_sqa_to_llava_base_prompt.py
  function get_question_text (line 1) | def get_question_text(problem):
  function get_context_text (line 6) | def get_context_text(problem, use_caption):
  function get_choice_text (line 15) | def get_choice_text(probelm, options):
  function get_answer (line 25) | def get_answer(problem, options):
  function get_lecture_text (line 29) | def get_lecture_text(problem):
  function get_solution_text (line 35) | def get_solution_text(problem):
  function create_one_example_chatbot (line 41) | def create_one_example_chatbot(format, question, context, choice, answer...
  function create_one_example (line 106) | def create_one_example(format, question, context, choice, answer, lectur...
  function create_one_example_gpt4 (line 162) | def create_one_example_gpt4(format, question, context, choice, answer, l...
  function build_prompt_chatbot (line 221) | def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption...
  function build_prompt (line 244) | def build_prompt(problems, shot_qids, test_qid, args):
  function build_prompt_gpt4 (line 291) | def build_prompt_gpt4(problems, shot_qids, test_qid, args):

FILE: scripts/convert_vizwiz_for_submission.py
  function parse_args (line 8) | def parse_args():

FILE: scripts/convert_vqav2_for_submission.py
  function parse_args (line 8) | def parse_args():

FILE: scripts/extract_mm_projector.py
  function parse_args (line 14) | def parse_args():

FILE: scripts/merge_lora_weights.py
  function merge_lora (line 6) | def merge_lora(args):

FILE: scripts/merge_moe_lora_weights.py
  function _replace_module (line 13) | def _replace_module(parent_module, child_name, new_module, old_module):
  function _unload_and_optionally_merge (line 31) | def _unload_and_optionally_merge(model, merge=True):
  function merge_lora (line 70) | def merge_lora(args):
Condensed preview — 181 files, each showing path, character count, and a content snippet (full structured content: 10,591K chars).
[
  {
    "path": "LICENSE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README.md",
    "chars": 6950,
    "preview": "# <div align=\"center\">LLaVA-MoD: Making LLaVA Tiny via MoE Knowledge Distillation<div>\n\n<div align=\"center\">\n<!-- [![arX"
  },
  {
    "path": "docs/INFERENCE.md",
    "chars": 521,
    "preview": "## Inference\n\nWe provide commandline inference and batch inference scripts.\n\n### CLI Inference\n```Shell\ndeepspeed --incl"
  },
  {
    "path": "docs/TRAIN_EVAL.md",
    "chars": 7522,
    "preview": "## Preliminary\n### Download Pretrained Checkpoints\nWe use [clip-vit-large-patch14-336](https://huggingface.co/openai/cli"
  },
  {
    "path": "llavamod/__init__.py",
    "chars": 808,
    "preview": "from .model import LlavaLlamaForCausalLM\nfrom .model import LLaVAMoDLlamaForCausalLM\nfrom .model import LlavaQWenForCaus"
  },
  {
    "path": "llavamod/config/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "llavamod/config/args.py",
    "chars": 5315,
    "preview": "from typing import Optional, List\nfrom dataclasses import field\n\nfrom llavamod.data.dataset import *\nfrom llavamod.train"
  },
  {
    "path": "llavamod/config/dpconfig/dpo_zero2.json",
    "chars": 1135,
    "preview": "{\n    \"fp16\": {\n        \"enabled\": \"auto\",\n        \"loss_scale\": 0,\n        \"loss_scale_window\": 1000,\n        \"initial_"
  },
  {
    "path": "llavamod/config/dpconfig/dpo_zero2_offload.json",
    "chars": 1260,
    "preview": "{\n    \"fp16\": {\n        \"enabled\": \"auto\",\n        \"loss_scale\": 0,\n        \"loss_scale_window\": 100,\n        \"initial_s"
  },
  {
    "path": "llavamod/config/dpconfig/zero2.json",
    "chars": 556,
    "preview": "{\n    \"fp16\": {\n        \"enabled\": \"auto\",\n        \"loss_scale\": 0,\n        \"loss_scale_window\": 1000,\n        \"initial_"
  },
  {
    "path": "llavamod/config/dpconfig/zero2_offload.json",
    "chars": 625,
    "preview": "{\n    \"fp16\": {\n        \"enabled\": \"auto\",\n        \"loss_scale\": 0,\n        \"loss_scale_window\": 100,\n        \"initial_s"
  },
  {
    "path": "llavamod/config/dpconfig/zero3.json",
    "chars": 801,
    "preview": "{\n    \"fp16\": {\n        \"enabled\": \"auto\",\n        \"loss_scale\": 0,\n        \"loss_scale_window\": 100,\n        \"initial_s"
  },
  {
    "path": "llavamod/config/dpconfig/zero3_offload.json",
    "chars": 1280,
    "preview": "{\n  \"fp16\": {\n    \"enabled\": \"auto\",\n    \"loss_scale\": 0,\n    \"loss_scale_window\": 1000,\n    \"initial_scale_power\": 16,\n"
  },
  {
    "path": "llavamod/constants.py",
    "chars": 826,
    "preview": "CONTROLLER_HEART_BEAT_EXPIRATION = 30\nWORKER_HEART_BEAT_INTERVAL = 15\n\nLOGDIR = \".\"\n\nIGNORE_INDEX = -100\n\nIMAGE_TOKEN_IN"
  },
  {
    "path": "llavamod/conversation.py",
    "chars": 18395,
    "preview": "import dataclasses\nfrom enum import auto, Enum\nfrom typing import List, Tuple\n\n\nclass SeparatorStyle(Enum):\n    \"\"\"Diffe"
  },
  {
    "path": "llavamod/data/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "llavamod/data/data_utils.py",
    "chars": 27047,
    "preview": "import copy\nfrom typing import Dict, Sequence\n\nimport torch\n\nfrom llavamod.constants import IGNORE_INDEX, DEFAULT_IMAGE_"
  },
  {
    "path": "llavamod/data/dataset.py",
    "chars": 25055,
    "preview": "import os\nimport json\nimport random\nfrom dataclasses import dataclass\n\nfrom torch.utils.data import Dataset\n\nfrom llavam"
  },
  {
    "path": "llavamod/eval/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "llavamod/eval/eval_gpt_mmhal.py",
    "chars": 10201,
    "preview": "import argparse\nimport json\nimport time\nfrom gpt4_grpc import Chat\n\ntemplate = '''Please act as an impartial and objecti"
  },
  {
    "path": "llavamod/eval/eval_gpt_objhal.py",
    "chars": 28364,
    "preview": "import os\nimport sys\nimport ssl\nimport json\nimport copy\nimport glob\nimport time\nimport pathlib\nimport random\nimport json"
  },
  {
    "path": "llavamod/eval/eval_gpt_review.py",
    "chars": 3620,
    "preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport tqdm\nimport ray\nimport time\n\nNUM_SECONDS_TO_SLEEP = 3\n\n@ray."
  },
  {
    "path": "llavamod/eval/eval_gpt_review_bench.py",
    "chars": 4171,
    "preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport time\n\nNUM_SECONDS_TO_SLEEP = 0.5\n\n\ndef get_eval(content: str"
  },
  {
    "path": "llavamod/eval/eval_gpt_review_visual.py",
    "chars": 4177,
    "preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport time\n\nNUM_SECONDS_TO_SLEEP = 0.5\n\n\ndef get_eval(content: str"
  },
  {
    "path": "llavamod/eval/eval_gqa.py",
    "chars": 20332,
    "preview": "# Evaluation code for GQA.\n# Computes a suite of metrics such as accuracy, consistency, plausibility and scores per ques"
  },
  {
    "path": "llavamod/eval/eval_gqa_1.py",
    "chars": 20093,
    "preview": "# Evaluation code for GQA.\n# Computes a suite of metrics such as accuracy, consistency, plausibility and scores per ques"
  },
  {
    "path": "llavamod/eval/eval_pope.py",
    "chars": 2737,
    "preview": "import os\nimport json\nimport argparse\n\n\ndef eval_pope(answers, label_file):\n    label_list = [json.loads(q)['label'] for"
  },
  {
    "path": "llavamod/eval/eval_science_qa.py",
    "chars": 3920,
    "preview": "import argparse\nimport json\nimport os\nimport re\nimport random\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n "
  },
  {
    "path": "llavamod/eval/eval_science_qa_gpt4.py",
    "chars": 3675,
    "preview": "import argparse\nimport json\nimport os\nimport re\nimport random\nfrom collections import defaultdict\n\n\ndef get_args():\n    "
  },
  {
    "path": "llavamod/eval/eval_science_qa_gpt4_requery.py",
    "chars": 5774,
    "preview": "import argparse\nimport json\nimport os\nimport re\nimport random\nfrom collections import defaultdict\n\n\ndef get_args():\n    "
  },
  {
    "path": "llavamod/eval/eval_textvqa.py",
    "chars": 2229,
    "preview": "import os\nimport argparse\nimport json\nimport re\n\nfrom llavamod.eval.m4c_evaluator import TextVQAAccuracyEvaluator\n\n\ndef "
  },
  {
    "path": "llavamod/eval/generate_webpage_data_from_table.py",
    "chars": 4088,
    "preview": "\"\"\"Generate json file for webpage.\"\"\"\nimport json\nimport os\nimport re\n\n# models = ['llama', 'alpaca', 'gpt35', 'bard']\nm"
  },
  {
    "path": "llavamod/eval/gpt4_grpc.py",
    "chars": 1918,
    "preview": "import openai\nimport time\n\nNUM_SECONDS_TO_SLEEP = 0.5\n\nclass Chat:\n    def __init__(self, model=\"\", timeout_sec=20, open"
  },
  {
    "path": "llavamod/eval/m4c_evaluator.py",
    "chars": 10265,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport re\n\nfrom tqdm import tqdm\n\n\nclass EvalAIAnswerProcessor:\n    \""
  },
  {
    "path": "llavamod/eval/model_qa.py",
    "chars": 3291,
    "preview": "import argparse\nfrom transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria\nimport torch\nimport os\nim"
  },
  {
    "path": "llavamod/eval/model_vqa.py",
    "chars": 6350,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llavamod.constants impor"
  },
  {
    "path": "llavamod/eval/model_vqa_loader.py",
    "chars": 7962,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llavamod.constants impor"
  },
  {
    "path": "llavamod/eval/model_vqa_mmbench.py",
    "chars": 9608,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nimport pandas as pd\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llav"
  },
  {
    "path": "llavamod/eval/model_vqa_mmhal.py",
    "chars": 5256,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\nimport random\n\nfrom llavamod.c"
  },
  {
    "path": "llavamod/eval/model_vqa_objhal.py",
    "chars": 6500,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\nimport base64\nimport io\n\nfrom "
  },
  {
    "path": "llavamod/eval/model_vqa_qbench.py",
    "chars": 4986,
    "preview": "import argparse\nimport torch\nfrom tqdm import tqdm\nimport json\n\nfrom llavamod.constants import IMAGE_TOKEN_INDEX, DEFAUL"
  },
  {
    "path": "llavamod/eval/model_vqa_science.py",
    "chars": 8314,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llavamod.constants impor"
  },
  {
    "path": "llavamod/eval/qa_baseline_gpt35.py",
    "chars": 2345,
    "preview": "\"\"\"Generate answers with GPT-3.5\"\"\"\n# Note: you need to be using OpenAI Python v0.27.0 for the code below to work\nimport"
  },
  {
    "path": "llavamod/eval/run_llava.py",
    "chars": 4907,
    "preview": "import argparse\nimport torch\n\n\nfrom llavamod.constants import (\n    IMAGE_TOKEN_INDEX,\n    DEFAULT_IMAGE_TOKEN,\n    DEFA"
  },
  {
    "path": "llavamod/eval/summarize_gpt_review.py",
    "chars": 2481,
    "preview": "import json\nimport os\nfrom collections import defaultdict\n\nimport numpy as np\n\nimport argparse\n\n\ndef parse_args():\n    p"
  },
  {
    "path": "llavamod/eval/table/answer/answer_alpaca-13b.jsonl",
    "chars": 57071,
    "preview": "{\"question_id\": 1, \"text\": \"Improving time management skills involves setting priorities, breaking tasks into smaller ch"
  },
  {
    "path": "llavamod/eval/table/answer/answer_bard.jsonl",
    "chars": 112274,
    "preview": "{\"answer_id\": \"3oW4JY265ZPJGTYi2CgRYF\", \"model_id\": \"bard:20230327\", \"question_id\": 1, \"text\": \"Here are some tips on ho"
  },
  {
    "path": "llavamod/eval/table/answer/answer_gpt35.jsonl",
    "chars": 107603,
    "preview": "{\"answer_id\": \"BZGowHM7L3RvtWRktKZjLT\", \"model_id\": \"gpt-3.5-turbo:20230327\", \"question_id\": 1, \"text\": \"Here are some t"
  },
  {
    "path": "llavamod/eval/table/answer/answer_llama-13b.jsonl",
    "chars": 76353,
    "preview": "{\"answer_id\": \"J3UA6eGXGyFeUGqGpP3g34\", \"model_id\": \"llama-13b:v1\", \"question_id\": 1, \"text\": \"The following are some st"
  },
  {
    "path": "llavamod/eval/table/answer/answer_vicuna-13b.jsonl",
    "chars": 131904,
    "preview": "{\"answer_id\": \"cV4zXygaNP6CXEsgdHMEqz\", \"model_id\": \"vicuna-13b:20230322-clean-lang\", \"question_id\": 1, \"text\": \"Improvi"
  },
  {
    "path": "llavamod/eval/table/caps_boxes_coco2014_val_80.jsonl",
    "chars": 58574,
    "preview": "{\"id\": \"000000296284\", \"image\": \"000000296284.jpg\", \"captions\": [\"A donut shop is full of different flavors of donuts.\","
  },
  {
    "path": "llavamod/eval/table/model.jsonl",
    "chars": 681,
    "preview": "{\"model_id\": \"vicuna-13b:20230322-clean-lang\", \"model_name\": \"vicuna-13b\", \"model_version\": \"20230322-clean-lang\", \"mode"
  },
  {
    "path": "llavamod/eval/table/prompt.jsonl",
    "chars": 5129,
    "preview": "{\"prompt_id\": 1, \"system_prompt\": \"You are a helpful and precise assistant for checking the quality of the answer.\", \"pr"
  },
  {
    "path": "llavamod/eval/table/question.jsonl",
    "chars": 12885,
    "preview": "{\"question_id\": 1, \"text\": \"How can I improve my time management skills?\", \"category\": \"generic\"}\n{\"question_id\": 2, \"te"
  },
  {
    "path": "llavamod/eval/table/results/test_sqa_llava_13b_v0.json",
    "chars": 3950324,
    "preview": "{\n  \"acc\": 90.8983730252299,\n  \"correct\": 3855,\n  \"count\": 4241,\n  \"results\": {\n    \"4\": 1,\n    \"5\": 1,\n    \"11\": 1,\n   "
  },
  {
    "path": "llavamod/eval/table/results/test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json",
    "chars": 3830902,
    "preview": "{\n  \"acc\": 91.08700778118369,\n  \"correct\": 3863,\n  \"count\": 4241,\n  \"results\": {\n    \"4\": 1,\n    \"5\": 1,\n    \"11\": 1,\n  "
  },
  {
    "path": "llavamod/eval/table/review/review_alpaca-13b_vicuna-13b.jsonl",
    "chars": 73131,
    "preview": "{\"review_id\": \"QM5m5nnioWr8M2LFHsaQvu\", \"question_id\": 1, \"answer1_id\": \"kEL9ifUHDeYuAXzevje2se\", \"answer2_id\": \"cV4zXyg"
  },
  {
    "path": "llavamod/eval/table/review/review_bard_vicuna-13b.jsonl",
    "chars": 73145,
    "preview": "{\"review_id\": \"4CeMvEQyE6fKMJwvSLY3P4\", \"question_id\": 1, \"answer1_id\": \"3oW4JY265ZPJGTYi2CgRYF\", \"answer2_id\": \"cV4zXyg"
  },
  {
    "path": "llavamod/eval/table/review/review_gpt35_vicuna-13b.jsonl",
    "chars": 73399,
    "preview": "{\"review_id\": \"jyhS7AFj2mrFNqoRXQJDPS\", \"question_id\": 1, \"answer1_id\": \"BZGowHM7L3RvtWRktKZjLT\", \"answer2_id\": \"cV4zXyg"
  },
  {
    "path": "llavamod/eval/table/review/review_llama-13b_vicuna-13b.jsonl",
    "chars": 67249,
    "preview": "{\"review_id\": \"WFp5i5yjjFethrgugKTDmX\", \"question_id\": 1, \"answer1_id\": \"J3UA6eGXGyFeUGqGpP3g34\", \"answer2_id\": \"cV4zXyg"
  },
  {
    "path": "llavamod/eval/table/reviewer.jsonl",
    "chars": 604,
    "preview": "{\"reviewer_id\": \"gpt-4-0328-default\", \"prompt_id\": 1, \"metadata\": {\"temperature\": 0.2, \"max_tokens\": 1024}, \"description"
  },
  {
    "path": "llavamod/eval/table/rule.json",
    "chars": 9098,
    "preview": "{\n    \"coding\": {\"role\": \"Assistant\", \"prompt\": \"Your task is to evaluate the coding abilities of the above two assistan"
  },
  {
    "path": "llavamod/eval/webpage/index.html",
    "chars": 7664,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n    <meta charset=\"UTF-8\">\n    <meta name=\"viewport\" content=\"width=device-width"
  },
  {
    "path": "llavamod/eval/webpage/script.js",
    "chars": 9967,
    "preview": "// Description: Script for the evaluation webpage.\n\nlet currentQuestionIndex = 1;\n\n// Store the model name mapping for l"
  },
  {
    "path": "llavamod/eval/webpage/styles.css",
    "chars": 1822,
    "preview": "body {\n    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;\n    background-color: #f8f9fa;\n}\n\n.navbar-dark "
  },
  {
    "path": "llavamod/mm_utils.py",
    "chars": 4267,
    "preview": "from PIL import Image\nfrom io import BytesIO\nimport base64\n\nimport torch\nfrom transformers import StoppingCriteria\nfrom "
  },
  {
    "path": "llavamod/model/__init__.py",
    "chars": 2178,
    "preview": "from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaLlamaConfig\nfrom .language_model.llava_llama_moe imp"
  },
  {
    "path": "llavamod/model/apply_delta.py",
    "chars": 1959,
    "preview": "\"\"\"\nUsage:\npython3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --de"
  },
  {
    "path": "llavamod/model/builder.py",
    "chars": 41708,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/cache_utils.py",
    "chars": 28060,
    "preview": "import copy\nimport importlib.metadata\nimport json\nimport os\nfrom dataclasses import dataclass\nfrom typing import Any, Di"
  },
  {
    "path": "llavamod/model/consolidate.py",
    "chars": 920,
    "preview": "\"\"\"\nUsage:\npython3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate\n"
  },
  {
    "path": "llavamod/model/import_utils.py",
    "chars": 2125,
    "preview": "from functools import lru_cache\nimport importlib\nimport importlib.metadata\nimport importlib.util\n\nfrom functools import "
  },
  {
    "path": "llavamod/model/language_model/gemma2/configuration_gemma2.py",
    "chars": 7091,
    "preview": "# coding=utf-8\n# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.\n\n# Licensed under the Apache Lic"
  },
  {
    "path": "llavamod/model/language_model/gemma2/modeling_gemma2.py",
    "chars": 60625,
    "preview": "# coding=utf-8\n# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.\n#\n#\n# Licensed under the Apache "
  },
  {
    "path": "llavamod/model/language_model/gemma2/tokenization_gemma2.py",
    "chars": 14030,
    "preview": "# coding=utf-8\n# Copyright 2024 The HuggingFace Inc. team. All rights reserved.\n#\n# Licensed under the Apache License, V"
  },
  {
    "path": "llavamod/model/language_model/gemma2/tokenization_gemma2_fast.py",
    "chars": 8307,
    "preview": "# coding=utf-8\n# Copyright 2024 The HuggingFace Inc. team.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lice"
  },
  {
    "path": "llavamod/model/language_model/llama/configuration_llama.py",
    "chars": 9406,
    "preview": "# coding=utf-8\n# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.\n#\n# This code is based on"
  },
  {
    "path": "llavamod/model/language_model/llama/modeling_llama.py",
    "chars": 65626,
    "preview": "# coding=utf-8\n# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.\n#\n# This code is based on"
  },
  {
    "path": "llavamod/model/language_model/llava_gemma2.py",
    "chars": 4845,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_gemma2_moe.py",
    "chars": 29313,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_llama.py",
    "chars": 4302,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_llama_moe.py",
    "chars": 31795,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_minicpm.py",
    "chars": 4802,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_minicpm_moe.py",
    "chars": 24835,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_mistral.py",
    "chars": 4734,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_mistral_moe.py",
    "chars": 23252,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_mpt.py",
    "chars": 5909,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_phi.py",
    "chars": 4706,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_phi_moe.py",
    "chars": 21952,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_qwen.py",
    "chars": 8157,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_qwen1_5.py",
    "chars": 6207,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_qwen1_5_moe.py",
    "chars": 29656,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_qwen2.py",
    "chars": 4694,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_qwen2_moe.py",
    "chars": 29566,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_qwen_moe.py",
    "chars": 29653,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_stablelm.py",
    "chars": 4567,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/llava_stablelm_moe.py",
    "chars": 22343,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/language_model/minicpm/configuration_minicpm.py",
    "chars": 9634,
    "preview": "# coding=utf-8\n# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.\n#\n# This code is based on"
  },
  {
    "path": "llavamod/model/language_model/minicpm/modeling_minicpm.py",
    "chars": 67873,
    "preview": "# coding=utf-8\n# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.\n#\n# This code is based on"
  },
  {
    "path": "llavamod/model/language_model/mpt/adapt_tokenizer.py",
    "chars": 1752,
    "preview": "from typing import Union\nfrom transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast\nTokenizer "
  },
  {
    "path": "llavamod/model/language_model/mpt/attention.py",
    "chars": 17695,
    "preview": "\"\"\"Attention layers.\"\"\"\nimport math\nimport warnings\nfrom typing import Optional\nimport torch\nimport torch.nn as nn\nfrom "
  },
  {
    "path": "llavamod/model/language_model/mpt/blocks.py",
    "chars": 2551,
    "preview": "\"\"\"GPT Blocks used for the GPT Model.\"\"\"\nfrom typing import Dict, Optional, Tuple\nimport torch\nimport torch.nn as nn\nfro"
  },
  {
    "path": "llavamod/model/language_model/mpt/configuration_mpt.py",
    "chars": 9198,
    "preview": "\"\"\"A HuggingFace-style model configuration.\"\"\"\nfrom typing import Dict, Optional, Union\nfrom transformers import Pretrai"
  },
  {
    "path": "llavamod/model/language_model/mpt/custom_embedding.py",
    "chars": 305,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch import Tensor\n\nclass SharedEmbedding(nn.Em"
  },
  {
    "path": "llavamod/model/language_model/mpt/flash_attn_triton.py",
    "chars": 28182,
    "preview": "\"\"\"\nCopied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn"
  },
  {
    "path": "llavamod/model/language_model/mpt/hf_prefixlm_converter.py",
    "chars": 27219,
    "preview": "\"\"\"Converts Huggingface Causal LM to Prefix LM.\n\nConversion does lightweight surgery on a HuggingFace\nCausal LM to conve"
  },
  {
    "path": "llavamod/model/language_model/mpt/meta_init_context.py",
    "chars": 3639,
    "preview": "from contextlib import contextmanager\nimport torch\nimport torch.nn as nn\n\n@contextmanager\ndef init_empty_weights(include"
  },
  {
    "path": "llavamod/model/language_model/mpt/modeling_mpt.py",
    "chars": 19710,
    "preview": "\"\"\"A simple, flexible implementation of a GPT model.\n\nInspired by https://github.com/karpathy/minGPT/blob/master/mingpt/"
  },
  {
    "path": "llavamod/model/language_model/mpt/norm.py",
    "chars": 2563,
    "preview": "import torch\n\ndef _cast_if_autocast_enabled(tensor):\n    if torch.is_autocast_enabled():\n        if tensor.device.type ="
  },
  {
    "path": "llavamod/model/language_model/mpt/param_init_fns.py",
    "chars": 12556,
    "preview": "import math\nimport warnings\nfrom collections.abc import Sequence\nfrom functools import partial\nfrom typing import Option"
  },
  {
    "path": "llavamod/model/language_model/phi/configuration_phi.py",
    "chars": 9258,
    "preview": "# coding=utf-8\n# Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved.\n#\n# Licensed under the Apa"
  },
  {
    "path": "llavamod/model/language_model/phi/modeling_phi.py",
    "chars": 62665,
    "preview": "# coding=utf-8\n# Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved.\n#\n# Licensed under the Apa"
  },
  {
    "path": "llavamod/model/language_model/qwen/configuration_qwen.py",
    "chars": 2344,
    "preview": "# Copyright (c) Alibaba Cloud.\n\n# This source code is licensed under the license found in the\n# LICENSE file in the root"
  },
  {
    "path": "llavamod/model/language_model/qwen/cpp_kernels.py",
    "chars": 1924,
    "preview": "from torch.utils import cpp_extension\nimport pathlib\nimport os\nimport subprocess\n\ndef _get_cuda_bare_metal_version(cuda_"
  },
  {
    "path": "llavamod/model/language_model/qwen/modeling_qwen.py",
    "chars": 55516,
    "preview": "# Copyright (c) Alibaba Cloud.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the roo"
  },
  {
    "path": "llavamod/model/language_model/qwen/qwen_generation_utils.py",
    "chars": 14604,
    "preview": "# Copyright (c) Alibaba Cloud.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the roo"
  },
  {
    "path": "llavamod/model/language_model/qwen/tokenization_qwen.py",
    "chars": 9612,
    "preview": "# Copyright (c) Alibaba Cloud.\n#\n# This source code is licensed under the license found in the\n# LICENSE file in the roo"
  },
  {
    "path": "llavamod/model/language_model/qwen1_5/configuration_qwen2.py",
    "chars": 6816,
    "preview": "# coding=utf-8\n# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.\n#\n# Lic"
  },
  {
    "path": "llavamod/model/language_model/qwen1_5/modeling_qwen2.py",
    "chars": 64144,
    "preview": "# coding=utf-8\n# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.\n#\n# Thi"
  },
  {
    "path": "llavamod/model/language_model/qwen2/configuration_qwen2.py",
    "chars": 6816,
    "preview": "# coding=utf-8\n# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.\n#\n# Lic"
  },
  {
    "path": "llavamod/model/language_model/qwen2/modeling_qwen2.py",
    "chars": 62887,
    "preview": "# coding=utf-8\n# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.\n#\n# Thi"
  },
  {
    "path": "llavamod/model/language_model/stablelm/configuration_stablelm_epoch.py",
    "chars": 5453,
    "preview": "# Copyright 2023 Stability and The HuggingFace Inc. team. All rights reserved.\n#\n# Licensed under the Apache License, Ve"
  },
  {
    "path": "llavamod/model/language_model/stablelm/modeling_stablelm_epoch.py",
    "chars": 38196,
    "preview": "# coding=utf-8\n# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved.\n#\n# Licens"
  },
  {
    "path": "llavamod/model/language_model/stablelm/tokenization_arcade100k.py",
    "chars": 9462,
    "preview": "# coding=utf-8\n# Copyright (c) 2023 Alibaba Cloud & Stability AI.\n#\n# Tongyi Qianwen LICENSE AGREEMENT:\n# https://github"
  },
  {
    "path": "llavamod/model/llava_arch.py",
    "chars": 28292,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llavamod/model/make_delta.py",
    "chars": 2260,
    "preview": "\"\"\"\nUsage:\npython3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~"
  },
  {
    "path": "llavamod/model/modeling_flash_attention_utils.py",
    "chars": 13862,
    "preview": "# coding=utf-8\n# Copyright 2024 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.\n#\n# Licensed und"
  },
  {
    "path": "llavamod/model/multimodal_encoder/builder.py",
    "chars": 2008,
    "preview": "import os\nfrom .clip_encoder import CLIPVisionTower\nfrom .clips2_encoder import CLIPVisionTowerS2\nimport transformers\n\na"
  },
  {
    "path": "llavamod/model/multimodal_encoder/clip_encoder.py",
    "chars": 2939,
    "preview": "import torch\nimport torch.nn as nn\n\nfrom transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig\n\n\ncla"
  },
  {
    "path": "llavamod/model/multimodal_encoder/clips2_encoder.py",
    "chars": 2745,
    "preview": "import torch\nimport torch.nn as nn\n\nfrom transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig\nfrom "
  },
  {
    "path": "llavamod/model/multimodal_encoder/siglip_encoder.py",
    "chars": 2850,
    "preview": "import torch\nimport torch.nn as nn\n\nfrom transformers import SiglipImageProcessor, SiglipVisionConfig\nfrom transformers."
  },
  {
    "path": "llavamod/model/multimodal_projector/builder.py",
    "chars": 9210,
    "preview": "import torch\nimport torch.nn as nn\nimport re\n\nfrom einops import rearrange\n\nfrom llavamod.model.multimodal_projector.poo"
  },
  {
    "path": "llavamod/model/multimodal_projector/pool_block.py",
    "chars": 1352,
    "preview": "import re\n\nimport torch\nfrom einops import rearrange\nfrom timm.models.vision_transformer import Block\nfrom torch import "
  },
  {
    "path": "llavamod/model/multimodal_projector/qformer.py",
    "chars": 12532,
    "preview": "import re\nfrom typing import Optional\n\nimport torch\nfrom torch import nn\nfrom transformers import Blip2PreTrainedModel, "
  },
  {
    "path": "llavamod/model/multimodal_projector/simple_block.py",
    "chars": 5620,
    "preview": "import re\n\nimport torch\nfrom einops import rearrange\nfrom timm.models.vision_transformer import Block\nfrom torch import "
  },
  {
    "path": "llavamod/model/utils.py",
    "chars": 4703,
    "preview": "import logging\nimport torch\nfrom transformers import AutoConfig\nfrom copy import deepcopy\n\n\ndef auto_upgrade(config):\n  "
  },
  {
    "path": "llavamod/serve/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "llavamod/serve/cli.py",
    "chars": 5238,
    "preview": "import argparse\nimport torch\n\nfrom llavamod.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TO"
  },
  {
    "path": "llavamod/serve/utils.py",
    "chars": 457,
    "preview": "from io import BytesIO\n\nimport requests\nfrom PIL import Image\n\n\ndef load_image(image_file):\n    if image_file.startswith"
  },
  {
    "path": "llavamod/train/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "llavamod/train/align_train.py",
    "chars": 31870,
    "preview": "import copy\nimport wandb\nimport pathlib\nfrom glob import glob\n\nfrom llavamod.config.args import ModelArguments, DataArgu"
  },
  {
    "path": "llavamod/train/align_trainer.py",
    "chars": 26915,
    "preview": "import os\nimport warnings\nfrom copy import deepcopy\nfrom collections import defaultdict\nfrom typing import (\n    Any, Ca"
  },
  {
    "path": "llavamod/train/dpo_train.py",
    "chars": 29192,
    "preview": "import copy\nimport wandb\nimport pathlib\nfrom glob import glob\n\nfrom llavamod.config.args import ModelArguments, DataArgu"
  },
  {
    "path": "llavamod/train/dpo_trainer.py",
    "chars": 31126,
    "preview": "import os\nimport warnings\nfrom copy import deepcopy\nfrom collections import OrderedDict, defaultdict\nfrom typing import "
  },
  {
    "path": "llavamod/train/llava_trainer.py",
    "chars": 12432,
    "preview": "import os\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.utils.data import Sampler\n\nfrom"
  },
  {
    "path": "llavamod/train/train.py",
    "chars": 28059,
    "preview": "import wandb\nimport pathlib\nfrom glob import glob\n\nfrom llavamod.config.args import ModelArguments, DataArguments, Train"
  },
  {
    "path": "llavamod/train/train_utils.py",
    "chars": 6342,
    "preview": "import os\nimport sys\nimport logging\nimport torch\nimport transformers\n\nfrom importlib.util import find_spec\n\n\ndef maybe_z"
  },
  {
    "path": "llavamod/utils.py",
    "chars": 4889,
    "preview": "import datetime\nimport logging\nimport logging.handlers\nimport os\nimport sys\nfrom torch import nn\nimport numpy as np\nimpo"
  },
  {
    "path": "requirements.txt",
    "chars": 452,
    "preview": "tokenizers==0.15.1\ntransformers==4.37.0\ntiktoken==0.5.2\nprotobuf==4.24.4\naccelerate==0.21.0\nbackoff\nbitsandbytes==0.43.0"
  },
  {
    "path": "scripts/activated_params.py",
    "chars": 1215,
    "preview": "def num_param(vocab_size, hidden_size, num_hidden_layers, intermediate_size, ffn_factor, freq_moe_layer, num_experts):\n "
  },
  {
    "path": "scripts/convert_gqa_for_eval.py",
    "chars": 487,
    "preview": "import os\nimport json\nimport argparse\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\"--src\", type=str)\nparser."
  },
  {
    "path": "scripts/convert_mmbench_for_submission.py",
    "chars": 980,
    "preview": "import os\nimport json\nimport argparse\nimport pandas as pd\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    pa"
  },
  {
    "path": "scripts/convert_mmvet_for_eval.py",
    "chars": 397,
    "preview": "import os\nimport json\nimport argparse\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\"--src\", type=str)\nparser."
  },
  {
    "path": "scripts/convert_seed_for_submission.py",
    "chars": 2571,
    "preview": "import os\nimport json\nimport argparse\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\""
  },
  {
    "path": "scripts/convert_sqa_to_llava.py",
    "chars": 2971,
    "preview": "import json\nimport os\nimport fire\nimport re\nfrom convert_sqa_to_llava_base_prompt import build_prompt_chatbot\n\n\ndef conv"
  },
  {
    "path": "scripts/convert_sqa_to_llava_base_prompt.py",
    "chars": 13795,
    "preview": "def get_question_text(problem):\n    question = problem['question']\n    return question\n\n\ndef get_context_text(problem, u"
  },
  {
    "path": "scripts/convert_vizwiz_for_submission.py",
    "chars": 1390,
    "preview": "import os\nimport argparse\nimport json\n\nfrom llavamod.eval.m4c_evaluator import EvalAIAnswerProcessor\n\n\ndef parse_args():"
  },
  {
    "path": "scripts/convert_vqav2_for_submission.py",
    "chars": 1746,
    "preview": "import os\nimport argparse\nimport json\n\nfrom llavamod.eval.m4c_evaluator import EvalAIAnswerProcessor\n\n\ndef parse_args():"
  },
  {
    "path": "scripts/extract_mm_projector.py",
    "chars": 1604,
    "preview": "\"\"\"\nThis is just a utility that I use to extract the projector for quantized models.\nIt is NOT necessary at all to train"
  },
  {
    "path": "scripts/kill.py",
    "chars": 185,
    "preview": "#!/usr/bin/env python\n# coding=utf-8\nimport os\npid = list(set(os.popen('fuser -v /dev/nvidia*').read().split()))\nkill_cm"
  },
  {
    "path": "scripts/merge_lora_weights.py",
    "chars": 773,
    "preview": "import argparse\nfrom llavamod.model.builder import load_pretrained_model\nfrom llavamod.mm_utils import get_model_name_fr"
  },
  {
    "path": "scripts/merge_moe_lora_weights.py",
    "chars": 3790,
    "preview": "import argparse\n\nimport torch\nfrom peft.tuners.lora import LoraLayer\nfrom peft.utils import ModulesToSaveWrapper, _get_s"
  },
  {
    "path": "scripts/sqa_eval_batch.sh",
    "chars": 524,
    "preview": "#!/bin/bash\n\nCHUNKS=8\nfor IDX in {0..7}; do\n    CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \\\n     "
  },
  {
    "path": "scripts/sqa_eval_gather.sh",
    "chars": 518,
    "preview": "#!/bin/bash\n\nCHUNKS=8\noutput_file=\"test_llava-13b.jsonl\"\n\n# Clear out the output file if it exists.\n> \"$output_file\"\n\n# "
  },
  {
    "path": "shells/eval/gqa.sh",
    "chars": 1448,
    "preview": "#!/bin/bash\n\nGPULIST=(0 1 2 3)\nCHUNKS=${#GPULIST[@]}\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"q"
  },
  {
    "path": "shells/eval/mmbench.sh",
    "chars": 746,
    "preview": "#!/bin/bash\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"qwen\"\nSPLIT=\"mmbench_dev_en_20231003\"\nEVAL"
  },
  {
    "path": "shells/eval/mmbench_cn.sh",
    "chars": 758,
    "preview": "#!/bin/bash\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"qwen\"\nEVAL=\"benchmark\"\nSPLIT=\"mmbench_dev_"
  },
  {
    "path": "shells/eval/mme.sh",
    "chars": 629,
    "preview": "#!/bin/bash\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"qwen\"\nEVAL=\"benchmark\"\nIMAGE_FOLDER=\"your_"
  },
  {
    "path": "shells/eval/mmhal.sh",
    "chars": 738,
    "preview": "#!/bin/bash\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"qwen\"\nEVAL=\"benchmark\"\nIMAGE_FOLDER=\"your_"
  },
  {
    "path": "shells/eval/objhal.sh",
    "chars": 717,
    "preview": "#!/bin/bash\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"qwen\"\nEVAL=\"benchmark\"\nquestion_file='your"
  },
  {
    "path": "shells/eval/pope.sh",
    "chars": 654,
    "preview": "#!/bin/bash\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"qwen\"\n\nEVAL=\"benchmark\"\nIMAGE_FOLDER=\"your"
  },
  {
    "path": "shells/eval/sqa.sh",
    "chars": 766,
    "preview": "#!/bin/bash\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"qwen\"\nEVAL=\"benchmark\"\n\ndeepspeed --includ"
  },
  {
    "path": "shells/eval/textvqa.sh",
    "chars": 620,
    "preview": "#!/bin/bash\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"qwen\"\nEVAL=\"benchmark\"\n\ndeepspeed --includ"
  },
  {
    "path": "shells/eval/vizwiz.sh",
    "chars": 672,
    "preview": "#!/bin/bash\n\nMODEL_NAME='your_model_name'\nMODEL_PATH='your_model_path'\n\nCONV=\"qwen\"\nEVAL=\"benchmark\"\n\ndeepspeed --includ"
  },
  {
    "path": "shells/inference/cli_inference.sh",
    "chars": 239,
    "preview": "#!/bin/bash\n\n\nmodel_path='your_model'\nimage_file='llavamod/serve/examples/extreme_ironing.jpg'\n\n# use qwen\ndeepspeed --i"
  },
  {
    "path": "shells/train/qwen/dense2dense_distillation.sh",
    "chars": 2739,
    "preview": "#!/bin/bash\n\n# WANDB\nWANDB_API_KEY='your_wandb_api_key'\nexport WANDB_API_KEY=${WANDB_API_KEY}\nexport WANDB_MODE=online\nw"
  },
  {
    "path": "shells/train/qwen/dense2sparse_distillation.sh",
    "chars": 2623,
    "preview": "#!/bin/bash\n\n# WANDB\nWANDB_API_KEY='your_wandb_api_key'\nexport WANDB_API_KEY=${WANDB_API_KEY}\nexport WANDB_MODE=online\nw"
  },
  {
    "path": "shells/train/qwen/finetune.sh",
    "chars": 1721,
    "preview": "#!/bin/bash\n\n# WANDB\nWANDB_API_KEY='your_wandb_api_key'\nexport WANDB_API_KEY=${WANDB_API_KEY}\nexport WANDB_MODE=online\nw"
  },
  {
    "path": "shells/train/qwen/finetune_moe.sh",
    "chars": 2079,
    "preview": "#!/bin/bash\n\n# WANDB\nWANDB_API_KEY='your_wandb_api_key'\nexport WANDB_API_KEY=${WANDB_API_KEY}\nexport WANDB_MODE=online\nw"
  },
  {
    "path": "shells/train/qwen/preference_distillation.sh",
    "chars": 2643,
    "preview": "#!/bin/bash\n\n# WANDB\nWANDB_API_KEY='your_wandb_api_key'\nexport WANDB_API_KEY=${WANDB_API_KEY}\nexport WANDB_MODE=online\nw"
  },
  {
    "path": "shells/train/qwen/pretrain.sh",
    "chars": 1571,
    "preview": "#!/bin/bash\n\n# WANDB\nWANDB_API_KEY='your_wandb_api_key'\nexport WANDB_API_KEY=${WANDB_API_KEY}\nexport WANDB_MODE=online\nw"
  }
]

About this extraction

This page contains the full source code of the shufangxun/LLaVA-MoD GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction covers 181 files (9.9 MB), approximately 2.6M tokens, and a symbol index of 1352 functions, classes, methods, constants, and types. Use it with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
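As a minimal sketch (not part of the extraction itself), the per-file manifest above can be consumed with standard JSON tooling once the array has been saved to its own file; the filename manifest.json below is a hypothetical placeholder. Each entry exposes the three keys visible above: path, chars, and preview.

import json

# Load the per-file manifest shown above. "manifest.json" is a
# hypothetical filename: the JSON array must first be saved out of
# the full extraction text on its own.
with open("manifest.json", encoding="utf-8") as f:
    manifest = json.load(f)

# Every entry carries the repo-relative path, the file size in
# characters, and a short preview of the file's contents.
total_chars = sum(entry["chars"] for entry in manifest)
print(f"{len(manifest)} files, {total_chars:,} characters in total")

# Example: pick out just the MoE language-model variants listed above.
for entry in manifest:
    if entry["path"].endswith("_moe.py"):
        print(entry["path"], entry["chars"])

Filtering like this is useful when the full 2.6M-token extraction exceeds a single context window and an agent only needs a subset of files.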

Extracted by GitExtract, a free GitHub repository-to-text converter for AI. Built by Nikandr Surkov.
