Full Code of haotian-liu/LLaVA for AI

main c121f0432da2 cached
140 files
459.8 KB
119.2k tokens
322 symbols
1 requests
Download .txt
Showing preview only (495K chars total). Download the full file or copy to clipboard to get everything.
Repository: haotian-liu/LLaVA
Branch: main
Commit: c121f0432da2
Files: 140
Total size: 459.8 KB

Directory structure:
gitextract_4r0bw0y0/

├── .devcontainer/
│   ├── Dockerfile
│   ├── devcontainer.env
│   ├── devcontainer.json
│   └── postCreateCommand.sh
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github/
│   └── ISSUE_TEMPLATE/
│       ├── 1-usage.yaml
│       ├── 2-feature-request.yaml
│       ├── 3-question.yaml
│       └── 4-discussion.yaml
├── .gitignore
├── LICENSE
├── README.md
├── cog.yaml
├── docs/
│   ├── Customize_Component.md
│   ├── Data.md
│   ├── Evaluation.md
│   ├── Finetune_Custom_Data.md
│   ├── Intel.md
│   ├── LLaVA_Bench.md
│   ├── LLaVA_from_LLaMA2.md
│   ├── LoRA.md
│   ├── MODEL_ZOO.md
│   ├── ScienceQA.md
│   ├── Windows.md
│   └── macOS.md
├── llava/
│   ├── __init__.py
│   ├── constants.py
│   ├── conversation.py
│   ├── eval/
│   │   ├── eval_gpt_review.py
│   │   ├── eval_gpt_review_bench.py
│   │   ├── eval_gpt_review_visual.py
│   │   ├── eval_pope.py
│   │   ├── eval_science_qa.py
│   │   ├── eval_science_qa_gpt4.py
│   │   ├── eval_science_qa_gpt4_requery.py
│   │   ├── eval_textvqa.py
│   │   ├── generate_webpage_data_from_table.py
│   │   ├── m4c_evaluator.py
│   │   ├── model_qa.py
│   │   ├── model_vqa.py
│   │   ├── model_vqa_loader.py
│   │   ├── model_vqa_mmbench.py
│   │   ├── model_vqa_science.py
│   │   ├── qa_baseline_gpt35.py
│   │   ├── run_llava.py
│   │   ├── summarize_gpt_review.py
│   │   └── webpage/
│   │       ├── index.html
│   │       ├── script.js
│   │       └── styles.css
│   ├── mm_utils.py
│   ├── model/
│   │   ├── __init__.py
│   │   ├── apply_delta.py
│   │   ├── builder.py
│   │   ├── consolidate.py
│   │   ├── language_model/
│   │   │   ├── llava_llama.py
│   │   │   ├── llava_mistral.py
│   │   │   └── llava_mpt.py
│   │   ├── llava_arch.py
│   │   ├── make_delta.py
│   │   ├── multimodal_encoder/
│   │   │   ├── builder.py
│   │   │   └── clip_encoder.py
│   │   ├── multimodal_projector/
│   │   │   └── builder.py
│   │   └── utils.py
│   ├── serve/
│   │   ├── __init__.py
│   │   ├── cli.py
│   │   ├── controller.py
│   │   ├── gradio_web_server.py
│   │   ├── model_worker.py
│   │   ├── register_worker.py
│   │   ├── sglang_worker.py
│   │   └── test_message.py
│   ├── train/
│   │   ├── llama_flash_attn_monkey_patch.py
│   │   ├── llama_xformers_attn_monkey_patch.py
│   │   ├── llava_trainer.py
│   │   ├── train.py
│   │   ├── train_mem.py
│   │   └── train_xformers.py
│   └── utils.py
├── playground/
│   └── data/
│       └── prompts/
│           ├── complex_reasoning/
│           │   ├── 000_caps.txt
│           │   ├── 000_conv.txt
│           │   ├── 001_caps.txt
│           │   ├── 001_conv.txt
│           │   ├── 002_caps.txt
│           │   ├── 002_conv.txt
│           │   └── system_message.txt
│           ├── conversation/
│           │   ├── 000_caps.txt
│           │   ├── 000_conv.txt
│           │   ├── 001_caps.txt
│           │   ├── 001_conv.txt
│           │   └── system_message.txt
│           └── detail_description/
│               ├── 000_caps.txt
│               ├── 000_conv.txt
│               ├── 001_caps.txt
│               ├── 001_conv.txt
│               ├── 002_caps.txt
│               ├── 002_conv.txt
│               └── system_message.txt
├── predict.py
├── pyproject.toml
└── scripts/
    ├── convert_gqa_for_eval.py
    ├── convert_mmbench_for_submission.py
    ├── convert_mmvet_for_eval.py
    ├── convert_seed_for_submission.py
    ├── convert_sqa_to_llava.py
    ├── convert_sqa_to_llava_base_prompt.py
    ├── convert_vizwiz_for_submission.py
    ├── convert_vqav2_for_submission.py
    ├── extract_mm_projector.py
    ├── finetune.sh
    ├── finetune_full_schedule.sh
    ├── finetune_lora.sh
    ├── finetune_qlora.sh
    ├── finetune_sqa.sh
    ├── merge_lora_weights.py
    ├── pretrain.sh
    ├── pretrain_xformers.sh
    ├── sqa_eval_batch.sh
    ├── sqa_eval_gather.sh
    ├── upload_pypi.sh
    └── v1_5/
        ├── eval/
        │   ├── gqa.sh
        │   ├── llavabench.sh
        │   ├── mmbench.sh
        │   ├── mmbench_cn.sh
        │   ├── mme.sh
        │   ├── mmvet.sh
        │   ├── pope.sh
        │   ├── qbench.sh
        │   ├── qbench_zh.sh
        │   ├── seed.sh
        │   ├── sqa.sh
        │   ├── textvqa.sh
        │   ├── vizwiz.sh
        │   └── vqav2.sh
        ├── finetune.sh
        ├── finetune_lora.sh
        ├── finetune_task.sh
        ├── finetune_task_lora.sh
        └── pretrain.sh

================================================
FILE CONTENTS
================================================

================================================
FILE: .devcontainer/Dockerfile
================================================
FROM mcr.microsoft.com/devcontainers/base:ubuntu-20.04

SHELL [ "bash", "-c" ]

# update apt and install packages
RUN apt update && \
    apt install -yq \
        ffmpeg \
        dkms \
        build-essential

# add user tools
RUN sudo apt install -yq \
        jq \
        jp \
        tree \
        tldr

# add git-lfs and install
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash && \
    sudo apt-get install -yq git-lfs && \
    git lfs install

############################################
# Setup user
############################################

USER vscode

# install azcopy, a tool to copy to/from blob storage
# for more info: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-upload#upload-a-file
RUN cd /tmp && \
    wget https://azcopyvnext.azureedge.net/release20230123/azcopy_linux_amd64_10.17.0.tar.gz && \
    tar xvf azcopy_linux_amd64_10.17.0.tar.gz && \
    mkdir -p ~/.local/bin && \
    mv azcopy_linux_amd64_10.17.0/azcopy ~/.local/bin && \
    chmod +x ~/.local/bin/azcopy && \
    rm -rf azcopy_linux_amd64*

# Setup conda
RUN cd /tmp && \
    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash ./Miniconda3-latest-Linux-x86_64.sh -b && \
    rm ./Miniconda3-latest-Linux-x86_64.sh

# Install dotnet
RUN cd /tmp && \
    wget https://dot.net/v1/dotnet-install.sh && \
    chmod +x dotnet-install.sh && \
    ./dotnet-install.sh --channel 7.0 && \
    ./dotnet-install.sh --channel 3.1 && \
    rm ./dotnet-install.sh



================================================
FILE: .devcontainer/devcontainer.env
================================================
SAMPLE_ENV_VAR1="Sample Value"
SAMPLE_ENV_VAR2=332431bf-68bf

================================================
FILE: .devcontainer/devcontainer.json
================================================
{
    "name": "LLaVA",
    "build": {
        "dockerfile": "Dockerfile",
        "context": "..",
        "args": {}
    },
    "features": {
        "ghcr.io/devcontainers/features/docker-in-docker:2": {},
        "ghcr.io/devcontainers/features/azure-cli:1": {},
        "ghcr.io/azure/azure-dev/azd:0": {},
        "ghcr.io/devcontainers/features/powershell:1": {},
        "ghcr.io/devcontainers/features/common-utils:2": {},
        "ghcr.io/devcontainers-contrib/features/zsh-plugins:0": {},
    },
    // "forwardPorts": [],
    "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh",
    "customizations": {
        "vscode": {
            "settings": {
                "python.analysis.autoImportCompletions": true,
                "python.analysis.autoImportUserSymbols": true,
                "python.defaultInterpreterPath": "~/miniconda3/envs/llava/bin/python",
                "python.formatting.provider": "yapf",
                "python.linting.enabled": true,
                "python.linting.flake8Enabled": true,
                "isort.check": true,
                "dev.containers.copyGitConfig": true,
                "terminal.integrated.defaultProfile.linux": "zsh",
                "terminal.integrated.profiles.linux": {
                    "zsh": {
                        "path": "/usr/bin/zsh"
                    },
                }
            },
            "extensions": [
                "aaron-bond.better-comments",
                "eamodio.gitlens",
                "EditorConfig.EditorConfig",
                "foxundermoon.shell-format",
                "GitHub.copilot-chat",
                "GitHub.copilot-labs",
                "GitHub.copilot",
                "lehoanganh298.json-lines-viewer",
                "mhutchie.git-graph",
                "ms-azuretools.vscode-docker",
                "ms-dotnettools.dotnet-interactive-vscode",
                "ms-python.flake8",
                "ms-python.isort",
                "ms-python.python",
                "ms-python.vscode-pylance",
                "njpwerner.autodocstring",
                "redhat.vscode-yaml",
                "stkb.rewrap",
                "yzhang.markdown-all-in-one",
            ]
        }
    },
    "mounts": [],
    "runArgs": [
        "--gpus",
        "all",
        // "--ipc",
        // "host",
        "--ulimit",
        "memlock=-1",
        "--env-file",
        ".devcontainer/devcontainer.env"
    ],
    // "remoteUser": "root"
}


================================================
FILE: .devcontainer/postCreateCommand.sh
================================================
git config --global safe.directory '*'
git config --global core.editor "code --wait"
git config --global pager.branch false

# Set AZCOPY concurrency to auto
echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.zshrc
echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.bashrc

# Activate conda by default
echo ". /home/vscode/miniconda3/bin/activate" >> ~/.zshrc
echo ". /home/vscode/miniconda3/bin/activate" >> ~/.bashrc

# Use llava environment by default
echo "conda activate llava" >> ~/.zshrc
echo "conda activate llava" >> ~/.bashrc

# Add dotnet to PATH
echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.bashrc
echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.zshrc

# Create and activate llava environment
source /home/vscode/miniconda3/bin/activate
conda create -y -q -n llava python=3.10
conda activate llava

# Install Nvidia Cuda Compiler
conda install -y -c nvidia cuda-compiler

pip install pre-commit==3.0.2

# Install package locally
pip install --upgrade pip  # enable PEP 660 support
pip install -e .

# Install additional packages for training
pip install -e ".[train]"
pip install flash-attn --no-build-isolation

# Download checkpoints to location outside of the repo
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b ~/llava-v1.5-7b

# Commented because it is unlikely for users to have enough local GPU memory to load the model
# git clone https://huggingface.co/liuhaotian/llava-v1.5-13b ~/llava-v1.5-13b

echo "postCreateCommand.sh COMPLETE!"


================================================
FILE: .dockerignore
================================================
# The .dockerignore file excludes files from the container build process.
#
# https://docs.docker.com/engine/reference/builder/#dockerignore-file

# Exclude Git files
.git
.github
.gitignore

# Exclude Python cache files
__pycache__
.mypy_cache
.pytest_cache
.ruff_cache

# Exclude Python virtual environment
/venv

# Exclude some weights
/openai
/liuhaotian


================================================
FILE: .editorconfig
================================================
root = true

# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
charset = utf-8

# 4 space indentation
[*.{py,json}]
indent_style = space
indent_size = 4

# 2 space indentation
[*.{md,sh,yaml,yml}]
indent_style = space
indent_size = 2

================================================
FILE: .gitattributes
================================================
# https://git-scm.com/docs/gitattributes

# Set the default behavior, in case people don't have core.autocrlf set.
# https://git-scm.com/docs/gitattributes#_end_of_line_conversion
* text=auto

# common python attributes, taken from https://github.com/alexkaratarakis/gitattributes/blob/710900479a2bedeec7003d381719521ffbb18bf8/Python.gitattributes
# Source files
# ============
*.pxd    text diff=python
*.py     text diff=python
*.py3    text diff=python
*.pyw    text diff=python
*.pyx    text diff=python
*.pyz    text diff=python
*.pyi    text diff=python

# Binary files
# ============
*.db     binary
*.p      binary
*.pkl    binary
*.pickle binary
*.pyc    binary export-ignore
*.pyo    binary export-ignore
*.pyd    binary

# Jupyter notebook
*.ipynb  text eol=lf


================================================
FILE: .github/ISSUE_TEMPLATE/1-usage.yaml
================================================
name: Usage issues
description: Report issues in usage.
title: "[Usage] "
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this form.  Please give as detailed description as possible for us to better assist with the issue :)
  - type: textarea
    id: what-happened
    attributes:
      label: Describe the issue
      description: Please give as detailed description as possible for us to better assist with the issue.  Please paste the **FULL** error log here, so that we can better understand the issue. Wrap the log with ``` for better readability in GitHub.
      placeholder: Issue
      value: |
        Issue:
        
        Command:
        ```
        PASTE THE COMMANDS HERE.
        ```
        
        Log: 
        ```
        PASTE THE LOGS HERE.
        ```
        
        Screenshots:
        You may attach screenshots if it better explains the issue.
    validations:
      required: true


================================================
FILE: .github/ISSUE_TEMPLATE/2-feature-request.yaml
================================================
name: Feature Request
description: Request for a new feature
title: "[Feature request] "
body:
  - type: markdown
    attributes:
      value: |
        Thanks for your interest in our work.  Please share your thoughts of the new features below.
  - type: textarea
    id: feature
    attributes:
      label: feature
      placeholder: Start your thoughts here...

================================================
FILE: .github/ISSUE_TEMPLATE/3-question.yaml
================================================
name: Questions
description: General questions about the work
title: "[Question] "
body:
  - type: markdown
    attributes:
      value: |
        Thanks for your interest in our work.  For this type of question, it may be more suitable to go to [discussion](https://github.com/haotian-liu/LLaVA/discussions) sections.  If you believe an issue would be better for your request, please continue your post below :)
  - type: textarea
    id: question
    attributes:
      label: Question
      placeholder: Start question here...

================================================
FILE: .github/ISSUE_TEMPLATE/4-discussion.yaml
================================================
name: Discussions
description: General discussions about the work
title: "[Discussion] "
body:
  - type: markdown
    attributes:
      value: |
        Thanks for your interest in our work.  For this type of question, it may be more suitable to go to [discussion](https://github.com/haotian-liu/LLaVA/discussions) sections.  If you believe an issue would be better for your request, please continue your post below :)
  - type: textarea
    id: discussion
    attributes:
      label: Discussion
      placeholder: Start discussion here...

================================================
FILE: .gitignore
================================================
# Python
__pycache__
*.pyc
*.egg-info
dist

# Log
*.log
*.log.*
*.json
*.jsonl

# Data
!**/alpaca-data-conversation.json

# Editor
.idea
*.swp

# Other
.DS_Store
wandb
output

checkpoints
ckpts*

.ipynb_checkpoints
*.ipynb

# DevContainer
!.devcontainer/*

# Demo
serve_images/


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# 🌋 LLaVA: Large Language and Vision Assistant

*Visual instruction tuning towards large language and vision models with GPT-4 level capabilities.*

[📢 [LLaVA-NeXT Blog](https://llava-vl.github.io/blog/2024-01-30-llava-next/)] [[Project Page](https://llava-vl.github.io/)] [[Demo](https://llava.hliu.cc/)]  [[Data](https://github.com/haotian-liu/LLaVA/blob/main/docs/Data.md)] [[Model Zoo](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)]

🤝Community Contributions: [[llama.cpp](https://github.com/ggerganov/llama.cpp/pull/3436)] [[Colab](https://github.com/camenduru/LLaVA-colab)] [[🤗Space](https://huggingface.co/spaces/badayvedat/LLaVA)] [[Replicate](https://replicate.com/yorickvp/llava-13b)] [[AutoGen](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_lmm_llava.ipynb)]  [[BakLLaVA](https://github.com/SkunkworksAI/BakLLaVA)]

**Improved Baselines with Visual Instruction Tuning** [[Paper](https://arxiv.org/abs/2310.03744)] [[HF](https://huggingface.co/papers/2310.03744)] <br>
[Haotian Liu](https://hliu.cc), [Chunyuan Li](https://chunyuan.li/), [Yuheng Li](https://yuheng-li.github.io/), [Yong Jae Lee](https://pages.cs.wisc.edu/~yongjaelee/)

**Visual Instruction Tuning** (NeurIPS 2023, **Oral**) [[Paper](https://arxiv.org/abs/2304.08485)] [[HF](https://huggingface.co/papers/2304.08485)] <br>
[Haotian Liu*](https://hliu.cc), [Chunyuan Li*](https://chunyuan.li/), [Qingyang Wu](https://scholar.google.ca/citations?user=HDiw-TsAAAAJ&hl=en/), [Yong Jae Lee](https://pages.cs.wisc.edu/~yongjaelee/) (*Equal Contribution)

<!--p align="center">
    <a href="https://llava.hliu.cc/"><img src="images/llava_logo.png" width="50%"></a> <br>
    Generated by <a href="https://gligen.github.io/">GLIGEN</a> via "a cute lava llama with glasses" and box prompt
</p-->


## Release

- [2024/05/10] 🔥 **LLaVA-NeXT** (Stronger) models are released, stronger LMM with support of LLama-3 (8B) and Qwen-1.5 (72B/110B). [[Blog](https://llava-vl.github.io/blog/2024-05-10-llava-next-stronger-llms/)] [[Checkpoints](https://huggingface.co/collections/lmms-lab/llava-next-6623288e2d61edba3ddbf5ff)] [[Demo](https://llava-next.lmms-lab.com/)] [[Code](https://github.com/LLaVA-VL/LLaVA-NeXT/)] 
- [2024/05/10] 🔥 **LLaVA-NeXT** (Video) is released. The image-only-trained LLaVA-NeXT model is surprisingly strong on video tasks with zero-shot modality transfer. DPO training with AI feedback on videos can yield significant improvement. [[Blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)] [[Checkpoints](https://huggingface.co/collections/lmms-lab/llava-next-video-661e86f5e8dabc3ff793c944)] [[Code](https://github.com/LLaVA-VL/LLaVA-NeXT/)]
- [03/10] Releasing **LMMs-Eval**, a highly efficient evaluation pipeline we used when developing LLaVA-NeXT. It supports the evaluation of LMMs on dozens of public datasets and allows new dataset onboarding, making the dev of new LMMs much faster. [[Blog](https://lmms-lab.github.io/lmms-eval-blog/lmms-eval-0.1/)] [[Codebase](https://github.com/EvolvingLMMs-Lab/lmms-eval)]
- [1/30] 🔥 **LLaVA-NeXT** (LLaVA-1.6) is out! With additional scaling to LLaVA-1.5, LLaVA-NeXT-34B outperforms Gemini Pro on some benchmarks. It can now process 4x more pixels and perform more tasks/applications than before. Check out the [blog post](https://llava-vl.github.io/blog/2024-01-30-llava-next/), and explore the [demo](https://llava.hliu.cc/)! Models are available in [Model Zoo](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md). Training/eval data and scripts coming soon.
- [11/10] [LLaVA-Plus](https://llava-vl.github.io/llava-plus/) is released: Learning to Use Tools for Creating Multimodal Agents, with LLaVA-Plus (LLaVA that Plug and Learn to Use Skills). [[Project Page](https://llava-vl.github.io/llava-plus/)] [[Demo](https://llavaplus.ngrok.io/)] [[Code](https://github.com/LLaVA-VL/LLaVA-Plus-Codebase)] [[Paper](https://arxiv.org/abs/2311.05437)]
- [11/2] [LLaVA-Interactive](https://llava-vl.github.io/llava-interactive/) is released: Experience the future of human-AI multimodal interaction with an all-in-one demo for Image Chat, Segmentation, Generation and Editing. [[Project Page](https://llava-vl.github.io/llava-interactive/)] [[Demo](https://llavainteractive.ngrok.io/)] [[Code](https://github.com/LLaVA-VL/LLaVA-Interactive-Demo)] [[Paper](https://arxiv.org/abs/2311.00571)]
- [10/26] 🔥 LLaVA-1.5 with LoRA achieves comparable performance as full-model finetuning, with a reduced GPU RAM requirement ([ckpts](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md#llava-v15), [script](https://github.com/haotian-liu/LLaVA#train)). We also provide a [doc](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md) on how to finetune LLaVA-1.5 on your own dataset with LoRA.
- [10/12] Check out the Korean LLaVA (Ko-LLaVA), created by ETRI, who has generously supported our research! [[🤗 Demo](https://huggingface.co/spaces/etri-vilab/Ko-LLaVA)]
- [10/5] 🔥 LLaVA-1.5 is out! Achieving SoTA on 11 benchmarks, with just simple modifications to the original LLaVA, utilizes all public data, completes training in ~1 day on a single 8-A100 node, and surpasses methods like Qwen-VL-Chat that use billion-scale data. Check out the [technical report](https://arxiv.org/abs/2310.03744), and explore the [demo](https://llava.hliu.cc/)! Models are available in [Model Zoo](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md). The training data and scripts of LLaVA-1.5 are released [here](https://github.com/haotian-liu/LLaVA#train), and evaluation scripts are released [here](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md)!
- [9/26] LLaVA is improved with reinforcement learning from human feedback (RLHF) to improve fact grounding and reduce hallucination. Check out the new SFT and RLHF checkpoints at project [[LLavA-RLHF]](https://llava-rlhf.github.io/)
- [9/22] [LLaVA](https://arxiv.org/abs/2304.08485) is accepted by NeurIPS 2023 as **oral presentation**, and [LLaVA-Med](https://arxiv.org/abs/2306.00890) is accepted by NeurIPS 2023 Datasets and Benchmarks Track as **spotlight presentation**.

<details>
<summary>More</summary>

- [11/6] Support **Intel** dGPU and CPU platforms. [More details here.](https://github.com/haotian-liu/LLaVA/tree/intel/docs/intel)
- [10/12] LLaVA is now supported in [llama.cpp](https://github.com/ggerganov/llama.cpp/pull/3436) with 4-bit / 5-bit quantization support!
- [10/11] The training data and scripts of LLaVA-1.5 are released [here](https://github.com/haotian-liu/LLaVA#train), and evaluation scripts are released [here](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md)!
- [10/10] [Roboflow Deep Dive](https://blog.roboflow.com/first-impressions-with-llava-1-5/): First Impressions with LLaVA-1.5.
- [9/20] We summarize our empirical study of training 33B and 65B LLaVA models in a [note](https://arxiv.org/abs/2309.09958). Further, if you are interested in the comprehensive review, evolution and trend of multimodal foundation models, please check out our recent survey paper [``Multimodal Foundation Models: From Specialists to General-Purpose Assistants''.](https://arxiv.org/abs/2309.10020)
<p align="center">
  <img src="https://github.com/Computer-Vision-in-the-Wild/CVinW_Readings/blob/main/images/mfm_evolution.jpeg?raw=true" width=50%/>
</p>

- [7/19] 🔥 We release a major upgrade, including support for LLaMA-2, LoRA training, 4-/8-bit inference, higher resolution (336x336), and a lot more. We release [LLaVA Bench](https://github.com/haotian-liu/LLaVA/blob/main/docs/LLaVA_Bench.md) for benchmarking open-ended visual chat with results from Bard and Bing-Chat. We also support and verify training with RTX 3090 and RTX A6000. Check out [LLaVA-from-LLaMA-2](https://github.com/haotian-liu/LLaVA/blob/main/docs/LLaVA_from_LLaMA2.md), and our [model zoo](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)!
- [6/26] [CVPR 2023 Tutorial](https://vlp-tutorial.github.io/) on **Large Multimodal Models: Towards Building and Surpassing Multimodal GPT-4**!  Please check out [[Slides](https://datarelease.blob.core.windows.net/tutorial/vision_foundation_models_2023/slides/Chunyuan_cvpr2023_tutorial_lmm.pdf)] [[Notes](https://arxiv.org/abs/2306.14895)] [[YouTube](https://youtu.be/mkI7EPD1vp8)] [[Bilibli](https://www.bilibili.com/video/BV1Ng4y1T7v3/)].
- [6/11] We released the preview for the most requested feature: DeepSpeed and LoRA support!  Please see documentations [here](./docs/LoRA.md).
- [6/1] We released **LLaVA-Med: Large Language and Vision Assistant for Biomedicine**, a step towards building biomedical domain large language and vision models with GPT-4 level capabilities.  Checkout the [paper](https://arxiv.org/abs/2306.00890) and [page](https://github.com/microsoft/LLaVA-Med).
- [5/6] We are releasing [LLaVA-Lighting-MPT-7B-preview](https://huggingface.co/liuhaotian/LLaVA-Lightning-MPT-7B-preview), based on MPT-7B-Chat!  See [here](#LLaVA-MPT-7b) for more details.
- [5/2] 🔥 We are releasing LLaVA-Lighting!  Train a lite, multimodal GPT-4 with just $40 in 3 hours!  See [here](#train-llava-lightning) for more details.
- [4/27] Thanks to the community effort, LLaVA-13B with 4-bit quantization allows you to run on a GPU with as few as 12GB VRAM!  Try it out [here](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/llava).
- [4/17] 🔥 We released **LLaVA: Large Language and Vision Assistant**. We propose visual instruction tuning, towards building large language and vision models with GPT-4 level capabilities.  Checkout the [paper](https://arxiv.org/abs/2304.08485) and [demo](https://llava.hliu.cc/).

</details>

<!-- <a href="https://llava.hliu.cc/"><img src="assets/demo.gif" width="70%"></a> -->

[![Code License](https://img.shields.io/badge/Code%20License-Apache_2.0-green.svg)](https://github.com/tatsu-lab/stanford_alpaca/blob/main/LICENSE)
**Usage and License Notices**: This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses, including but not limited to the [OpenAI Terms of Use](https://openai.com/policies/terms-of-use) for the dataset and the specific licenses for base language models for checkpoints trained using the dataset (e.g. [Llama community license](https://ai.meta.com/llama/license/) for LLaMA-2 and Vicuna-v1.5). This project does not impose any additional constraints beyond those stipulated in the original licenses. Furthermore, users are reminded to ensure that their use of the dataset and checkpoints is in compliance with all applicable laws and regulations.


## Contents
- [Install](#install)
- [LLaVA Weights](#llava-weights)
- [Demo](#Demo)
- [Model Zoo](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)
- [Dataset](https://github.com/haotian-liu/LLaVA/blob/main/docs/Data.md)
- [Train](#train)
- [Evaluation](#evaluation)

## Install

If you are not using Linux, do *NOT* proceed, see instructions for [macOS](https://github.com/haotian-liu/LLaVA/blob/main/docs/macOS.md) and [Windows](https://github.com/haotian-liu/LLaVA/blob/main/docs/Windows.md).

1. Clone this repository and navigate to LLaVA folder
```bash
git clone https://github.com/haotian-liu/LLaVA.git
cd LLaVA
```

2. Install Package
```Shell
conda create -n llava python=3.10 -y
conda activate llava
pip install --upgrade pip  # enable PEP 660 support
pip install -e .
```

3. Install additional packages for training cases
```
pip install -e ".[train]"
pip install flash-attn --no-build-isolation
```

### Upgrade to latest code base

```Shell
git pull
pip install -e .

# if you see some import errors when you upgrade,
# please try running the command below (without #)
# pip install flash-attn --no-build-isolation --no-cache-dir
```

### Quick Start With HuggingFace

<details>
<summary>Example Code</summary>

```Python
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from llava.eval.run_llava import eval_model

model_path = "liuhaotian/llava-v1.5-7b"

tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,
    model_name=get_model_name_from_path(model_path)
)
```

Check out the details wth the `load_pretrained_model` function in `llava/model/builder.py`.

You can also use the `eval_model` function in `llava/eval/run_llava.py` to get the output easily. By doing so, you can use this code on Colab directly after downloading this repository.

``` python
model_path = "liuhaotian/llava-v1.5-7b"
prompt = "What are the things I should be cautious about when I visit here?"
image_file = "https://llava-vl.github.io/static/images/view.jpg"

args = type('Args', (), {
    "model_path": model_path,
    "model_base": None,
    "model_name": get_model_name_from_path(model_path),
    "query": prompt,
    "conv_mode": None,
    "image_file": image_file,
    "sep": ",",
    "temperature": 0,
    "top_p": None,
    "num_beams": 1,
    "max_new_tokens": 512
})()

eval_model(args)
```
</details>

## LLaVA Weights
Please check out our [Model Zoo](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md) for all public LLaVA checkpoints, and the instructions of how to use the weights.

## Demo

### Gradio Web UI

To launch a Gradio demo locally, please run the following commands one by one. If you plan to launch multiple model workers to compare between different checkpoints, you only need to launch the controller and the web server *ONCE*.

```mermaid
flowchart BT
    %% Declare Nodes
    gws("Gradio (UI Server)")
    c("Controller (API Server):<br/>PORT: 10000")
    mw7b("Model Worker:<br/>llava-v1.5-7b<br/>PORT: 40000")
    mw13b("Model Worker:<br/>llava-v1.5-13b<br/>PORT: 40001")
    sglw13b("SGLang Backend:<br/>llava-v1.6-34b<br/>http://localhost:30000")
    lsglw13b("SGLang Worker:<br/>llava-v1.6-34b<br/>PORT: 40002")

    %% Declare Styles
    classDef data fill:#3af,stroke:#48a,stroke-width:2px,color:#444
    classDef success fill:#8f8,stroke:#0a0,stroke-width:2px,color:#444
    classDef failure fill:#f88,stroke:#f00,stroke-width:2px,color:#444

    %% Assign Styles
    class id,od data;
    class cimg,cs_s,scsim_s success;
    class ncimg,cs_f,scsim_f failure;

    subgraph Demo Connections
        direction BT
        c<-->gws
        
        mw7b<-->c
        mw13b<-->c
        lsglw13b<-->c
        sglw13b<-->lsglw13b
    end
```

#### Launch a controller
```Shell
python -m llava.serve.controller --host 0.0.0.0 --port 10000
```

#### Launch a gradio web server.
```Shell
python -m llava.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload
```
You just launched the Gradio web interface. Now, you can open the web interface with the URL printed on the screen. You may notice that there is no model in the model list. Do not worry, as we have not launched any model worker yet. It will be automatically updated when you launch a model worker.

#### Launch a SGLang worker

This is the recommended way to serve LLaVA model with high throughput, and you need to install SGLang first. Note that currently `4-bit` quantization is not supported yet on SGLang-LLaVA, and if you have limited GPU VRAM, please check out model worker with [quantization](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#launch-a-model-worker-4-bit-8-bit-inference-quantized).

```Shell
pip install "sglang[all]"
```

You'll first launch a SGLang backend worker which will execute the models on GPUs. Remember the `--port` you've set and you'll use that later.

```Shell
# Single GPU
CUDA_VISIBLE_DEVICES=0 python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000

# Multiple GPUs with tensor parallel
CUDA_VISIBLE_DEVICES=0,1 python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-13b --tokenizer-path llava-hf/llava-1.5-13b-hf --port 30000 --tp 2
```

Tokenizers (temporary): `llava-hf/llava-1.5-7b-hf`, `llava-hf/llava-1.5-13b-hf`, `liuhaotian/llava-v1.6-34b-tokenizer`.

You'll then launch a LLaVA-SGLang worker that will communicate between LLaVA controller and SGLang backend to route the requests. Set `--sgl-endpoint` to `http://127.0.0.1:port` where `port` is the one you just set (default: 30000).

```Shell
python -m llava.serve.sglang_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --sgl-endpoint http://127.0.0.1:30000
```

#### Launch a model worker

This is the actual *worker* that performs the inference on the GPU.  Each worker is responsible for a single model specified in `--model-path`.

```Shell
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path liuhaotian/llava-v1.5-13b
```
Wait until the process finishes loading the model and you see "Uvicorn running on ...".  Now, refresh your Gradio web UI, and you will see the model you just launched in the model list.

You can launch as many workers as you want, and compare between different model checkpoints in the same Gradio interface. Please keep the `--controller` the same, and modify the `--port` and `--worker` to a different port number for each worker.
```Shell
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port <different from 40000, say 40001> --worker http://localhost:<change accordingly, i.e. 40001> --model-path <ckpt2>
```

If you are using an Apple device with an M1 or M2 chip, you can specify the mps device by using the `--device` flag: `--device mps`.

#### Launch a model worker (Multiple GPUs, when GPU VRAM <= 24GB)

If the VRAM of your GPU is less than 24GB (e.g., RTX 3090, RTX 4090, etc.), you may try running it with multiple GPUs. Our latest code base will automatically try to use multiple GPUs if you have more than one GPU. You can specify which GPUs to use with `CUDA_VISIBLE_DEVICES`. Below is an example of running with the first two GPUs.

```Shell
CUDA_VISIBLE_DEVICES=0,1 python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path liuhaotian/llava-v1.5-13b
```

#### Launch a model worker (4-bit, 8-bit inference, quantized)

You can launch the model worker with quantized bits (4-bit, 8-bit), which allows you to run the inference with reduced GPU memory footprint, potentially allowing you to run on a GPU with as few as 12GB VRAM. Note that inference with quantized bits may not be as accurate as the full-precision model. Simply append `--load-4bit` or `--load-8bit` to the **model worker** command that you are executing. Below is an example of running with 4-bit quantization.

```Shell
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path liuhaotian/llava-v1.5-13b --load-4bit
```

#### Launch a model worker (LoRA weights, unmerged)

You can launch the model worker with LoRA weights, without merging them with the base checkpoint, to save disk space. There will be additional loading time, while the inference speed is the same as the merged checkpoints. Unmerged LoRA checkpoints do not have `lora-merge` in the model name, and are usually much smaller (less than 1GB) than the merged checkpoints (13G for 7B, and 25G for 13B).

To load unmerged LoRA weights, you simply need to pass an additional argument `--model-base`, which is the base LLM that is used to train the LoRA weights. You can check the base LLM of each LoRA weights in the [model zoo](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md).

```Shell
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path liuhaotian/llava-v1-0719-336px-lora-vicuna-13b-v1.3 --model-base lmsys/vicuna-13b-v1.3
```

### CLI Inference

Chat about images using LLaVA without the need of Gradio interface. It also supports multiple GPUs, 4-bit and 8-bit quantized inference. With 4-bit quantization, for our LLaVA-1.5-7B, it uses less than 8GB VRAM on a single GPU.

```Shell
python -m llava.serve.cli \
    --model-path liuhaotian/llava-v1.5-7b \
    --image-file "https://llava-vl.github.io/static/images/view.jpg" \
    --load-4bit
```

<img src="images/demo_cli.gif" width="70%">

## Train

*Below is the latest training configuration for LLaVA v1.5. For legacy models, please refer to README of [this](https://github.com/haotian-liu/LLaVA/tree/v1.0.1) version for now. We'll add them in a separate doc later.*

LLaVA training consists of two stages: (1) feature alignment stage: use our 558K subset of the LAION-CC-SBU dataset to connect a *frozen pretrained* vision encoder to a *frozen LLM*; (2) visual instruction tuning stage: use 150K GPT-generated multimodal instruction-following data, plus around 515K VQA data from academic-oriented tasks, to teach the model to follow multimodal instructions.

LLaVA is trained on 8 A100 GPUs with 80GB memory. To train on fewer GPUs, you can reduce the `per_device_train_batch_size` and increase the `gradient_accumulation_steps` accordingly. Always keep the global batch size the same: `per_device_train_batch_size` x `gradient_accumulation_steps` x `num_gpus`.

### Hyperparameters
We use a similar set of hyperparameters as Vicuna in finetuning.  Both hyperparameters used in pretraining and finetuning are provided below.

1. Pretraining

| Hyperparameter | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
| --- | ---: | ---: | ---: | ---: | ---: |
| LLaVA-v1.5-13B | 256 | 1e-3 | 1 | 2048 | 0 |

2. Finetuning

| Hyperparameter | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
| --- | ---: | ---: | ---: | ---: | ---: |
| LLaVA-v1.5-13B | 128 | 2e-5 | 1 | 2048 | 0 |

### Download Vicuna checkpoints (automatically)

Our base model Vicuna v1.5, which is an instruction-tuned chatbot, will be downloaded automatically when you run our provided training scripts. No action is needed.

### Pretrain (feature alignment)

Please download the 558K subset of the LAION-CC-SBU dataset with BLIP captions we use in the paper [here](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain).

Pretrain takes around 5.5 hours for LLaVA-v1.5-13B on 8x A100 (80G), due to the increased resolution to 336px. It takes around 3.5 hours for LLaVA-v1.5-7B.

Training script with DeepSpeed ZeRO-2: [`pretrain.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/v1_5/pretrain.sh).

- `--mm_projector_type mlp2x_gelu`: the two-layer MLP vision-language connector.
- `--vision_tower openai/clip-vit-large-patch14-336`: CLIP ViT-L/14 336px.

<details>
<summary>Pretrain takes around 20 hours for LLaVA-7B on 8x V100 (32G)</summary>

 We provide training script with DeepSpeed [here](https://github.com/haotian-liu/LLaVA/blob/main/scripts/pretrain_xformers.sh).
Tips:
- If you are using V100 which is not supported by FlashAttention, you can use the [memory-efficient attention](https://arxiv.org/abs/2112.05682) implemented in [xFormers](https://github.com/facebookresearch/xformers). Install xformers and replace `llava/train/train_mem.py` above with [llava/train/train_xformers.py](llava/train/train_xformers.py).
</details>

### Visual Instruction Tuning

1. Prepare data

Please download the annotation of the final mixture our instruction tuning data [llava_v1_5_mix665k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json), and download the images from constituting datasets:

- COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip)
- GQA: [images](https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip)
- OCR-VQA: [download script](https://drive.google.com/drive/folders/1_GYPY5UkUy7HIcR0zq3ZCFgeZN7BAfm_?usp=sharing), **we save all files as `.jpg`**
- TextVQA: [train_val_images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip)
- VisualGenome: [part1](https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip), [part2](https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip)

After downloading all of them, organize the data as follows in `./playground/data`,

```
├── coco
│   └── train2017
├── gqa
│   └── images
├── ocr_vqa
│   └── images
├── textvqa
│   └── train_images
└── vg
    ├── VG_100K
    └── VG_100K_2
```

2. Start training!

You may download our pretrained projectors in [Model Zoo](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md). It is not recommended to use legacy projectors, as they may be trained with a different version of the codebase, and if any option is off, the model will not function/train as we expected.

Visual instruction tuning takes around 20 hours for LLaVA-v1.5-13B on 8x A100 (80G), due to the increased resolution to 336px. It takes around 10 hours for LLaVA-v1.5-7B on 8x A100 (40G).

Training script with DeepSpeed ZeRO-3: [`finetune.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/v1_5/finetune.sh).

If you are do not have enough GPU memory:

- Use LoRA: [`finetune_lora.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/v1_5/finetune_lora.sh). We are able to fit 13B training in 8-A100-40G/8-A6000, and 7B training in 8-RTX3090. Make sure `per_device_train_batch_size*gradient_accumulation_steps` is the same as the provided script for best reproducibility.
- Replace `zero3.json` with `zero3_offload.json` which offloads some parameters to CPU RAM. This slows down the training speed.

If you are interested in finetuning LLaVA model to your own task/data, please check out [`Finetune_Custom_Data.md`](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md)。

New options to note:

- `--mm_projector_type mlp2x_gelu`: the two-layer MLP vision-language connector.
- `--vision_tower openai/clip-vit-large-patch14-336`: CLIP ViT-L/14 336px.
- `--image_aspect_ratio pad`: this pads the non-square images to square, instead of cropping them; it slightly reduces hallucination.
- `--group_by_modality_length True`: this should only be used when your instruction tuning dataset contains both language (e.g. ShareGPT) and multimodal (e.g. LLaVA-Instruct). It makes the training sampler only sample a single modality (either image or language) during training, which we observe to speed up training by ~25%, and does not affect the final outcome.

## Evaluation

In LLaVA-1.5, we evaluate models on a diverse set of 12 benchmarks. To ensure the reproducibility, we evaluate the models with greedy decoding. We do not evaluate using beam search to make the inference process consistent with the chat demo of real-time outputs.

See [Evaluation.md](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md).

### GPT-assisted Evaluation

Our GPT-assisted evaluation pipeline for multimodal modeling is provided for a comprehensive understanding of the capabilities of vision-language models.  Please see our paper for more details.

1. Generate LLaVA responses

```Shell
python model_vqa.py \
    --model-path ./checkpoints/LLaVA-13B-v0 \
    --question-file \
    playground/data/coco2014_val_qa_eval/qa90_questions.jsonl \
    --image-folder \
    /path/to/coco2014_val \
    --answers-file \
    /path/to/answer-file-our.jsonl
```

2. Evaluate the generated responses.  In our case, [`answer-file-ref.jsonl`](./playground/data/coco2014_val_qa_eval/qa90_gpt4_answer.jsonl) is the response generated by text-only GPT-4 (0314), with the context captions/boxes provided.

```Shell
OPENAI_API_KEY="sk-***********************************" python llava/eval/eval_gpt_review_visual.py \
    --question playground/data/coco2014_val_qa_eval/qa90_questions.jsonl \
    --context llava/eval/table/caps_boxes_coco2014_val_80.jsonl \
    --answer-list \
    /path/to/answer-file-ref.jsonl \
    /path/to/answer-file-our.jsonl \
    --rule llava/eval/table/rule.json \
    --output /path/to/review.json
```

3. Summarize the evaluation results

```Shell
python summarize_gpt_review.py
```

## Citation

If you find LLaVA useful for your research and applications, please cite using this BibTeX:
```bibtex
@misc{liu2024llavanext,
    title={LLaVA-NeXT: Improved reasoning, OCR, and world knowledge},
    url={https://llava-vl.github.io/blog/2024-01-30-llava-next/},
    author={Liu, Haotian and Li, Chunyuan and Li, Yuheng and Li, Bo and Zhang, Yuanhan and Shen, Sheng and Lee, Yong Jae},
    month={January},
    year={2024}
}

@misc{liu2023improvedllava,
      title={Improved Baselines with Visual Instruction Tuning}, 
      author={Liu, Haotian and Li, Chunyuan and Li, Yuheng and Lee, Yong Jae},
      publisher={arXiv:2310.03744},
      year={2023},
}

@misc{liu2023llava,
      title={Visual Instruction Tuning}, 
      author={Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae},
      publisher={NeurIPS},
      year={2023},
}
```

## Acknowledgement

- [Vicuna](https://github.com/lm-sys/FastChat): the codebase we built upon, and our base model Vicuna-13B that has the amazing language capabilities!

## Related Projects

- [Instruction Tuning with GPT-4](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
- [LLaVA-Med: Training a Large Language-and-Vision Assistant for Biomedicine in One Day](https://github.com/microsoft/LLaVA-Med)
- [Otter: In-Context Multi-Modal Instruction Tuning](https://github.com/Luodian/Otter)

For future project ideas, please check out:
- [SEEM: Segment Everything Everywhere All at Once](https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once)
- [Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything) to detect, segment, and generate anything by marrying [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO) and [Segment-Anything](https://github.com/facebookresearch/segment-anything).


================================================
FILE: cog.yaml
================================================
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md

build:
  gpu: true

  python_version: "3.11"

  python_packages:
    - "torch==2.0.1"
    - "accelerate==0.21.0"
    - "bitsandbytes==0.41.0"
    - "deepspeed==0.9.5"
    - "einops-exts==0.0.4"
    - "einops==0.6.1"
    - "gradio==3.35.2"
    - "gradio_client==0.2.9"
    - "httpx==0.24.0"
    - "markdown2==2.4.10"
    - "numpy==1.26.0"
    - "peft==0.4.0"
    - "scikit-learn==1.2.2"
    - "sentencepiece==0.1.99"
    - "shortuuid==1.0.11"
    - "timm==0.6.13"
    - "tokenizers==0.13.3"
    - "torch==2.0.1"
    - "torchvision==0.15.2"
    - "transformers==4.31.0"
    - "wandb==0.15.12"
    - "wavedrom==2.0.3.post3"
    - "Pygments==2.16.1"
  run:
    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"


================================================
FILE: docs/Customize_Component.md
================================================
# Customize Components in LLaVA

This is an initial guide on how to replace the LLMs, visual encoders, etc. with your choice of components.

## LLM

It is quite simple to swap out LLaMA to any other LLMs.  You can refer to our implementation of [`llava_llama.py`](https://raw.githubusercontent.com/haotian-liu/LLaVA/main/llava/model/language_model/llava_llama.py) for an example of how to replace the LLM.

Although it may seem that it still needs ~100 lines of code, most of them are copied from the original `llama.py` from HF.  The only part that is different is to insert some lines for processing the multimodal inputs.

In `forward` function, you can see that we call `self.prepare_inputs_labels_for_multimodal` to process the multimodal inputs.  This function is defined in `LlavaMetaForCausalLM` and you just need to insert it into the `forward` function of your LLM.

In `prepare_inputs_for_generation` function, you can see that we add `images` to the `model_inputs`.  This is because we need to pass the images to the LLM during generation.

These are basically all the changes you need to make to replace the LLM.

## Visual Encoder

You can check out [`clip_encoder.py`](https://github.com/haotian-liu/LLaVA/blob/main/llava/model/multimodal_encoder/clip_encoder.py) on how we implement the CLIP visual encoder.



================================================
FILE: docs/Data.md
================================================
## Data

| Data file name | Size |
| --- | ---: |
| [llava_instruct_150k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_instruct_150k.json) | 229 MB |
| [llava_instruct_80k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_instruct_80k.json) | 229 MB |
| [conversation_58k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/conversation_58k.json) | 126 MB |
| [detail_23k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/detail_23k.json) | 20.5 MB |
| [complex_reasoning_77k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/complex_reasoning_77k.json) | 79.6 MB |

### Pretraining Dataset
The pretraining dataset used in this release is a subset of CC-3M dataset, filtered with a more balanced concept coverage distribution.  Please see [here](https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K) for a detailed description of the dataset structure and how to download the images.

If you already have CC-3M dataset on your disk, the image names follow this format: `GCC_train_000000000.jpg`.  You may edit the `image` field correspondingly if necessary.

| Data | Chat File | Meta Data | Size |
| --- |  --- |  --- | ---: |
| CC-3M Concept-balanced 595K | [chat.json](https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K/blob/main/chat.json) | [metadata.json](https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K/blob/main/metadata.json) | 211 MB
| LAION/CC/SBU BLIP-Caption Concept-balanced 558K | [blip_laion_cc_sbu_558k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/blob/main/blip_laion_cc_sbu_558k.json) | [metadata.json](#) | 181 MB

**Important notice**: Upon the request from the community, as ~15% images of the original CC-3M dataset are no longer accessible, we upload [`images.zip`](https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K/blob/main/images.zip) for better reproducing our work in research community. It must not be used for any other purposes. The use of these images must comply with the CC-3M license. This may be taken down at any time when requested by the original CC-3M dataset owner or owners of the referenced images.

### GPT-4 Prompts

We provide our prompts and few-shot samples for GPT-4 queries, to better facilitate research in this domain.  Please check out the [`prompts`](https://github.com/haotian-liu/LLaVA/tree/main/playground/data/prompts) folder for three kinds of questions: conversation, detail description, and complex reasoning.

They are organized in a format of `system_message.txt` for system message, pairs of `abc_caps.txt` for few-shot sample user input, and `abc_conv.txt` for few-shot sample reference output.

Note that you may find them in different format. For example, `conversation` is in `jsonl`, and detail description is answer-only.  The selected format in our preliminary experiments works slightly better than a limited set of alternatives that we tried: `jsonl`, more natural format, answer-only.  If interested, you may try other variants or conduct more careful study in this.  Contributions are welcomed!


================================================
FILE: docs/Evaluation.md
================================================
# Evaluation

In LLaVA-1.5, we evaluate models on a diverse set of 12 benchmarks. To ensure the reproducibility, we evaluate the models with greedy decoding. We do not evaluate using beam search to make the inference process consistent with the chat demo of real-time outputs.

Currently, we mostly utilize the official toolkit or server for the evaluation.

## Evaluate on Custom Datasets

You can evaluate LLaVA on your custom datasets by converting your dataset to LLaVA's jsonl format, and evaluate using [`model_vqa.py`](https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/model_vqa.py).

Below we provide a general guideline for evaluating datasets with some common formats.

1. Short-answer (e.g. VQAv2, MME).

```
<question>
Answer the question using a single word or phrase.
```

2. Option-only for multiple-choice (e.g. MMBench, SEED-Bench).

```
<question>
A. <option_1>
B. <option_2>
C. <option_3>
D. <option_4>
Answer with the option's letter from the given choices directly.
```

3. Natural QA (e.g. LLaVA-Bench, MM-Vet).

No postprocessing is needed.

## Scripts

Before preparing task-specific data, **you MUST first download [eval.zip](https://drive.google.com/file/d/1atZSBBrAX54yYpxtVVW33zFvcnaHeFPy/view?usp=sharing)**. It contains custom annotations, scripts, and the prediction files with LLaVA v1.5. Extract to `./playground/data/eval`. This also provides a general structure for all datasets.

### VQAv2

1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and put it under `./playground/data/eval/vqav2`.
2. Multi-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/vqav2.sh
```
3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `./playground/data/eval/vqav2/answers_upload`.

### GQA

1. Download the [data](https://cs.stanford.edu/people/dorarad/gqa/download.html) and [evaluation scripts](https://cs.stanford.edu/people/dorarad/gqa/evaluate.html) following the official instructions and put under `./playground/data/eval/gqa/data`. You may need to modify `eval.py` as [this](https://gist.github.com/haotian-liu/db6eddc2a984b4cbcc8a7f26fd523187) due to the missing assets in the GQA v1.2 release.
2. Multi-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/gqa.sh
```

### VisWiz

1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `./playground/data/eval/vizwiz`.
2. Single-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/vizwiz.sh
```
3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/2185/my-submission): `./playground/data/eval/vizwiz/answers_upload`.

### ScienceQA

1. Under `./playground/data/eval/scienceqa`, download `images`, `pid_splits.json`, `problems.json` from the `data/scienceqa` folder of the ScienceQA [repo](https://github.com/lupantech/ScienceQA).
2. Single-GPU inference and evaluate.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/sqa.sh
```

### TextVQA

1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and [images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and extract to `./playground/data/eval/textvqa`.
2. Single-GPU inference and evaluate.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/textvqa.sh
```

### POPE

1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put under `./playground/data/eval/pope`.
2. Single-GPU inference and evaluate.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/pope.sh
```

### MME

1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation).
2. Downloaded images to `MME_Benchmark_release_version`.
3. put the official `eval_tool` and `MME_Benchmark_release_version` under `./playground/data/eval/MME`.
4. Single-GPU inference and evaluate.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mme.sh
```

### MMBench

1. Download [`mmbench_dev_20230712.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put under `./playground/data/eval/mmbench`.
2. Single-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench.sh
```
3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712`.

### MMBench-CN

1. Download [`mmbench_dev_cn_20231003.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_cn_20231003.tsv) and put under `./playground/data/eval/mmbench`.
2. Single-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench_cn.sh
```
3. Submit the results to the evaluation server: `./playground/data/eval/mmbench/answers_upload/mmbench_dev_cn_20231003`.


### SEED-Bench

1. Following the official [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md) to download the images and the videos. Put images under `./playground/data/eval/seed_bench/SEED-Bench-image`.
2. Extract the video frame in the middle from the downloaded videos, and put them under `./playground/data/eval/seed_bench/SEED-Bench-video-image`. We provide our script `extract_video_frames.py` modified from the official one.
3. Multiple-GPU inference and evaluate.
```Shell
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/seed.sh
```
4. Optionally, submit the results to the leaderboard: `./playground/data/eval/seed_bench/answers_upload` using the official jupyter notebook.

### LLaVA-Bench-in-the-Wild

1. Extract contents of [`llava-bench-in-the-wild`](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to `./playground/data/eval/llava-bench-in-the-wild`.
2. Single-GPU inference and evaluate.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/llavabench.sh
```

### MM-Vet

1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `./playground/data/eval/mmvet`.
2. Single-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh
```
3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official jupyter notebook.

## More Benchmarks

Below are awesome benchmarks for multimodal understanding from the research community, that are not initially included in the LLaVA-1.5 release.

### Q-Bench

1. Download [`llvisionqa_dev.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_dev.json) (for `dev`-subset) and [`llvisionqa_test.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_test.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`. 
2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`.
3. Single-GPU inference (change `dev` to `test` for evaluation on test set).
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench.sh dev
```
4. Submit the results by instruction [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_dev_answers.jsonl`.

### Chinese-Q-Bench

1. Download [`质衡-问答-验证集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E9%AA%8C%E8%AF%81%E9%9B%86.json) (for `dev`-subset) and [`质衡-问答-测试集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E6%B5%8B%E8%AF%95%E9%9B%86.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`. 
2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`.
3. Single-GPU inference (change `dev` to `test` for evaluation on test set).
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench_zh.sh dev
```
4. Submit the results by instruction [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_zh_dev_answers.jsonl`.


================================================
FILE: docs/Finetune_Custom_Data.md
================================================
# Finetune LLaVA on Custom Datasets

## Dataset Format

Convert your data to a JSON file of a List of all samples. Sample metadata should contain `id` (a unique identifier), `image` (the path to the image), and `conversations` (the conversation data between human and AI).

A sample JSON for finetuning LLaVA for generating tag-style captions for Stable Diffusion:

```json
[
  {
    "id": "997bb945-628d-4724-b370-b84de974a19f",
    "image": "part-000001/997bb945-628d-4724-b370-b84de974a19f.jpg",
    "conversations": [
      {
        "from": "human",
        "value": "<image>\nWrite a prompt for Stable Diffusion to generate this image."
      },
      {
        "from": "gpt",
        "value": "a beautiful painting of chernobyl by nekro, pascal blanche, john harris, greg rutkowski, sin jong hun, moebius, simon stalenhag. in style of cg art. ray tracing. cel shading. hyper detailed. realistic. ue 5. maya. octane render. "
      },
    ]
  },
  ...
]
```

## Command

If you have a limited task-specific data, we recommend finetuning from LLaVA checkpoints with LoRA following this [script](https://github.com/haotian-liu/LLaVA/blob/main/scripts/v1_5/finetune_task_lora.sh).

If the amount of the task-specific data is sufficient, you can also finetune from LLaVA checkpoints with full-model finetuning following this [script](https://github.com/haotian-liu/LLaVA/blob/main/scripts/v1_5/finetune_task.sh).

You may need to adjust the hyperparameters to fit each specific dataset and your hardware constraint.




================================================
FILE: docs/Intel.md
================================================
# Intel Platforms 

* Support [Intel GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html)    
* Support [Intel CPU Sapphire Rapides](https://ark.intel.com/content/www/us/en/ark/products/codename/126212/products-formerly-sapphire-rapids.html)    
* Based on [Intel Extension for Pytorch](https://intel.github.io/intel-extension-for-pytorch)    

More details in  [**intel branch**](https://github.com/haotian-liu/LLaVA/tree/intel/docs/intel)


================================================
FILE: docs/LLaVA_Bench.md
================================================
# LLaVA-Bench [[Download](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild)]

**-Introduction-**  Large commercial multimodal chatbots have been released in this week, including 
- [Multimodal Bing-Chat by Microsoft](https://blogs.bing.com/search/july-2023/Bing-Chat-Enterprise-announced,-multimodal-Visual-Search-rolling-out-to-Bing-Chat) (July 18, 2023) 
- [Multimodal Bard by Google](https://bard.google.com/). 

These chatbots are presumably supported by proprietary large multimodal models (LMM). Compared with the open-source LMM such as LLaVA, proprietary LMM represent the scaling success upperbound of the current SoTA techniques. They share the goal of developing multimodal chatbots that follow human intents to complete various daily-life visual tasks in the wild. While it remains less explored how to evaluate multimodal chat ability, it provides useful feedback to study open-source LMMs against the commercial multimodal chatbots. In addition to the *LLaVA-Bench (COCO)* dataset we used to develop the early versions of LLaVA, we are releasing  [*LLaVA-Bench (In-the-Wild)*](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to the community for the public use.

## LLaVA-Bench (In-the-Wild *[Ongoing work]*)

To evaluate the model's capability in more challenging tasks and generalizability to novel domains, we collect a diverse set of 24 images with 60 questions in total, including indoor and outdoor scenes, memes, paintings, sketches, etc, and associate each image with a highly-detailed and manually-curated description and a proper selection of questions. Such design also assesses the model's robustness to different prompts. In this release, we also categorize questions into three categories: conversation (simple QA), detailed description, and complex reasoning. We continue to expand and improve the diversity of the LLaVA-Bench (In-the-Wild).  We manually query Bing-Chat and Bard to get the responses. 

### Results

The score is measured by comparing against a reference answer generated by text-only GPT-4. It is generated by feeding the question, along with the ground truth image annotations as the context. A text-only GPT-4 evaluator rates both answers. We query GPT-4 by putting the reference answer first, and then the answer generated by the candidate model. We upload images at their original resolution to Bard and Bing-Chat to obtain the results.

| Approach       | Conversation | Detail | Reasoning | Overall |
|----------------|--------------|--------|-----------|---------|
| Bard-0718      | 83.7         | 69.7   | 78.7      | 77.8    |
| Bing-Chat-0629 | 59.6         | 52.2   | 90.1      | 71.5    |
| LLaVA-13B-v1-336px-0719 (beam=1) | 64.3         | 55.9   | 81.7      | 70.1    |
| LLaVA-13B-v1-336px-0719 (beam=5) | 68.4         | 59.9   | 84.3      | 73.5    |

Note that Bard sometimes refuses to answer questions about images containing humans, and Bing-Chat blurs the human faces in the images. We also provide the benchmark score for the subset without humans.

| Approach       | Conversation | Detail | Reasoning | Overall |
|----------------|--------------|--------|-----------|---------|
| Bard-0718      | 94.9         | 74.3   | 84.3      | 84.6    |
| Bing-Chat-0629 | 55.8         | 53.6   | 93.5      | 72.6    |
| LLaVA-13B-v1-336px-0719 (beam=1) | 62.2         | 56.4   | 82.2      | 70.0    |
| LLaVA-13B-v1-336px-0719 (beam=5) | 65.6         | 61.7   | 85.0      | 73.6    |


================================================
FILE: docs/LLaVA_from_LLaMA2.md
================================================
# LLaVA (based on Llama 2 LLM, Preview)

*NOTE: This is a technical preview. We are still running hyperparameter search, and will release the final model soon.  If you'd like to contribute to this, please contact us.*

:llama: **-Introduction-** [Llama 2 is an open-source LLM released by Meta AI](https://about.fb.com/news/2023/07/llama-2/) today (July 18, 2023). Compared with its early version [Llama 1](https://ai.meta.com/blog/large-language-model-llama-meta-ai/), Llama 2 is more favored in ***stronger language performance***, ***longer context window***, and importantly ***commercially usable***! While Llama 2 is changing the LLM market landscape in the language space, its multimodal ability remains unknown. We quickly develop the LLaVA variant based on the latest Llama 2 checkpoints, and release it to the community for the public use.

You need to apply for and download the latest Llama 2 checkpoints to start your own training (apply [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/))


## Training

Please checkout [`pretrain.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/pretrain.sh), [`finetune.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune.sh), [`finetune_lora.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune_lora.sh).

## LLaVA (based on Llama 2), What is different? 

:volcano: How is the new LLaVA based on Llama 2 different from Llama 1? The comparisons of the training process are described:
- **Pre-training**. The pre-trained base LLM is changed from Llama 1 to Llama 2
- **Language instruction-tuning**. The previous LLaVA model starts with Vicuna, which is instruct tuned on ShareGPT data from Llama 1; The new LLaVA model starts with Llama 2 Chat, which is an instruct tuned checkpoint on dialogue data from Llama 2.
- **Multimodal instruction-tuning**. The same LLaVA-Lighting process is applied.


### Results

- Llama 2 is better at following the instructions of role playing; Llama 2 fails in following the instructions of translation
- The quantitative evaluation on [LLaVA-Bench](https://github.com/haotian-liu/LLaVA/blob/main/docs/LLaVA_Bench.md) demonstrates on-par performance between Llama 2 and Llama 1 in LLaVA's multimodal chat ability.


<img src="../images/llava_example_cmp.png" width="100%">



================================================
FILE: docs/LoRA.md
================================================
# LLaVA (LoRA, Preview)

NOTE: This is a technical preview, and is not yet ready for production use. We are still running hyperparameter search for the LoRA model, and will release the final model soon.  If you'd like to contribute to this, please contact us.

You need latest code base for LoRA support (instructions [here](https://github.com/haotian-liu/LLaVA#upgrade-to-latest-code-base))

## Demo (Web UI)

Please execute each of the commands below one by one (after the previous one has finished).  The commands are the same as launching other demos except for an additional `--model-base` flag to specify the base model to use. Please make sure the base model corresponds to the LoRA checkpoint that you are using.  For this technical preview, you need Vicuna v1.1 (7B) checkpoint (if you do not have that already, follow the instructions [here](https://github.com/lm-sys/FastChat#vicuna-weights)).

#### Launch a controller
```Shell
python -m llava.serve.controller --host 0.0.0.0 --port 10000
```

#### Launch a gradio web server.
```Shell
python -m llava.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload
```
You just launched the Gradio web interface. Now, you can open the web interface with the URL printed on the screen. You may notice that there is no model in the model list. Do not worry, as we have not launched any model worker yet. It will be automatically updated when you launch a model worker.

#### Launch a model worker
```Shell
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path liuhaotian/llava-vicuna-7b-v1.1-lcs_558k-instruct_80k_3e-lora-preview-alpha --model-base /path/to/vicuna-v1.1
```
Wait until the process finishes loading the model and you see "Uvicorn running on ...".  Now, refresh your Gradio web UI, and you will see the model you just launched in the model list.

You can launch as many workers as you want, and compare between different model checkpoints in the same Gradio interface. Please keep the `--controller` the same, and modify the `--port` and `--worker` to a different port number for each worker.


## Training

Please see sample training scripts for [LoRA](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune_lora.sh) and [QLoRA](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune_qlora.sh).

We provide sample DeepSpeed configs, [`zero3.json`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/zero3.json) is more like PyTorch FSDP, and [`zero3_offload.json`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/zero3_offload.json) can further save memory consumption by offloading parameters to CPU. `zero3.json` is usually faster than `zero3_offload.json` but requires more GPU memory, therefore, we recommend trying `zero3.json` first, and if you run out of GPU memory, try `zero3_offload.json`. You can also tweak the `per_device_train_batch_size` and `gradient_accumulation_steps` in the config to save memory, and just to make sure that `per_device_train_batch_size` and `gradient_accumulation_steps` remains the same.

If you are having issues with ZeRO-3 configs, and there are enough VRAM, you may try [`zero2.json`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/zero2.json). This consumes slightly more memory than ZeRO-3, and behaves more similar to PyTorch FSDP, while still supporting parameter-efficient tuning.

## Create Merged Checkpoints

```Shell
python scripts/merge_lora_weights.py \
    --model-path /path/to/lora_model \
    --model-base /path/to/base_model \
    --save-model-path /path/to/merge_model
```


================================================
FILE: docs/MODEL_ZOO.md
================================================
# Model Zoo

**To Use LLaVA-1.6 checkpoints, your llava package version must be newer than 1.2.0. [Instructions](https://github.com/haotian-liu/LLaVA#upgrade-to-latest-code-base) on how to upgrade.**

If you are interested in including any other details in Model Zoo, please open an issue :)

The model weights below are *merged* weights. You do not need to apply delta. The usage of LLaVA checkpoints should comply with the base LLM's model license.

## LLaVA-v1.6

| Version | LLM | Schedule | Checkpoint | MMMU | MathVista | VQAv2 | GQA | VizWiz | SQA | TextVQA | POPE | MME | MM-Bench | MM-Bench-CN | SEED-IMG | LLaVA-Bench-Wild | MM-Vet |
|----------|----------|-----------|-----------|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LLaVA-1.6 | Vicuna-7B | full_ft-1e | [liuhaotian/llava-v1.6-vicuna-7b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b) | 35.8 | 34.6 | 81.8 | 64.2 | 57.6 | 70.1 | 64.9 | 86.5 | 1519/332 | 67.4 | 60.6 | 70.2 | 81.6 | 43.9 |
| LLaVA-1.6 | Vicuna-13B | full_ft-1e | [liuhaotian/llava-v1.6-vicuna-13b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-13b) | 36.2 | 35.3 | 82.8 | 65.4 | 60.5 | 73.6 | 67.1 | 86.2 | 1575/326 | 70 | 64.4 | 71.9 | 87.3 | 48.4 |
| LLaVA-1.6 | Mistral-7B | full_ft-1e | [liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) | 35.3 | 37.7 | 82.2 | 64.8 | 60.0 | 72.8 | 65.7 | 86.7 | 1498/321 | 68.7 | 61.2 | 72.2 | 83.2 | 47.3 |
| LLaVA-1.6 | Hermes-Yi-34B | full_ft-1e | [liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b) | 51.1 | 46.5 | 83.7 | 67.1 | 63.8 | 81.8 | 69.5 | 87.7 | 1631/397 | 79.3 | 79 | 75.9 | 89.6 | 57.4 |

*LLaVA-1.6-34B outperforms Gemini Pro on benchmarks like MMMU and MathVista.*


## LLaVA-v1.5

| Version | Size | Schedule | Checkpoint | VQAv2 | GQA | VizWiz | SQA | TextVQA | POPE | MME | MM-Bench | MM-Bench-CN | SEED | LLaVA-Bench-Wild | MM-Vet |
|----------|----------|-----------|-----------|---|---|---|---|---|---|---|---|---|---|---|---|
| LLaVA-1.5 | 7B | full_ft-1e | [liuhaotian/llava-v1.5-7b](https://huggingface.co/liuhaotian/llava-v1.5-7b) | 78.5 | 62.0 | 50.0 | 66.8 | 58.2 | 85.9 | 1510.7 | 64.3 | 58.3 | 58.6 | 65.4 | 31.1 |
| LLaVA-1.5 | 13B | full_ft-1e | [liuhaotian/llava-v1.5-13b](https://huggingface.co/liuhaotian/llava-v1.5-13b) | 80.0 | 63.3 | 53.6 | 71.6 | 61.3 | 85.9 | 1531.3 | 67.7 | 63.6 | 61.6 | 72.5 | 36.1 |
| LLaVA-1.5 | 7B | lora-1e | [liuhaotian/llava-v1.5-7b-lora](https://huggingface.co/liuhaotian/llava-v1.5-7b-lora) | 79.1 | 63.0 | 47.8 | 68.4 | 58.2 | 86.4 | 1476.9 | 66.1 | 58.9 | 60.1 | 67.9 | 30.2 |
| LLaVA-1.5 | 13B | lora-1e | [liuhaotian/llava-v1.5-13b-lora](https://huggingface.co/liuhaotian/llava-v1.5-13b-lora) | 80.0 | 63.3 | 58.9 | 71.2 | 60.2 | 86.7 | 1541.7 | 68.5 | 61.5 | 61.3 | 69.5 | 38.3 |

Base model: Vicuna v1.5. Training logs: [wandb](https://api.wandb.ai/links/lht/6orh56wc).

<p align="center">
  <img src="../images/llava_v1_5_radar.jpg" width="500px"> <br>
  LLaVA-1.5 achieves SoTA performance across 11 benchmarks.
</p>


## LLaVA-v1

*Note: We recommend using the most capable LLaVA-v1.6 series above for the best performance.*

| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Finetuning Data | Finetuning schedule | LLaVA-Bench-Conv | LLaVA-Bench-Detail | LLaVA-Bench-Complex | LLaVA-Bench-Overall | Download |
|----------|----------------|---------------|----------------------|-----------------|--------------------|------------------|--------------------|---------------------|---------------------|---------------------|
| Vicuna-13B-v1.3 | CLIP-L-336px | LCS-558K | 1e | LLaVA-Instruct-80K | proj-1e, lora-1e | 64.3 | 55.9 | 81.7 | 70.1 | [LoRA](https://huggingface.co/liuhaotian/llava-v1-0719-336px-lora-vicuna-13b-v1.3) [LoRA-Merged](https://huggingface.co/liuhaotian/llava-v1-0719-336px-lora-merge-vicuna-13b-v1.3) |
| LLaMA-2-13B-Chat | CLIP-L | LCS-558K | 1e | LLaVA-Instruct-80K | full_ft-1e | 56.7 | 58.6 | 80.0 | 67.9 | [ckpt](https://huggingface.co/liuhaotian/llava-llama-2-13b-chat-lightning-preview) |
| LLaMA-2-7B-Chat | CLIP-L | LCS-558K | 1e | LLaVA-Instruct-80K | lora-1e | 51.2 | 58.9 | 71.6 | 62.8 | [LoRA](https://huggingface.co/liuhaotian/llava-llama-2-7b-chat-lightning-lora-preview) |


## Projector weights

These are projector weights we have pretrained. You can use these projector weights for visual instruction tuning. They are just pretrained on image-text pairs and are NOT instruction-tuned, which means they do NOT follow instructions as well as our official models and can output repetitive, lengthy, and garbled outputs. If you want to have nice conversations with LLaVA, use the checkpoints above (LLaVA v1.6).

NOTE: These projector weights are only compatible with `llava>=1.0.0`. Please check out the latest codebase if your local code version is below v1.0.0.

NOTE: When you use our pretrained projector for visual instruction tuning, it is very important to use the same base LLM and vision encoder as the one we used for pretraining the projector. Otherwise, the performance will be very poor.

When using these projector weights to instruction-tune your LMM, please make sure that these options are correctly set as follows,

```Shell
--mm_use_im_start_end False
--mm_use_im_patch_token False
```

| Base LLM | Vision Encoder | Projection | Pretrain Data | Pretraining schedule | Download |
|----------|----------------|---------------|----------------------|----------|----------|
| Vicuna-13B-v1.5 | CLIP-L-336px | MLP-2x | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-v1.5-mlp2x-336px-pretrain-vicuna-13b-v1.5) |
| Vicuna-7B-v1.5 | CLIP-L-336px | MLP-2x | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5) |
| LLaMA-2-13B-Chat | CLIP-L-336px | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-336px-pretrain-llama-2-13b-chat) |
| LLaMA-2-7B-Chat | CLIP-L-336px | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-336px-pretrain-llama-2-7b-chat) |
| LLaMA-2-13B-Chat | CLIP-L | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-pretrain-llama-2-13b-chat) |
| LLaMA-2-7B-Chat | CLIP-L | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-pretrain-llama-2-7b-chat) |
| Vicuna-13B-v1.3 | CLIP-L-336px | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-336px-pretrain-vicuna-13b-v1.3) |
| Vicuna-7B-v1.3 | CLIP-L-336px | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-336px-pretrain-vicuna-7b-v1.3) |
| Vicuna-13B-v1.3 | CLIP-L | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-pretrain-vicuna-13b-v1.3) |
| Vicuna-7B-v1.3 | CLIP-L | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-pretrain-vicuna-7b-v1.3) |


## Science QA Checkpoints

| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Finetuning Data | Finetuning schedule | Download |
|----------|----------------|---------------|----------------------|-----------------|--------------------|---------------------|
| Vicuna-13B-v1.3 | CLIP-L | LCS-558K | 1e | ScienceQA | full_ft-12e | [ckpt](https://huggingface.co/liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3) |


## Legacy Models (merged weights)

The model weights below are *merged* weights. You do not need to apply delta. The usage of LLaVA checkpoints should comply with the base LLM's model license.

| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Finetuning Data | Finetuning schedule | Download |
|----------|----------------|---------------|----------------------|-----------------|--------------------|------------------|
| MPT-7B-Chat | CLIP-L | LCS-558K | 1e | LLaVA-Instruct-80K | full_ft-1e | [preview](https://huggingface.co/liuhaotian/LLaVA-Lightning-MPT-7B-preview) |


## Legacy Models (delta weights)

The model weights below are *delta* weights. The usage of LLaVA checkpoints should comply with the base LLM's model license: [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md).

You can add our delta to the original LLaMA weights to obtain the LLaVA weights.

Instructions:

1. Get the original LLaMA weights in the huggingface format by following the instructions [here](https://huggingface.co/docs/transformers/main/model_doc/llama).
2. Use the following scripts to get LLaVA weights by applying our delta. It will automatically download delta weights from our Hugging Face account. In the script below, we use the delta weights of [`liuhaotian/LLaVA-7b-delta-v0`](https://huggingface.co/liuhaotian/LLaVA-7b-delta-v0) as an example. It can be adapted for other delta weights by changing the `--delta` argument (and base/target accordingly).

```bash
python3 -m llava.model.apply_delta \
    --base /path/to/llama-7b \
    --target /output/path/to/LLaVA-7B-v0 \
    --delta liuhaotian/LLaVA-7b-delta-v0
```

| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Finetuning Data | Finetuning schedule | Download |
|----------|----------------|---------------|----------------------|-----------------|--------------------|------------------|
| Vicuna-13B-v1.1 | CLIP-L | CC-595K | 1e | LLaVA-Instruct-158K | full_ft-3e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1) |
| Vicuna-7B-v1.1 | CLIP-L | LCS-558K | 1e | LLaVA-Instruct-80K | full_ft-1e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-Lightning-7B-delta-v1-1) |
| Vicuna-13B-v0 | CLIP-L | CC-595K | 1e | LLaVA-Instruct-158K | full_ft-3e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v0) |
| Vicuna-13B-v0 | CLIP-L | CC-595K | 1e | ScienceQA | full_ft-12e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v0-science_qa) |
| Vicuna-7B-v0 | CLIP-L | CC-595K | 1e | LLaVA-Instruct-158K | full_ft-3e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-7b-delta-v0) |



## Legacy Projector weights

The following projector weights are deprecated, and the support for them may be removed in the future. They do not support zero-shot inference. Please use the projector weights in the [table above](#projector-weights) if possible.

**NOTE**: When you use our pretrained projector for visual instruction tuning, it is very important to **use the same base LLM and vision encoder** as the one we used for pretraining the projector. Otherwise, the performance will be very bad.

When using these projector weights to instruction tune your LMM, please make sure that these options are correctly set as follows,

```Shell
--mm_use_im_start_end True
--mm_use_im_patch_token False
```

| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Download |
|----------|----------------|---------------|----------------------|----------|
| Vicuna-7B-v1.1 | CLIP-L | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/LLaVA-Pretrained-Projectors/blob/main/LLaVA-7b-pretrain-projector-v1-1-LCS-558K-blip_caption.bin) |
| Vicuna-13B-v0 | CLIP-L | CC-595K | 1e | [projector](https://huggingface.co/liuhaotian/LLaVA-Pretrained-Projectors/blob/main/LLaVA-13b-pretrain-projector-v0-CC3M-595K-original_caption.bin) |
| Vicuna-7B-v0 | CLIP-L | CC-595K | 1e | [projector](https://huggingface.co/liuhaotian/LLaVA-Pretrained-Projectors/blob/main/LLaVA-7b-pretrain-projector-v0-CC3M-595K-original_caption.bin) |

When using these projector weights to instruction tune your LMM, please make sure that these options are correctly set as follows,

```Shell
--mm_use_im_start_end False
--mm_use_im_patch_token False
```

| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Download |
|----------|----------------|---------------|----------------------|----------|
| Vicuna-13B-v0 | CLIP-L | CC-595K | 1e | [projector](https://huggingface.co/liuhaotian/LLaVA-Pretrained-Projectors/blob/main/LLaVA-13b-pretrain-projector-v0-CC3M-595K-original_caption-no_im_token.bin) |


================================================
FILE: docs/ScienceQA.md
================================================
### ScienceQA

#### Prepare Data
1. Please see ScienceQA [repo](https://github.com/lupantech/ScienceQA) for setting up the dataset.
2. Generate ScienceQA dataset for LLaVA conversation-style format.

```Shell
python scripts/convert_sqa_to_llava.py \
    convert_to_llava \
    --base-dir /path/to/ScienceQA/data/scienceqa \
    --prompt-format "QCM-LEA" \
    --split {train,val,minival,test,minitest}
```

#### Training

1. Pretraining

You can download our pretrained projector weights from our [Model Zoo](), or train your own projector weights using [`pretrain.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/pretrain.sh).

2. Finetuning

See [`finetune_sqa.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune_sqa.sh).

#### Evaluation

1. Multiple-GPU inference
You may evaluate this with multiple GPUs, and concatenate the generated jsonl files.  Please refer to our script for [batch evaluation](https://github.com/haotian-liu/LLaVA/blob/main/scripts/sqa_eval_batch.sh) and [results gathering](https://github.com/haotian-liu/LLaVA/blob/main/scripts/sqa_eval_gather.sh).

2. Single-GPU inference

(a) Generate LLaVA responses on ScienceQA dataset

```Shell
python -m llava.eval.model_vqa_science \
    --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \
    --question-file /path/to/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \
    --image-folder /path/to/ScienceQA/data/scienceqa/images/test \
    --answers-file vqa/results/ScienceQA/test_llava-13b.jsonl \
    --conv-mode llava_v1
```

(b) Evaluate the generated responses

```Shell
python eval_science_qa.py \
    --base-dir /path/to/ScienceQA/data/scienceqa \
    --result-file vqa/results/ScienceQA/test_llava-13b.jsonl \
    --output-file vqa/results/ScienceQA/test_llava-13b_output.json \
    --output-result vqa/results/ScienceQA/test_llava-13b_result.json \
```

For reference, we attach our prediction file [`test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json`](https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/table/results/test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json) and [`test_sqa_llava_13b_v0.json`](https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/table/results/test_sqa_llava_13b_v0.json) for comparison when reproducing our results, as well as for further analysis in detail.


================================================
FILE: docs/Windows.md
================================================
# Run LLaVA on Windows

*NOTE: LLaVA on Windows is not fully supported. Currently we only support 16-bit inference. For a more complete support, please use [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) for now. More functionalities on Windows is to be added soon, stay tuned.*

## Installation

1. Clone this repository and navigate to LLaVA folder
```bash
git clone https://github.com/haotian-liu/LLaVA.git
cd LLaVA
```

2. Install Package
```Shell
conda create -n llava python=3.10 -y
conda activate llava
python -m pip install --upgrade pip  # enable PEP 660 support
pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu117
pip install -e .
pip uninstall bitsandbytes
```

## Run demo

See instructions [here](https://github.com/haotian-liu/LLaVA#demo).

Note that quantization (4-bit, 8-bit) is *NOT* supported on Windows. Stay tuned for the 4-bit support on Windows!


================================================
FILE: docs/macOS.md
================================================
# Run LLaVA on macOS

*NOTE: LLaVA on macOS is not fully supported. Currently we only support 16-bit inference. More functionalities on macOS is to be added soon, stay tuned.*

## Installation

1. Clone this repository and navigate to LLaVA folder
```bash
git clone https://github.com/haotian-liu/LLaVA.git
cd LLaVA
```

2. Install Package
```Shell
conda create -n llava python=3.10 -y
conda activate llava
python -mpip install --upgrade pip  # enable PEP 660 support
pip install -e .
pip install torch==2.1.0 torchvision==0.16.0
pip uninstall bitsandbytes
```

## Run demo

Specify `--device mps` when launching model worker or CLI.

See instructions [here](https://github.com/haotian-liu/LLaVA#demo).

Note that quantization (4-bit, 8-bit) is *NOT* supported on macOS. Stay tuned for the 4-bit support on macOS!


================================================
FILE: llava/__init__.py
================================================
from .model import LlavaLlamaForCausalLM


================================================
FILE: llava/constants.py
================================================
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"


================================================
FILE: llava/conversation.py
================================================
import dataclasses
from enum import auto, Enum
from typing import List, Tuple
import base64
from io import BytesIO
from PIL import Image


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    if i == 0: message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        self.messages.append([role, message])

    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
        if image_process_mode == "Pad":
            def expand2square(pil_img, background_color=(122, 116, 104)):
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image = expand2square(image)
        elif image_process_mode in ["Default", "Crop"]:
            pass
        elif image_process_mode == "Resize":
            image = image.resize((336, 336))
        else:
            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
        if max(image.size) > max_len:
            max_hw, min_hw = max(image.size), min(image.size)
            aspect_ratio = max_hw / min_hw
            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
            longest_edge = int(shortest_edge * aspect_ratio)
            W, H = image.size
            if H > W:
                H, W = longest_edge, shortest_edge
            else:
                H, W = shortest_edge, longest_edge
            image = image.resize((W, H))
        if return_pil:
            return image
        else:
            buffered = BytesIO()
            image.save(buffered, format=image_format)
            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
            return img_b64_str

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode = msg
                    image = self.process_image(image, image_process_mode, return_pil=return_pil)
                    images.append(image)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode = msg
                    img_b64_str = self.process_image(
                        image, "Default", return_pil=False,
                        image_format='JPEG')
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace('<image>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }


conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        ("Assistant",
            "Renewable energy sources are those that can be replenished naturally in a relatively "
            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
            "renewable and non-renewable energy sources:\n"
            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
            "energy sources are finite and will eventually run out.\n"
            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
            "and other negative effects.\n"
            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
            "have lower operational costs than non-renewable sources.\n"
            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
            "locations than non-renewable sources.\n"
            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_llava_llama_2 = Conversation(
    system="You are a helpful language and vision assistant. "
           "You are able to understand the visual content that the user provides, "
           "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

conv_mistral_instruct = Conversation(
    system="",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="",
    sep2="</s>",
)

conv_chatml_direct = Conversation(
    system="""<|im_start|>system
Answer the questions.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

default_conversation = conv_vicuna_v1
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "llama_2": conv_llama_2,
    "mistral_instruct": conv_mistral_instruct,
    "chatml_direct": conv_chatml_direct,
    "mistral_direct": conv_chatml_direct,

    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,

    "mpt": conv_mpt,
}


if __name__ == "__main__":
    print(default_conversation.get_prompt())


================================================
FILE: llava/eval/eval_gpt_review.py
================================================
import argparse
import json
import os

import openai
import tqdm
import ray
import time

NUM_SECONDS_TO_SLEEP = 3

@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
                }, {
                    'role': 'user',
                    'content': content,
                }],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)

    print('success!')
    return response['choices'][0]['message']['content']


def parse_score(review):
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    # parser.add_argument('-a', '--answer')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    ray.init()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

    review_file = open(f'{args.output}', 'w')

    js_list = []
    handles = []
    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        # if idx == 1:
        #     break

        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        category = json.loads(ques_js)['category']
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            rule = rule_dict['default']
        prompt = rule['prompt']
        role = rule['role']
        content = (f'[Question]\n{ques["text"]}\n\n'
                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                   f'[System]\n{prompt}\n\n')
        js_list.append({
            'id': idx+1,
            'question_id': ques['question_id'],
            'answer1_id': ans1['answer_id'],
            'answer2_id': ans2['answer_id'],
            'category': category})
        idx += 1
        handles.append(get_eval.remote(content, args.max_tokens))
        # To avoid the rate limit set by OpenAI
        time.sleep(NUM_SECONDS_TO_SLEEP)

    reviews = ray.get(handles)
    for idx, review in enumerate(reviews):
        scores = parse_score(review)
        js_list[idx]['content'] = review
        js_list[idx]['tuple'] = scores
        review_file.write(json.dumps(js_list[idx]) + '\n')
    review_file.close()


================================================
FILE: llava/eval/eval_gpt_review_bench.py
================================================
import argparse
import json
import os

import openai
import time

NUM_SECONDS_TO_SLEEP = 0.5


def get_eval(content: str, max_tokens: int):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4-0314',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
                }, {
                    'role': 'user',
                    'content': content,
                }],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)

    return response['choices'][0]['message']['content']


def parse_score(review):
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-c', '--context')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

    if os.path.isfile(os.path.expanduser(args.output)):
        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
    else:
        cur_reviews = []

    review_file = open(f'{args.output}', 'a')

    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
    image_to_context = {context['image']: context for context in context_list}

    handles = []
    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        inst = image_to_context[ques['image']]

        if isinstance(inst['caption'], list):
            cap_str = '\n'.join(inst['caption'])
        else:
            cap_str = inst['caption']

        category = 'llava_bench_' + json.loads(ques_js)['category']
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            assert False, f"Visual QA category not found in rule file: {category}."
        prompt = rule['prompt']
        role = rule['role']
        content = (f'[Context]\n{cap_str}\n\n'
                   f'[Question]\n{ques["text"]}\n\n'
                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                   f'[System]\n{prompt}\n\n')
        cur_js = {
            'id': idx+1,
            'question_id': ques['question_id'],
            'answer1_id': ans1.get('answer_id', ans1['question_id']),
            'answer2_id': ans2.get('answer_id', ans2['answer_id']),
            'category': category
        }
        if idx >= len(cur_reviews):
            review = get_eval(content, args.max_tokens)
            scores = parse_score(review)
            cur_js['content'] = review
            cur_js['tuple'] = scores
            review_file.write(json.dumps(cur_js) + '\n')
            review_file.flush()
        else:
            print(f'Skipping {idx} as we already have it.')
        idx += 1
        print(idx)
    review_file.close()


================================================
FILE: llava/eval/eval_gpt_review_visual.py
================================================
import argparse
import json
import os

import openai
import time

NUM_SECONDS_TO_SLEEP = 0.5


def get_eval(content: str, max_tokens: int):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4-0314',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
                }, {
                    'role': 'user',
                    'content': content,
                }],
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)

    return response['choices'][0]['message']['content']


def parse_score(review):
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-c', '--context')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    f_q = open(os.path.expanduser(args.question))
    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))

    if os.path.isfile(os.path.expanduser(args.output)):
        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
    else:
        cur_reviews = []

    review_file = open(f'{args.output}', 'a')

    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
    image_to_context = {context['image']: context for context in context_list}

    handles = []
    idx = 0
    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
        ques = json.loads(ques_js)
        ans1 = json.loads(ans1_js)
        ans2 = json.loads(ans2_js)

        inst = image_to_context[ques['image']]
        cap_str = '\n'.join(inst['captions'])
        box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])

        category = json.loads(ques_js)['category']
        if category in rule_dict:
            rule = rule_dict[category]
        else:
            assert False, f"Visual QA category not found in rule file: {category}."
        prompt = rule['prompt']
        role = rule['role']
        content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
                   f'[Question]\n{ques["text"]}\n\n'
                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                   f'[System]\n{prompt}\n\n')
        cur_js = {
            'id': idx+1,
            'question_id': ques['question_id'],
            'answer1_id': ans1.get('answer_id', ans1['question_id']),
            'answer2_id': ans2.get('answer_id', ans2['answer_id']),
            'category': category
        }
        if idx >= len(cur_reviews):
            review = get_eval(content, args.max_tokens)
            scores = parse_score(review)
            cur_js['content'] = review
            cur_js['tuple'] = scores
            review_file.write(json.dumps(cur_js) + '\n')
            review_file.flush()
        else:
            print(f'Skipping {idx} as we already have it.')
        idx += 1
        print(idx)
    review_file.close()


================================================
FILE: llava/eval/eval_pope.py
================================================
import os
import json
import argparse

def eval_pope(answers, label_file):
    label_list = [json.loads(q)['label'] for q in open(label_file, 'r')]

    for answer in answers:
        text = answer['text']

        # Only keep the first sentence
        if text.find('.') != -1:
            text = text.split('.')[0]

        text = text.replace(',', '')
        words = text.split(' ')
        if 'No' in words or 'not' in words or 'no' in words:
            answer['text'] = 'no'
        else:
            answer['text'] = 'yes'

    for i in range(len(label_list)):
        if label_list[i] == 'no':
            label_list[i] = 0
        else:
            label_list[i] = 1

    pred_list = []
    for answer in answers:
        if answer['text'] == 'no':
            pred_list.append(0)
        else:
            pred_list.append(1)

    pos = 1
    neg = 0
    yes_ratio = pred_list.count(1) / len(pred_list)

    TP, TN, FP, FN = 0, 0, 0, 0
    for pred, label in zip(pred_list, label_list):
        if pred == pos and label == pos:
            TP += 1
        elif pred == pos and label == neg:
            FP += 1
        elif pred == neg and label == neg:
            TN += 1
        elif pred == neg and label == pos:
            FN += 1

    print('TP\tFP\tTN\tFN\t')
    print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))

    precision = float(TP) / float(TP + FP)
    recall = float(TP) / float(TP + FN)
    f1 = 2*precision*recall / (precision + recall)
    acc = (TP + TN) / (TP + TN + FP + FN)
    print('Accuracy: {}'.format(acc))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1 score: {}'.format(f1))
    print('Yes ratio: {}'.format(yes_ratio))
    print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) )

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation-dir", type=str)
    parser.add_argument("--question-file", type=str)
    parser.add_argument("--result-file", type=str)
    args = parser.parse_args()

    questions = [json.loads(line) for line in open(args.question_file)]
    questions = {question['question_id']: question for question in questions}
    answers = [json.loads(q) for q in open(args.result_file)]
    for file in os.listdir(args.annotation_dir):
        assert file.startswith('coco_pope_')
        assert file.endswith('.json')
        category = file[10:-5]
        cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category]
        print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
        eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
        print("====================================")


================================================
FILE: llava/eval/eval_science_qa.py
================================================
import argparse
import json
import os
import re
import random


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-dir', type=str)
    parser.add_argument('--result-file', type=str)
    parser.add_argument('--output-file', type=str)
    parser.add_argument('--output-result', type=str)
    parser.add_argument('--split', type=str, default='test')
    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    return parser.parse_args()


def convert_caps(results):
    fakecaps = []
    for result in results:
        image_id = result['question_id']
        caption = result['text']
        fakecaps.append({"image_id": int(image_id), "caption": caption})
    return fakecaps


def get_pred_idx(prediction, choices, options):
    """
    Get the index (e.g. 2) from the prediction (e.g. 'C')
    """
    if prediction in options[:len(choices)]:
        return options.index(prediction)
    else:
        return -1
        return random.choice(range(len(choices)))


if __name__ == "__main__":
    args = get_args()

    base_dir = args.base_dir
    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
    problems = json.load(open(os.path.join(base_dir, "problems.json")))
    predictions = [json.loads(line) for line in open(args.result_file)]
    predictions = {pred['question_id']: pred for pred in predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    results = {'correct': [], 'incorrect': []}
    sqa_results = {}
    sqa_results['acc'] = None
    sqa_results['correct'] = None
    sqa_results['count'] = None
    sqa_results['results'] = {}
    sqa_results['outputs'] = {}

    for prob_id, prob in split_problems.items():
        if prob_id not in predictions:
            pred = {'text': 'FAILED', 'prompt': 'Unknown'}
            pred_text = 'FAILED'
        else:
            pred = predictions[prob_id]
            pred_text = pred['text']

        if pred_text in args.options:
            answer = pred_text
        elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ":
            answer = pred_text[0]
        else:
            pattern = re.compile(r'The answer is ([A-Z]).')
            res = pattern.findall(pred_text)
            if len(res) == 1:
                answer = res[0]  # 'A', 'B', ...
            else:
                answer = "FAILED"

        pred_idx = get_pred_idx(answer, prob['choices'], args.options)

        analysis = {
            'question_id': prob_id,
            'parsed_ans': answer,
            'ground_truth': args.options[prob['answer']],
            'question': pred['prompt'],
            'pred': pred_text,
            'is_multimodal': '<image>' in pred['prompt'],
        }

        sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options)
        sqa_results['outputs'][prob_id] = pred_text

        if pred_idx == prob['answer']:
            results['correct'].append(analysis)
        else:
            results['incorrect'].append(analysis)

    correct = len(results['correct'])
    total = len(results['correct']) + len(results['incorrect'])

    ###### IMG ######
    multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']])
    multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']])
    multimodal_total = multimodal_correct + multimodal_incorrect
    ###### IMG ######

    print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%')

    sqa_results['acc'] = correct / total * 100
    sqa_results['correct'] = correct
    sqa_results['count'] = total

    with open(args.output_file, 'w') as f:
        json.dump(results, f, indent=2)
    with open(args.output_result, 'w') as f:
        json.dump(sqa_results, f, indent=2)


================================================
FILE: llava/eval/eval_science_qa_gpt4.py
================================================
import argparse
import json
import os
import re
import random
from collections import defaultdict


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-dir', type=str)
    parser.add_argument('--gpt4-result', type=str)
    parser.add_argument('--our-result', type=str)
    parser.add_argument('--split', type=str, default='test')
    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    return parser.parse_args()


def convert_caps(results):
    fakecaps = []
    for result in results:
        image_id = result['question_id']
        caption = result['text']
        fakecaps.append({"image_id": int(image_id), "caption": caption})
    return fakecaps


def get_pred_idx(prediction, choices, options):
    """
    Get the index (e.g. 2) from the prediction (e.g. 'C')
    """
    if prediction in options[:len(choices)]:
        return options.index(prediction)
    else:
        return random.choice(range(len(choices)))


if __name__ == "__main__":
    args = get_args()

    base_dir = args.base_dir
    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
    problems = json.load(open(os.path.join(base_dir, "problems.json")))
    our_predictions = [json.loads(line) for line in open(args.our_result)]
    our_predictions = {pred['question_id']: pred for pred in our_predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    gpt4_predictions = json.load(open(args.gpt4_result))['outputs']

    results = defaultdict(lambda: 0)

    for prob_id, prob in split_problems.items():
        if prob_id not in our_predictions:
            continue
        if prob_id not in gpt4_predictions:
            continue
        our_pred = our_predictions[prob_id]['text']
        gpt4_pred = gpt4_predictions[prob_id]

        pattern = re.compile(r'The answer is ([A-Z]).')
        our_res = pattern.findall(our_pred)
        if len(our_res) == 1:
            our_answer = our_res[0]  # 'A', 'B', ...
        else:
            our_answer = "FAILED"
        gpt4_res = pattern.findall(gpt4_pred)
        if len(gpt4_res) == 1:
            gpt4_answer = gpt4_res[0]  # 'A', 'B', ...
        else:
            gpt4_answer = "FAILED"

        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)

        if gpt4_answer == 'FAILED':
            results['gpt4_failed'] += 1
            # continue
            gpt4_pred_idx = our_pred_idx
            # if our_pred_idx != prob['answer']:
            #     print(our_predictions[prob_id]['prompt'])
            #     print('-----------------')
            #     print(f'LECTURE: {prob["lecture"]}')
            #     print(f'SOLUTION: {prob["solution"]}')
            #     print('=====================')
        else:
            # continue
            pass
        # gpt4_pred_idx = our_pred_idx

        if gpt4_pred_idx == prob['answer']:
            results['correct'] += 1
        else:
            results['incorrect'] += 1


        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
            results['correct_upperbound'] += 1

    correct = results['correct']
    total = results['correct'] + results['incorrect']
    print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%')
    print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')
    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')



================================================
FILE: llava/eval/eval_science_qa_gpt4_requery.py
================================================
import argparse
import json
import os
import re
import random
from collections import defaultdict


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-dir', type=str)
    parser.add_argument('--gpt4-result', type=str)
    parser.add_argument('--requery-result', type=str)
    parser.add_argument('--our-result', type=str)
    parser.add_argument('--output-result', type=str)
    parser.add_argument('--split', type=str, default='test')
    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    return parser.parse_args()


def convert_caps(results):
    fakecaps = []
    for result in results:
        image_id = result['question_id']
        caption = result['text']
        fakecaps.append({"image_id": int(image_id), "caption": caption})
    return fakecaps


def get_pred_idx(prediction, choices, options):
    """
    Get the index (e.g. 2) from the prediction (e.g. 'C')
    """
    if prediction in options[:len(choices)]:
        return options.index(prediction)
    else:
        return random.choice(range(len(choices)))


if __name__ == "__main__":
    args = get_args()

    base_dir = args.base_dir
    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
    problems = json.load(open(os.path.join(base_dir, "problems.json")))
    our_predictions = [json.loads(line) for line in open(args.our_result)]
    our_predictions = {pred['question_id']: pred for pred in our_predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    requery_predictions = [json.loads(line) for line in open(args.requery_result)]
    requery_predictions = {pred['question_id']: pred for pred in requery_predictions}

    gpt4_predictions = json.load(open(args.gpt4_result))['outputs']

    results = defaultdict(lambda: 0)

    sqa_results = {}
    sqa_results['acc'] = None
    sqa_results['correct'] = None
    sqa_results['count'] = None
    sqa_results['results'] = {}
    sqa_results['outputs'] = {}

    for prob_id, prob in split_problems.items():
        if prob_id not in our_predictions:
            assert False
        if prob_id not in gpt4_predictions:
            assert False
        our_pred = our_predictions[prob_id]['text']
        gpt4_pred = gpt4_predictions[prob_id]
        if prob_id not in requery_predictions:
            results['missing_requery'] += 1
            requery_pred = "MISSING"
        else:
            requery_pred = requery_predictions[prob_id]['text']

        pattern = re.compile(r'The answer is ([A-Z]).')
        our_res = pattern.findall(our_pred)
        if len(our_res) == 1:
            our_answer = our_res[0]  # 'A', 'B', ...
        else:
            our_answer = "FAILED"

        requery_res = pattern.findall(requery_pred)
        if len(requery_res) == 1:
            requery_answer = requery_res[0]  # 'A', 'B', ...
        else:
            requery_answer = "FAILED"

        gpt4_res = pattern.findall(gpt4_pred)
        if len(gpt4_res) == 1:
            gpt4_answer = gpt4_res[0]  # 'A', 'B', ...
        else:
            gpt4_answer = "FAILED"

        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
        requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options)

        results['total'] += 1

        if gpt4_answer == 'FAILED':
            results['gpt4_failed'] += 1
            if gpt4_pred_idx == prob['answer']:
                results['gpt4_correct'] += 1
            if our_pred_idx == prob['answer']:
                results['gpt4_ourvisual_correct'] += 1
        elif gpt4_pred_idx == prob['answer']:
            results['gpt4_correct'] += 1
            results['gpt4_ourvisual_correct'] += 1

        if our_pred_idx == prob['answer']:
            results['our_correct'] += 1

        if requery_answer == 'FAILED':
            sqa_results['results'][prob_id] = our_pred_idx
            if our_pred_idx == prob['answer']:
                results['requery_correct'] += 1
        else:
            sqa_results['results'][prob_id] = requery_pred_idx
            if requery_pred_idx == prob['answer']:
                results['requery_correct'] += 1
            else:
                print(f"""
Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']}
Our ({our_answer}): {our_pred}
GPT-4 ({gpt4_answer}): {gpt4_pred}
Requery ({requery_answer}): {requery_pred}
print("=====================================")
""")

        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
            results['correct_upperbound'] += 1

    total = results['total']
    print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%')
    print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%')
    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')
    print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%')
    print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%')
    print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')

    sqa_results['acc'] = results["requery_correct"] / total * 100
    sqa_results['correct'] = results["requery_correct"]
    sqa_results['count'] = total

    with open(args.output_result, 'w') as f:
        json.dump(sqa_results, f, indent=2)



================================================
FILE: llava/eval/eval_textvqa.py
================================================
import os
import argparse
import json
import re

from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--annotation-file', type=str)
    parser.add_argument('--result-file', type=str)
    parser.add_argument('--result-dir', type=str)
    return parser.parse_args()


def prompt_processor(prompt):
    if prompt.startswith('OCR tokens: '):
        pattern = r"Question: (.*?) Short answer:"
        match = re.search(pattern, prompt, re.DOTALL)
        question = match.group(1)
    elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
        if prompt.startswith('Reference OCR token:'):
            question = prompt.split('\n')[1]
        else:
            question = prompt.split('\n')[0]
    elif len(prompt.split('\n')) == 2:
        question = prompt.split('\n')[0]
    else:
        assert False

    return question.lower()


def eval_single(annotation_file, result_file):
    experiment_name = os.path.splitext(os.path.basename(result_file))[0]
    print(experiment_name)
    annotations = json.load(open(annotation_file))['data']
    annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
    results = [json.loads(line) for line in open(result_file)]

    pred_list = []
    for result in results:
        annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
        pred_list.append({
            "pred_answer": result['text'],
            "gt_answers": annotation['answers'],
        })

    evaluator = TextVQAAccuracyEvaluator()
    print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))


if __name__ == "__main__":
    args = get_args()

    if args.result_file is not None:
        eval_single(args.annotation_file, args.result_file)

    if args.result_dir is not None:
        for result_file in sorted(os.listdir(args.result_dir)):
            if not result_file.endswith('.jsonl'):
                print(f'Skipping {result_file}')
                continue
            eval_single(args.annotation_file, os.path.join(args.result_dir, result_file))


================================================
FILE: llava/eval/generate_webpage_data_from_table.py
================================================
"""Generate json file for webpage."""
import json
import os
import re

# models = ['llama', 'alpaca', 'gpt35', 'bard']
models = ['vicuna']


def read_jsonl(path: str, key: str=None):
    data = []
    with open(os.path.expanduser(path)) as f:
        for line in f:
            if not line:
                continue
            data.append(json.loads(line))
    if key is not None:
        data.sort(key=lambda x: x[key])
        data = {item[key]: item for item in data}
    return data


def trim_hanging_lines(s: str, n: int) -> str:
    s = s.strip()
    for _ in range(n):
        s = s.split('\n', 1)[1].strip()
    return s


if __name__ == '__main__':
    questions = read_jsonl('table/question.jsonl', key='question_id')

    # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id')
    # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id')
    # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id')
    # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id')
    vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id')
    ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id')

    review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id')
    # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id')
    # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id')
    # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id')
    # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id')

    records = []
    for qid in questions.keys():
        r = {
            'id': qid,
            'category': questions[qid]['category'],
            'question': questions[qid]['text'],
            'answers': {
                # 'alpaca': alpaca_answers[qid]['text'],
                # 'llama': llama_answers[qid]['text'],
                # 'bard': bard_answers[qid]['text'],
                # 'gpt35': gpt35_answers[qid]['text'],
                'vicuna': vicuna_answers[qid]['text'],
                'ours': ours_answers[qid]['text'],
            },
            'evaluations': {
                # 'alpaca': review_alpaca[qid]['text'],
                # 'llama': review_llama[qid]['text'],
                # 'bard': review_bard[qid]['text'],
                'vicuna': review_vicuna[qid]['content'],
                # 'gpt35': review_gpt35[qid]['text'],
            },
            'scores': {
                'vicuna': review_vicuna[qid]['tuple'],
                # 'alpaca': review_alpaca[qid]['score'],
                # 'llama': review_llama[qid]['score'],
                # 'bard': review_bard[qid]['score'],
                # 'gpt35': review_gpt35[qid]['score'],
            },
        }

        # cleanup data
        cleaned_evals = {}
        for k, v in r['evaluations'].items():
            v = v.strip()
            lines = v.split('\n')
            # trim the first line if it's a pair of numbers
            if re.match(r'\d+[, ]+\d+', lines[0]):
                lines = lines[1:]
            v = '\n'.join(lines)
            cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**')

        r['evaluations'] = cleaned_evals
        records.append(r)

    # Reorder the records, this is optional
    for r in records:
        if r['id'] <= 20:
            r['id'] += 60
        else:
            r['id'] -= 20
    for r in records:
        if r['id'] <= 50:
            r['id'] += 10
        elif 50 < r['id'] <= 60:
            r['id'] -= 50
    for r in records:
        if r['id'] == 7:
            r['id'] = 1
        elif r['id'] < 7:
            r['id'] += 1 

    records.sort(key=lambda x: x['id'])

    # Write to file
    with open('webpage/data.json', 'w') as f:
        json.dump({'questions': records, 'models': models}, f, indent=2)


================================================
FILE: llava/eval/m4c_evaluator.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
import re

from tqdm import tqdm


class EvalAIAnswerProcessor:
    """
    Processes an answer similar to Eval AI
        copied from
        https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
    """

    CONTRACTIONS = {
        "aint": "ain't",
        "arent": "aren't",
        "cant": "can't",
        "couldve": "could've",
        "couldnt": "couldn't",
        "couldn'tve": "couldn't've",
        "couldnt've": "couldn't've",
        "didnt": "didn't",
        "doesnt": "doesn't",
        "dont": "don't",
        "hadnt": "hadn't",
        "hadnt've": "hadn't've",
        "hadn'tve": "hadn't've",
        "hasnt": "hasn't",
        "havent": "haven't",
        "hed": "he'd",
        "hed've": "he'd've",
        "he'dve": "he'd've",
        "hes": "he's",
        "howd": "how'd",
        "howll": "how'll",
        "hows": "how's",
        "Id've": "I'd've",
        "I'dve": "I'd've",
        "Im": "I'm",
        "Ive": "I've",
        "isnt": "isn't",
        "itd": "it'd",
        "itd've": "it'd've",
        "it'dve": "it'd've",
        "itll": "it'll",
        "let's": "let's",
        "maam": "ma'am",
        "mightnt": "mightn't",
        "mightnt've": "mightn't've",
        "mightn'tve": "mightn't've",
        "mightve": "might've",
        "mustnt": "mustn't",
        "mustve": "must've",
        "neednt": "needn't",
        "notve": "not've",
        "oclock": "o'clock",
        "oughtnt": "oughtn't",
        "ow's'at": "'ow's'at",
        "'ows'at": "'ow's'at",
        "'ow'sat": "'ow's'at",
        "shant": "shan't",
        "shed've": "she'd've",
        "she'dve": "she'd've",
        "she's": "she's",
        "shouldve": "should've",
        "shouldnt": "shouldn't",
        "shouldnt've": "shouldn't've",
        "shouldn'tve": "shouldn't've",
        "somebody'd": "somebodyd",
        "somebodyd've": "somebody'd've",
        "somebody'dve": "somebody'd've",
        "somebodyll": "somebody'll",
        "somebodys": "somebody's",
        "someoned": "someone'd",
        "someoned've": "someone'd've",
        "someone'dve": "someone'd've",
        "someonell": "someone'll",
        "someones": "someone's",
        "somethingd": "something'd",
        "somethingd've": "something'd've",
        "something'dve": "something'd've",
        "somethingll": "something'll",
        "thats": "that's",
        "thered": "there'd",
        "thered've": "there'd've",
        "there'dve": "there'd've",
        "therere": "there're",
        "theres": "there's",
        "theyd": "they'd",
        "theyd've": "they'd've",
        "they'dve": "they'd've",
        "theyll": "they'll",
        "theyre": "they're",
        "theyve": "they've",
        "twas": "'twas",
        "wasnt": "wasn't",
        "wed've": "we'd've",
        "we'dve": "we'd've",
        "weve": "we've",
        "werent": "weren't",
        "whatll": "what'll",
        "whatre": "what're",
        "whats": "what's",
        "whatve": "what've",
        "whens": "when's",
        "whered": "where'd",
        "wheres": "where's",
        "whereve": "where've",
        "whod": "who'd",
        "whod've": "who'd've",
        "who'dve": "who'd've",
        "wholl": "who'll",
        "whos": "who's",
        "whove": "who've",
        "whyll": "why'll",
        "whyre": "why're",
        "whys": "why's",
        "wont": "won't",
        "wouldve": "would've",
        "wouldnt": "wouldn't",
        "wouldnt've": "wouldn't've",
        "wouldn'tve": "wouldn't've",
        "yall": "y'all",
        "yall'll": "y'all'll",
        "y'allll": "y'all'll",
        "yall'd've": "y'all'd've",
        "y'alld've": "y'all'd've",
        "y'all'dve": "y'all'd've",
        "youd": "you'd",
        "youd've": "you'd've",
        "you'dve": "you'd've",
        "youll": "you'll",
        "youre": "you're",
        "youve": "you've",
    }

    NUMBER_MAP = {
        "none": "0",
        "zero": "0",
        "one": "1",
        "two": "2",
        "three": "3",
        "four": "4",
        "five": "5",
        "six": "6",
        "seven": "7",
        "eight": "8",
        "nine": "9",
        "ten": "10",
    }
    ARTICLES = ["a", "an", "the"]
    PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
    COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
    PUNCTUATIONS = [
        ";",
        r"/",
        "[",
        "]",
        '"',
        "{",
        "}",
        "(",
        ")",
        "=",
        "+",
        "\\",
        "_",
        "-",
        ">",
        "<",
        "@",
        "`",
        ",",
        "?",
        "!",
    ]

    def __init__(self, *args, **kwargs):
        pass

    def word_tokenize(self, word):
        word = word.lower()
        word = word.replace(",", "").replace("?", "").replace("'s", " 's")
        return word.strip()

    def process_punctuation(self, in_text):
        out_text = in_text
        for p in self.PUNCTUATIONS:
            if (p + " " in in_text or " " + p in in_text) or (
                re.search(self.COMMA_STRIP, in_text) is not None
            ):
                out_text = out_text.replace(p, "")
            else:
                out_text = out_text.replace(p, " ")
        out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE)
        return out_text

    def process_digit_article(self, in_text):
        out_text = []
        temp_text = in_text.lower().split()
        for word in temp_text:
            word = self.NUMBER_MAP.setdefault(word, word)
            if word not in self.ARTICLES:
                out_text.append(word)
            else:
                pass
        for word_id, word in enumerate(out_text):
            if word in self.CONTRACTIONS:
                out_text[word_id] = self.CONTRACTIONS[word]
        out_text = " ".join(out_text)
        return out_text

    def __call__(self, item):
        item = self.word_tokenize(item)
        item = item.replace("\n", " ").replace("\t", " ").strip()
        item = self.process_punctuation(item)
        item = self.process_digit_article(item)
        return item


class TextVQAAccuracyEvaluator:
    def __init__(self):
        self.answer_processor = EvalAIAnswerProcessor()

    def _compute_answer_scores(self, raw_answers):
        """
        compute the accuracy (soft score) of human answers
        """
        answers = [self.answer_processor(a) for a in raw_answers]
        assert len(answers) == 10
        gt_answers = list(enumerate(answers))
        unique_answers = set(answers)
        unique_answer_scores = {}

        for unique_answer in unique_answers:
            accs = []
            for gt_answer in gt_answers:
                other_answers = [item for item in gt_answers if item != gt_answer]
                matching_answers = [
                    item for item in other_answers if item[1] == unique_answer
                ]
                acc = min(1, float(len(matching_answers)) / 3)
                accs.append(acc)
            unique_answer_scores[unique_answer] = sum(accs) / len(accs)

        return unique_answer_scores

    def eval_pred_list(self, pred_list):
        pred_scores = []
        for entry in tqdm(pred_list):
            pred_answer = self.answer_processor(entry["pred_answer"])
            unique_answer_scores = self._compute_answer_scores(entry["gt_answers"])
            score = unique_answer_scores.get(pred_answer, 0.0)
            pred_scores.append(score)

        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy


class STVQAAccuracyEvaluator:
    def __init__(self):
        self.answer_processor = EvalAIAnswerProcessor()

    def eval_pred_list(self, pred_list):
        pred_scores = []
        for entry in pred_list:
            pred_answer = self.answer_processor(entry["pred_answer"])
            gts = [self.answer_processor(a) for a in entry["gt_answers"]]
            score = 1.0 if pred_answer in gts else 0.0
            pred_scores.append(score)

        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy


class STVQAANLSEvaluator:
    def __init__(self):
        import editdistance  # install with `pip install editdistance`

        self.get_edit_distance = editdistance.eval

    def get_anls(self, s1, s2):
        s1 = s1.lower().strip()
        s2 = s2.lower().strip()
        iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2))
        anls = iou if iou >= 0.5 else 0.0
        return anls

    def eval_pred_list(self, pred_list):
        pred_scores = []
        for entry in pred_list:
            anls = max(
                self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]
            )
            pred_scores.append(anls)

        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy


class TextCapsBleu4Evaluator:
    def __init__(self):
        # The following script requires Java 1.8.0 and pycocotools installed.
        # The pycocoevalcap can be installed with pip as
        # pip install git+https://github.com/ronghanghu/coco-caption.git@python23
        # Original pycocoevalcap code is at https://github.com/tylin/coco-caption
        # but has no python3 support yet.
        try:
            from pycocoevalcap.bleu.bleu import Bleu
            from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
        except ModuleNotFoundError:
            print(
                "Please install pycocoevalcap module using "
                "pip install git+https://github.com/ronghanghu/coco-caption.git@python23"  # noqa
            )
            raise

        self.tokenizer = PTBTokenizer()
        self.scorer = Bleu(4)

    def eval_pred_list(self, pred_list):
        # Create reference and hypotheses captions.
        gts = {}
        res = {}
        for idx, entry in enumerate(pred_list):
            gts[idx] = [{"caption": a} for a in entry["gt_answers"]]
            res[idx] = [{"caption": entry["pred_answer"]}]

        gts = self.tokenizer.tokenize(gts)
        res = self.tokenizer.tokenize(res)
        score, _ = self.scorer.compute_score(gts, res)

        bleu4 = score[3]  # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
        return bleu4


================================================
FILE: llava/eval/model_qa.py
================================================
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.conversation import default_conversation
from llava.utils import disable_torch_init


@torch.inference_mode()
def eval_model(model_name, questions_file, answers_file):
    # Model
    disable_torch_init()
    model_name = os.path.expanduser(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_name,
        torch_dtype=torch.float16).cuda()


    ques_file = open(os.path.expanduser(questions_file), "r")
    ans_file = open(os.path.expanduser(answers_file), "w")
    for i, line in enumerate(tqdm(ques_file)):
        idx = json.loads(line)["question_id"]
        qs = json.loads(line)["text"]
        cat = json.loads(line)["category"]
        conv = default_conversation.copy()
        conv.append_message(conv.roles[0], qs)
        prompt = conv.get_prompt()
        inputs = tokenizer([prompt])
        input_ids = torch.as_tensor(inputs.input_ids).cuda()
        output_ids = model.generate(
            input_ids,
            do_sample=True,
            use_cache=True,
            temperature=0.7,
            max_new_tokens=1024,)
        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
        try:
            index = outputs.index(conv.sep, len(prompt))
        except ValueError:
            outputs += conv.sep
            index = outputs.index(conv.sep, len(prompt))

        outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({"question_id": idx,
                                   "text": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                                   "metadata": {}}) + "\n")
        ans_file.flush()
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    args = parser.parse_args()

    eval_model(args.model_name, args.question_file, args.answers_file)


================================================
FILE: llava/eval/model_vqa.py
================================================
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path

from PIL import Image
import math


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)  # integer division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")
    for line in tqdm(questions):
        idx = line["question_id"]
        image_file = line["image"]
        qs = line["text"]
        cur_prompt = qs
        if model.config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
        image_tensor = process_images([image], image_processor, model.config)[0]

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor.unsqueeze(0).half().cuda(),
                image_sizes=[image.size],
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                # no_repeat_ngram_size=3,
                max_new_tokens=1024,
                use_cache=True)

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({"question_id": idx,
                                   "prompt": cur_prompt,
                                   "text": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                                   "metadata": {}}) + "\n")
        ans_file.flush()
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    args = parser.parse_args()

    eval_model(args)


================================================
FILE: llava/eval/model_vqa_loader.py
================================================
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader

from PIL import Image
import math


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)  # integer division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


# Custom dataset class
class CustomDataset(Dataset):
    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
        self.questions = questions
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config

    def __getitem__(self, index):
        line = self.questions[index]
        image_file = line["image"]
        qs = line["text"]
        if self.model_config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]

        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')

        return input_ids, image_tensor, image.size

    def __len__(self):
        return len(self.questions)


def collate_fn(batch):
    input_ids, image_tensors, image_sizes = zip(*batch)
    input_ids = torch.stack(input_ids, dim=0)
    image_tensors = torch.stack(image_tensors, dim=0)
    return input_ids, image_tensors, image_sizes


# DataLoader
def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
    assert batch_size == 1, "batch_size must be 1"
    dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config)
    data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_fn)
    return data_loader


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config)

    for (input_ids, image_tensor, image_sizes), line in tqdm(zip(data_loader, questions), total=len(questions)):
        idx = line["question_id"]
        cur_prompt = line["text"]

        input_ids = input_ids.to(device='cuda', non_blocking=True)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
                image_sizes=image_sizes,
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                max_new_tokens=args.max_new_tokens,
                use_cache=True)

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({"question_id": idx,
                                   "prompt": cur_prompt,
                                   "text": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                                   "metadata": {}}) + "\n")
        # ans_file.flush()
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--max_new_tokens", type=int, default=128)
    args = parser.parse_args()

    eval_model(args)


================================================
FILE: llava/eval/model_vqa_mmbench.py
================================================
import argparse
import torch
import os
import json
import pandas as pd
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path

from PIL import Image
import math


all_options = ['A', 'B', 'C', 'D']


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)  # integer division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def is_none(value):
    if value is None:
        return True
    if type(value) is float and math.isnan(value):
        return True
    if type(value) is str and value.lower() == 'nan':
        return True
    if type(value) is str and value.lower() == 'none':
        return True
    return False

def get_options(row, options):
    parsed_options = []
    for option in options:
        option_value = row[option]
        if is_none(option_value):
            break
        parsed_options.append(option_value)
    return parsed_options


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = pd.read_table(os.path.expanduser(args.question_file))
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    for index, row in tqdm(questions.iterrows(), total=len(questions)):
        options = get_options(row, all_options)
        cur_option_char = all_options[:len(options)]

        if args.all_rounds:
            num_rounds = len(options)
        else:
            num_rounds = 1

        for round_idx in range(num_rounds):
            idx = row['index']
            question = row['question']
            hint = row['hint']
            image = load_image_from_base64(row['image'])
            if not is_none(hint):
                question = hint + '\n' + question
            for option_char, option in zip(all_options[:len(options)], options):
                question = question + '\n' + option_char + '. ' + option
            qs = cur_prompt = question
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

            if args.single_pred_prompt:
                if args.lang == 'cn':
                    qs = qs + '\n' + "请直接回答选项字母。"
                else:
                    qs = qs + '\n' + "Answer with the option's letter from the given choices directly."

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

            image_tensor = process_images([image], image_processor, model.config)[0]

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.unsqueeze(0).half().cuda(),
                    image_sizes=[image.size],
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    # no_repeat_ngram_size=3,
                    max_new_tokens=1024,
                    use_cache=True)

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                    "round_id": round_idx,
                                    "prompt": cur_prompt,
                                    "text": outputs,
                                    "options": options,
                                    "option_char": cur_option_char,
                                    "answer_id": ans_id,
                                    "model_id": model_name,
                                    "metadata": {}}) + "\n")
            ans_file.flush()

            # rotate options
            options = options[1:] + options[:1]
            cur_option_char = cur_option_char[1:] + cur_option_char[:1]
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--all-rounds", action="store_true")
    parser.add_argument("--single-pred-prompt", action="store_true")
    parser.add_argument("--lang", type=str, default="en")
    args = parser.parse_args()

    eval_model(args)


================================================
FILE: llava/eval/model_vqa_science.py
================================================
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path

from PIL import Image
import math


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)  # integer division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = json.load(open(os.path.expanduser(args.question_file), "r"))
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")
    for i, line in enumerate(tqdm(questions)):
        idx = line["id"]
        question = line['conversations'][0]
        qs = question['value'].replace('<image>', '').strip()
        cur_prompt = qs

        if 'image' in line:
            image_file = line["image"]
            image = Image.open(os.path.join(args.image_folder, image_file))
            image_tensor = process_images([image], image_processor, model.config)[0]
            images = image_tensor.unsqueeze(0).half().cuda()
            image_sizes = [image.size]
            if getattr(model.config, 'mm_use_im_start_end', False):
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
            cur_prompt = '<image>' + '\n' + cur_prompt
        else:
            images = None
            image_sizes = None

        if args.single_pred_prompt:
            qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
            cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly."

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=images,
                image_sizes=image_sizes,
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                max_new_tokens=1024,
                use_cache=True,
            )

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({"question_id": idx,
                                   "prompt": cur_prompt,
                                   "text": outputs,
                                   "answer_id": ans_id,
                                   "model_id": model_name,
                                   "metadata": {}}) + "\n")
        ans_file.flush()
    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.json")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v0")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--answer-prompter", action="store_true")
    parser.add_argument("--single-pred-prompt", action="store_true")
    args = parser.parse_args()

    eval_model(args)


================================================
FILE: llava/eval/qa_baseline_gpt35.py
================================================
"""Generate answers with GPT-3.5"""
# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
import argparse
import json
import os
import time
import concurrent.futures

import openai
import tqdm
import shortuuid

MODEL = 'gpt-3.5-turbo'
MODEL_ID = 'gpt-3.5-turbo:20230327'

def get_answer(question_id: int, question: str, max_tokens: int):
    ans = {
        'answer_id': shortuuid.uuid(),
        'question_id': question_id,
        'model_id': MODEL_ID,
    }
    for _ in range(3):
        try:
            response = openai.ChatCompletion.create(
                model=MODEL,
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful assistant.'
                }, {
                    'role': 'user',
                    'content': question,
                }],
                max_tokens=max_tokens,
            )
            ans['text'] = response['choices'][0]['message']['content']
            return ans
        except Exception as e:
            print('[ERROR]', e)
            ans['text'] = '#ERROR#'
            time.sleep(1)
    return ans


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT answer generation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    questions_dict = {}
    with open(os.path.expanduser(args.question)) as f:
        for line in f:
            if not line:
                continue
            q = json.loads(line)
            questions_dict[q['question_id']] = q['text']

    answers = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        futures = []
        for qid, question in questions_dict.items():
            future = executor.submit(get_answer, qid, question, args.max_tokens)
            futures.append(future)

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            answers.append(future.result())

    answers.sort(key=lambda x: x['question_id'])

    with open(os.path.expanduser(args.output), 'w') as f:
        table = [json.dumps(ans) for ans in answers]
        f.write('\n'.join(table))


================================================
FILE: llava/eval/run_llava.py
================================================
import argparse
import torch

from llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    IMAGE_PLACEHOLDER,
)
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import (
    process_images,
    tokenizer_image_token,
    get_model_name_from_path,
)

from PIL import Image

import requests
from PIL import Image
from io import BytesIO
import re


def image_parser(args):
    out = args.image_file.split(args.sep)
    return out


def load_image(image_file):
    if image_file.startswith("http") or image_file.startswith("https"):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert("RGB")
    else:
        image = Image.open(image_file).convert("RGB")
    return image


def load_images(image_files):
    out = []
    for image_file in image_files:
        image = load_image(image_file)
        out.append(image)
    return out


def eval_model(args):
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name
    )

    qs = args.query
    image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    if IMAGE_PLACEHOLDER in qs:
        if model.config.mm_use_im_start_end:
            qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
        else:
            qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
    else:
        if model.config.mm_use_im_start_end:
            qs = image_token_se + "\n" + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

    if "llama-2" in model_name.lower():
        conv_mode = "llava_llama_2"
    elif "mistral" in model_name.lower():
        conv_mode = "mistral_instruct"
    elif "v1.6-34b" in model_name.lower():
        conv_mode = "chatml_direct"
    elif "v1" in model_name.lower():
        conv_mode = "llava_v1"
    elif "mpt" in model_name.lower():
        conv_mode = "mpt"
    else:
        conv_mode = "llava_v0"

    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print(
            "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
                conv_mode, args.conv_mode, args.conv_mode
            )
        )
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    image_files = image_parser(args)
    images = load_images(image_files)
    image_sizes = [x.size for x in images]
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)

    input_ids = (
        tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .cuda()
    )

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images_tensor,
            image_sizes=image_sizes,
            do_sample=True if args.temperature > 0 else False,
            temperature=args.temperature,
            top_p=args.top_p,
            num_beams=args.num_beams,
            max_new_tokens=args.max_new_tokens,
            use_cache=True,
        )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    print(outputs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-file", type=str, required=True)
    parser.add_argument("--query", type=str, required=True)
    parser.add_argument("--conv-mode", type=str, default=None)
    parser.add_argument("--sep", type=str, default=",")
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--max_new_tokens", type=int, default=512)
    args = parser.parse_args()

    eval_model(args)


================================================
FILE: llava/eval/summarize_gpt_review.py
================================================
import json
import os
from collections import defaultdict

import numpy as np

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-d', '--dir', default=None)
    parser.add_argument('-v', '--version', default=None)
    parser.add_argument('-s', '--select', nargs='*', default=None)
    parser.add_argument('-f', '--files', nargs='*', default=[])
    parser.add_argument('-i', '--ignore', nargs='*', default=[])
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()

    if args.ignore is not None:
        args.ignore = [int(x) for x in args.ignore]

    if len(args.files) > 0:
        review_files = args.files
    else:
        review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)]

    for review_file in sorted(review_files):
        config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
        if args.select is not None and any(x not in config for x in args.select):
            continue
        if '0613' in config:
            version = '0613'
        else:
            version = '0314'
        if args.version is not None and args.version != version:
            continue
        scores = defaultdict(list)
        print(config)
        with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
            for review_str in f:
                review = json.loads(review_str)
                if review['question_id'] in args.ignore:
                    continue
                if 'category' in review:
                    scores[review['category']].append(review['tuple'])
                    scores['all'].append(review['tuple'])
                else:
                    if 'tuple' in review:
                        scores['all'].append(review['tuple'])
                    else:
                        scores['all'].append(review['score'])
        for k, v in sorted(scores.items()):
            stats = np.asarray(v).mean(0).tolist()
            stats = [round(x, 3) for x in stats]
            # print(k, stats, round(stats[1]/stats[0]*100, 1))
            print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
        print('=================================')


================================================
FILE: llava/eval/webpage/index.html
================================================
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots</title>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
    <link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons">
    <link rel="stylesheet" href="styles.css">
</head>

<body>
    <nav class="navbar navbar-expand-lg navbar-dark bg-dark">
        <a class="navbar-brand" href="#">🏔️ Vicuna Evaluation Examples</a>
        <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
          <span class="navbar-toggler-icon"></span>
        </button>
        <div class="collapse navbar-collapse" id="navbarNav">
          <ul class="navbar-nav mr-auto">
            <li class="nav-item">
                <a class="nav-link" href="https://chat.lmsys.org/">Demo</a>
              </li>
              <li class="nav-item">
                <a class="nav-link" href="https://vicuna.lmsys.org">Blog</a>
              </li>
              <li class="nav-item">
                <a class="nav-link" href="https://github.com/lm-sys/FastChat">Github</a>
              </li>
          </ul>
        </div>
    </nav>

    <div class="container mt-5">
        <h2 class="text-center mb-5">Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots</h2>

        <!-- Selection -->
        <div class="form-row">
            <div class="form-group col-md-2">
                <label for="category-select">Category</label>
                <select class="form-control" id="category-select"></select>
            </div>
            <div class="form-group col-md-8">
                <label for="question-select">Question</label>
                <select class="form-control" id="question-select"></select>
            </div>
            <div class="form-group col-md-2">
                <div class="col-md-2"><label>&nbsp;</label></div>
                <div class="btn-group" role="group" aria-label="Left and Right Controller">
                    <button type="button" class="form-control btn btn-primary" id="prev-question"><i class="material-icons">keyboard_arrow_left</i></button>
                    <button type="button" class="form-control btn btn-primary" id="next-question"><i class="material-icons">keyboard_arrow_right</i></button>
                </div>
            </div>
        </div>

        <!-- "Battle" -->
        <div class="row mb-4" style="justify-content: center;">
            <div class="col" style="display: flex; justify-content: center; align-items: center;">
                <label class="adjustable-font-size" id="other-score-label">*/10</label>
            </div>
            <div class="col">
                <div class="vertical-flex-layout">
                    <img class="shadow figure-img img-fluid" src="" alt="other logo" width="150" id="other-model-figure">
                </div>
            </div>
            <div class="col">
                <div class="vertical-flex-layout">
                    <!-- from: https://fonts.google.com/icons?icon.query=battle&selected=Material+Symbols+Outlined:swords:FILL@0;wght@300;GRAD@0;opsz@48&icon.style=Outlined -->
                    <img class="figure-img img-fluid" src="figures/swords_FILL0_wght300_GRAD0_opsz48.svg" width="60" height="60">
                </div>
            </div>
            <div class="col">
                <div class="vertical-flex-layout">
                    <img class="shadow figure-img img-fluid" src="figures/vicuna.jpeg" alt="vicuna logo" width="150" id="our-model-figure">
                </div>
            </div>
            <div class="col" style="display: flex; justify-content: center; align-items: center;">
                <label class="adjustable-font-size" id="our-score-label">*/10</label>
            </div>
        </div>

        <!-- Question Card -->
        <div class="card mb-4">
            <div class="card-body" id="selected-question"></div>
        </div>

        <!-- Answer Cards -->
        <div class="row">
            <div class="col-md-6">
                <div class="card mb-4 expandable-card">
                    <div class="card-header" style="padding-bottom: 0.2rem" id="other-model-header-bg">
                        <div class="row">
                            <div class="col-md-5" style="align-items: center; display: flex;">
                                <label id="other-model-header">Assistant #1</label>
                            </div>
                            <div class="col-md-7">
                                <select class="form-control" id="model-select" style="height: fit-content; margin-top: -0.3rem;"></select>
                            </div>
                        </div>
                    </div>
                    <div class="card-body">
                        <div class="card-text-container">
                            <div class="card-text" id="other-model-answer"></div>
                        </div>
                        <div class="btn btn-primary expand-btn" style="display:flex;"></div>
                    </div>
                </div>
            </div>
            <div class="col-md-6">
                <div class="card mb-4 expandable-card">
                    <div class="card-header" id="our-model-header">
                        Assistant #2 (Vicuna, our model)
                    </div>
                    <div class="card-body">
                        <div class="card-text-container">
                            <div class="card-text" id="our-model-answer"></div>
                        </div>
                        <div class="btn btn-primary expand-btn" style="display:flex;"></div>
                    </div>
                </div>
            </div>
        </div>

        <!-- Evaluation -->
        <div class="card expandable-card">
            <div class="card-header" style="background-color: #c9c9f2;" id="evaluation-header">GPT-4 Evaluation</div>
            <div class="card-body">
                <div class="card-text-container">
                    <div class="card-text" id="evaluation-result"></div>
                </div>
                <div class="btn btn-primary expand-btn" style="display:flex;"></div>
            </div>
        </div>
    </div>

    <div class="container-fluid bg-light py-2">
        <div class="text-center">
            <small class="text-muted">This website is co-authored with <a href="https://openai.com" target="_blank">GPT-4</a>.</small>
        </div>
    </div>

    <!-- Marked.js -->
    <script src="https://cdn.jsdelivr.net/npm/marked@4.3.0/lib/marked.umd.min.js"></script>
    <!-- Bootstrap and Popper.js JavaScript dependencies -->
    <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.6/dist/umd/popper.min.js"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/js/bootstrap.min.js"></script>

    <script src="script.js"></script>
    <script>
      // Fetch the JSON file
      fetch('data.json')
        .then(response => response.json())
        .then(json_data => {
            // Populate the models and questions.
            populateModels(json_data.models);
            populateQuestions(json_data.questions);
            displayQuestion(currentQuestionIndex);
        }).catch(error => console.error(error));
    </script>
</body>

</html>


================================================
FILE: llava/eval/webpage/script.js
================================================
// Description: Script for the evaluation webpage.

let currentQuestionIndex = 1;

// Store the model name map
Download .txt
gitextract_4r0bw0y0/

├── .devcontainer/
│   ├── Dockerfile
│   ├── devcontainer.env
│   ├── devcontainer.json
│   └── postCreateCommand.sh
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github/
│   └── ISSUE_TEMPLATE/
│       ├── 1-usage.yaml
│       ├── 2-feature-request.yaml
│       ├── 3-question.yaml
│       └── 4-discussion.yaml
├── .gitignore
├── LICENSE
├── README.md
├── cog.yaml
├── docs/
│   ├── Customize_Component.md
│   ├── Data.md
│   ├── Evaluation.md
│   ├── Finetune_Custom_Data.md
│   ├── Intel.md
│   ├── LLaVA_Bench.md
│   ├── LLaVA_from_LLaMA2.md
│   ├── LoRA.md
│   ├── MODEL_ZOO.md
│   ├── ScienceQA.md
│   ├── Windows.md
│   └── macOS.md
├── llava/
│   ├── __init__.py
│   ├── constants.py
│   ├── conversation.py
│   ├── eval/
│   │   ├── eval_gpt_review.py
│   │   ├── eval_gpt_review_bench.py
│   │   ├── eval_gpt_review_visual.py
│   │   ├── eval_pope.py
│   │   ├── eval_science_qa.py
│   │   ├── eval_science_qa_gpt4.py
│   │   ├── eval_science_qa_gpt4_requery.py
│   │   ├── eval_textvqa.py
│   │   ├── generate_webpage_data_from_table.py
│   │   ├── m4c_evaluator.py
│   │   ├── model_qa.py
│   │   ├── model_vqa.py
│   │   ├── model_vqa_loader.py
│   │   ├── model_vqa_mmbench.py
│   │   ├── model_vqa_science.py
│   │   ├── qa_baseline_gpt35.py
│   │   ├── run_llava.py
│   │   ├── summarize_gpt_review.py
│   │   └── webpage/
│   │       ├── index.html
│   │       ├── script.js
│   │       └── styles.css
│   ├── mm_utils.py
│   ├── model/
│   │   ├── __init__.py
│   │   ├── apply_delta.py
│   │   ├── builder.py
│   │   ├── consolidate.py
│   │   ├── language_model/
│   │   │   ├── llava_llama.py
│   │   │   ├── llava_mistral.py
│   │   │   └── llava_mpt.py
│   │   ├── llava_arch.py
│   │   ├── make_delta.py
│   │   ├── multimodal_encoder/
│   │   │   ├── builder.py
│   │   │   └── clip_encoder.py
│   │   ├── multimodal_projector/
│   │   │   └── builder.py
│   │   └── utils.py
│   ├── serve/
│   │   ├── __init__.py
│   │   ├── cli.py
│   │   ├── controller.py
│   │   ├── gradio_web_server.py
│   │   ├── model_worker.py
│   │   ├── register_worker.py
│   │   ├── sglang_worker.py
│   │   └── test_message.py
│   ├── train/
│   │   ├── llama_flash_attn_monkey_patch.py
│   │   ├── llama_xformers_attn_monkey_patch.py
│   │   ├── llava_trainer.py
│   │   ├── train.py
│   │   ├── train_mem.py
│   │   └── train_xformers.py
│   └── utils.py
├── playground/
│   └── data/
│       └── prompts/
│           ├── complex_reasoning/
│           │   ├── 000_caps.txt
│           │   ├── 000_conv.txt
│           │   ├── 001_caps.txt
│           │   ├── 001_conv.txt
│           │   ├── 002_caps.txt
│           │   ├── 002_conv.txt
│           │   └── system_message.txt
│           ├── conversation/
│           │   ├── 000_caps.txt
│           │   ├── 000_conv.txt
│           │   ├── 001_caps.txt
│           │   ├── 001_conv.txt
│           │   └── system_message.txt
│           └── detail_description/
│               ├── 000_caps.txt
│               ├── 000_conv.txt
│               ├── 001_caps.txt
│               ├── 001_conv.txt
│               ├── 002_caps.txt
│               ├── 002_conv.txt
│               └── system_message.txt
├── predict.py
├── pyproject.toml
└── scripts/
    ├── convert_gqa_for_eval.py
    ├── convert_mmbench_for_submission.py
    ├── convert_mmvet_for_eval.py
    ├── convert_seed_for_submission.py
    ├── convert_sqa_to_llava.py
    ├── convert_sqa_to_llava_base_prompt.py
    ├── convert_vizwiz_for_submission.py
    ├── convert_vqav2_for_submission.py
    ├── extract_mm_projector.py
    ├── finetune.sh
    ├── finetune_full_schedule.sh
    ├── finetune_lora.sh
    ├── finetune_qlora.sh
    ├── finetune_sqa.sh
    ├── merge_lora_weights.py
    ├── pretrain.sh
    ├── pretrain_xformers.sh
    ├── sqa_eval_batch.sh
    ├── sqa_eval_gather.sh
    ├── upload_pypi.sh
    └── v1_5/
        ├── eval/
        │   ├── gqa.sh
        │   ├── llavabench.sh
        │   ├── mmbench.sh
        │   ├── mmbench_cn.sh
        │   ├── mme.sh
        │   ├── mmvet.sh
        │   ├── pope.sh
        │   ├── qbench.sh
        │   ├── qbench_zh.sh
        │   ├── seed.sh
        │   ├── sqa.sh
        │   ├── textvqa.sh
        │   ├── vizwiz.sh
        │   └── vqav2.sh
        ├── finetune.sh
        ├── finetune_lora.sh
        ├── finetune_task.sh
        ├── finetune_task_lora.sh
        └── pretrain.sh
Download .txt
SYMBOL INDEX (322 symbols across 53 files)

FILE: llava/conversation.py
  class SeparatorStyle (line 9) | class SeparatorStyle(Enum):
  class Conversation (line 19) | class Conversation:
    method get_prompt (line 32) | def get_prompt(self):
    method append_message (line 109) | def append_message(self, role, message):
    method process_image (line 112) | def process_image(self, image, image_process_mode, return_pil=False, i...
    method get_images (line 152) | def get_images(self, return_pil=False):
    method to_gradio_chatbot (line 162) | def to_gradio_chatbot(self):
    method copy (line 180) | def copy(self):
    method dict (line 191) | def dict(self):

FILE: llava/eval/eval_gpt_review.py
  function get_eval (line 13) | def get_eval(content: str, max_tokens: int):
  function parse_score (line 39) | def parse_score(review):

FILE: llava/eval/eval_gpt_review_bench.py
  function get_eval (line 11) | def get_eval(content: str, max_tokens: int):
  function parse_score (line 36) | def parse_score(review):

FILE: llava/eval/eval_gpt_review_visual.py
  function get_eval (line 11) | def get_eval(content: str, max_tokens: int):
  function parse_score (line 36) | def parse_score(review):

FILE: llava/eval/eval_pope.py
  function eval_pope (line 5) | def eval_pope(answers, label_file):

FILE: llava/eval/eval_science_qa.py
  function get_args (line 8) | def get_args():
  function convert_caps (line 19) | def convert_caps(results):
  function get_pred_idx (line 28) | def get_pred_idx(prediction, choices, options):

FILE: llava/eval/eval_science_qa_gpt4.py
  function get_args (line 9) | def get_args():
  function convert_caps (line 19) | def convert_caps(results):
  function get_pred_idx (line 28) | def get_pred_idx(prediction, choices, options):

FILE: llava/eval/eval_science_qa_gpt4_requery.py
  function get_args (line 9) | def get_args():
  function convert_caps (line 21) | def convert_caps(results):
  function get_pred_idx (line 30) | def get_pred_idx(prediction, choices, options):

FILE: llava/eval/eval_textvqa.py
  function get_args (line 9) | def get_args():
  function prompt_processor (line 17) | def prompt_processor(prompt):
  function eval_single (line 35) | def eval_single(annotation_file, result_file):

FILE: llava/eval/generate_webpage_data_from_table.py
  function read_jsonl (line 10) | def read_jsonl(path: str, key: str=None):
  function trim_hanging_lines (line 23) | def trim_hanging_lines(s: str, n: int) -> str:

FILE: llava/eval/m4c_evaluator.py
  class EvalAIAnswerProcessor (line 7) | class EvalAIAnswerProcessor:
    method __init__ (line 178) | def __init__(self, *args, **kwargs):
    method word_tokenize (line 181) | def word_tokenize(self, word):
    method process_punctuation (line 186) | def process_punctuation(self, in_text):
    method process_digit_article (line 198) | def process_digit_article(self, in_text):
    method __call__ (line 213) | def __call__(self, item):
  class TextVQAAccuracyEvaluator (line 221) | class TextVQAAccuracyEvaluator:
    method __init__ (line 222) | def __init__(self):
    method _compute_answer_scores (line 225) | def _compute_answer_scores(self, raw_answers):
    method eval_pred_list (line 248) | def eval_pred_list(self, pred_list):
  class STVQAAccuracyEvaluator (line 260) | class STVQAAccuracyEvaluator:
    method __init__ (line 261) | def __init__(self):
    method eval_pred_list (line 264) | def eval_pred_list(self, pred_list):
  class STVQAANLSEvaluator (line 276) | class STVQAANLSEvaluator:
    method __init__ (line 277) | def __init__(self):
    method get_anls (line 282) | def get_anls(self, s1, s2):
    method eval_pred_list (line 289) | def eval_pred_list(self, pred_list):
  class TextCapsBleu4Evaluator (line 301) | class TextCapsBleu4Evaluator:
    method __init__ (line 302) | def __init__(self):
    method eval_pred_list (line 321) | def eval_pred_list(self, pred_list):

FILE: llava/eval/model_qa.py
  function eval_model (line 14) | def eval_model(model_name, questions_file, answers_file):

FILE: llava/eval/model_vqa.py
  function split_list (line 18) | def split_list(lst, n):
  function get_chunk (line 24) | def get_chunk(lst, n, k):
  function eval_model (line 29) | def eval_model(args):

FILE: llava/eval/model_vqa_loader.py
  function split_list (line 19) | def split_list(lst, n):
  function get_chunk (line 25) | def get_chunk(lst, n, k):
  class CustomDataset (line 31) | class CustomDataset(Dataset):
    method __init__ (line 32) | def __init__(self, questions, image_folder, tokenizer, image_processor...
    method __getitem__ (line 39) | def __getitem__(self, index):
    method __len__ (line 60) | def __len__(self):
  function collate_fn (line 64) | def collate_fn(batch):
  function create_data_loader (line 72) | def create_data_loader(questions, image_folder, tokenizer, image_process...
  function eval_model (line 79) | def eval_model(args):

FILE: llava/eval/model_vqa_mmbench.py
  function split_list (line 22) | def split_list(lst, n):
  function get_chunk (line 28) | def get_chunk(lst, n, k):
  function is_none (line 33) | def is_none(value):
  function get_options (line 44) | def get_options(row, options):
  function eval_model (line 54) | def eval_model(args):

FILE: llava/eval/model_vqa_science.py
  function split_list (line 18) | def split_list(lst, n):
  function get_chunk (line 24) | def get_chunk(lst, n, k):
  function eval_model (line 29) | def eval_model(args):

FILE: llava/eval/qa_baseline_gpt35.py
  function get_answer (line 16) | def get_answer(question_id: int, question: str, max_tokens: int):

FILE: llava/eval/run_llava.py
  function image_parser (line 28) | def image_parser(args):
  function load_image (line 33) | def load_image(image_file):
  function load_images (line 42) | def load_images(image_files):
  function eval_model (line 50) | def eval_model(args):

FILE: llava/eval/summarize_gpt_review.py
  function parse_args (line 9) | def parse_args():

FILE: llava/eval/webpage/script.js
  function text2Markdown (line 35) | function text2Markdown(text) {
  function capitalizeFirstChar (line 41) | function capitalizeFirstChar(str) {
  function updateQuestionSelect (line 48) | function updateQuestionSelect(question_id) {
  function updateModelSelect (line 64) | function updateModelSelect() {
  function populateModels (line 70) | function populateModels(models) {
  function populateQuestions (line 81) | function populateQuestions(questions) {
  function displayQuestion (line 110) | function displayQuestion(index) {
  function displayAnswers (line 116) | function displayAnswers(index) {
  function switchQuestionAndCategory (line 203) | function switchQuestionAndCategory() {
  function updateExpandButtonVisibility (line 226) | function updateExpandButtonVisibility(card) {

FILE: llava/mm_utils.py
  function select_best_resolution (line 12) | def select_best_resolution(original_size, possible_resolutions):
  function resize_and_pad_image (line 42) | def resize_and_pad_image(image, target_resolution):
  function divide_to_patches (line 77) | def divide_to_patches(image, patch_size):
  function get_anyres_image_grid_shape (line 99) | def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
  function process_anyres_image (line 119) | def process_anyres_image(image, processor, grid_pinpoints):
  function load_image_from_base64 (line 148) | def load_image_from_base64(image):
  function expand2square (line 152) | def expand2square(pil_img, background_color):
  function process_images (line 166) | def process_images(images, image_processor, model_cfg):
  function tokenizer_image_token (line 185) | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOK...
  function get_model_name_from_path (line 207) | def get_model_name_from_path(model_path):
  class KeywordsStoppingCriteria (line 215) | class KeywordsStoppingCriteria(StoppingCriteria):
    method __init__ (line 216) | def __init__(self, keywords, tokenizer, input_ids):
    method call_for_batch (line 230) | def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.F...
    method __call__ (line 243) | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTe...

FILE: llava/model/apply_delta.py
  function apply_delta (line 13) | def apply_delta(base_model_path, target_model_path, delta_path):

FILE: llava/model/builder.py
  function load_pretrained_model (line 26) | def load_pretrained_model(model_path, model_base, model_name, load_8bit=...

FILE: llava/model/consolidate.py
  function consolidate_ckpt (line 13) | def consolidate_ckpt(src_path, dst_path):

FILE: llava/model/language_model/llava_llama.py
  class LlavaConfig (line 30) | class LlavaConfig(LlamaConfig):
  class LlavaLlamaModel (line 34) | class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
    method __init__ (line 37) | def __init__(self, config: LlamaConfig):
  class LlavaLlamaForCausalLM (line 41) | class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 44) | def __init__(self, config):
    method get_model (line 54) | def get_model(self):
    method forward (line 57) | def forward(
    method generate (line 105) | def generate(
    method prepare_inputs_for_generation (line 144) | def prepare_inputs_for_generation(self, input_ids, past_key_values=None,

FILE: llava/model/language_model/llava_mistral.py
  class LlavaMistralConfig (line 31) | class LlavaMistralConfig(MistralConfig):
  class LlavaMistralModel (line 35) | class LlavaMistralModel(LlavaMetaModel, MistralModel):
    method __init__ (line 38) | def __init__(self, config: MistralConfig):
  class LlavaMistralForCausalLM (line 42) | class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 45) | def __init__(self, config):
    method get_model (line 54) | def get_model(self):
    method forward (line 57) | def forward(
    method generate (line 105) | def generate(
    method prepare_inputs_for_generation (line 144) | def prepare_inputs_for_generation(self, input_ids, past_key_values=None,

FILE: llava/model/language_model/llava_mpt.py
  class LlavaMptConfig (line 25) | class LlavaMptConfig(MptConfig):
  class LlavaMptModel (line 29) | class LlavaMptModel(LlavaMetaModel, MptModel):
    method __init__ (line 32) | def __init__(self, config: MptConfig):
    method embed_tokens (line 36) | def embed_tokens(self, x):
  class LlavaMptForCausalLM (line 40) | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM):
    method __init__ (line 44) | def __init__(self, config):
    method get_model (line 53) | def get_model(self):
    method _set_gradient_checkpointing (line 56) | def _set_gradient_checkpointing(self, module, value=False):
    method forward (line 60) | def forward(
    method prepare_inputs_for_generation (line 87) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...

FILE: llava/model/llava_arch.py
  class LlavaMetaModel (line 29) | class LlavaMetaModel:
    method __init__ (line 31) | def __init__(self, config):
    method get_vision_tower (line 43) | def get_vision_tower(self):
    method initialize_vision_modules (line 49) | def initialize_vision_modules(self, model_args, fsdp=None):
  function unpad_image (line 100) | def unpad_image(tensor, original_size):
  class LlavaMetaForCausalLM (line 131) | class LlavaMetaForCausalLM(ABC):
    method get_model (line 134) | def get_model(self):
    method get_vision_tower (line 137) | def get_vision_tower(self):
    method encode_images (line 140) | def encode_images(self, images):
    method prepare_inputs_labels_for_multimodal (line 145) | def prepare_inputs_labels_for_multimodal(
    method initialize_vision_tokenizer (line 326) | def initialize_vision_tokenizer(self, model_args, tokenizer):

FILE: llava/model/make_delta.py
  function make_delta (line 13) | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_...

FILE: llava/model/multimodal_encoder/builder.py
  function build_vision_tower (line 5) | def build_vision_tower(vision_tower_cfg, **kwargs):

FILE: llava/model/multimodal_encoder/clip_encoder.py
  class CLIPVisionTower (line 7) | class CLIPVisionTower(nn.Module):
    method __init__ (line 8) | def __init__(self, vision_tower, args, delay_load=False):
    method load_model (line 24) | def load_model(self, device_map=None):
    method feature_select (line 35) | def feature_select(self, image_forward_outs):
    method forward (line 46) | def forward(self, images):
    method dummy_feature (line 60) | def dummy_feature(self):
    method dtype (line 64) | def dtype(self):
    method device (line 68) | def device(self):
    method config (line 72) | def config(self):
    method hidden_size (line 79) | def hidden_size(self):
    method num_patches_per_side (line 83) | def num_patches_per_side(self):
    method num_patches (line 87) | def num_patches(self):
  class CLIPVisionTowerS2 (line 92) | class CLIPVisionTowerS2(CLIPVisionTower):
    method __init__ (line 93) | def __init__(self, vision_tower, args, delay_load=False):
    method load_model (line 113) | def load_model(self, device_map=None):
    method forward_feature (line 128) | def forward_feature(self, images):
    method forward (line 134) | def forward(self, images):
    method hidden_size (line 146) | def hidden_size(self):

FILE: llava/model/multimodal_projector/builder.py
  class IdentityMap (line 6) | class IdentityMap(nn.Module):
    method __init__ (line 7) | def __init__(self):
    method forward (line 10) | def forward(self, x, *args, **kwargs):
    method config (line 14) | def config(self):
  class SimpleResBlock (line 18) | class SimpleResBlock(nn.Module):
    method __init__ (line 19) | def __init__(self, channels):
    method forward (line 28) | def forward(self, x):
  function build_vision_projector (line 33) | def build_vision_projector(config, delay_load=False, **kwargs):

FILE: llava/model/utils.py
  function auto_upgrade (line 4) | def auto_upgrade(config):

FILE: llava/serve/cli.py
  function load_image (line 18) | def load_image(image_file):
  function main (line 27) | def main(args):

FILE: llava/serve/controller.py
  class DispatchMethod (line 28) | class DispatchMethod(Enum):
    method from_str (line 33) | def from_str(cls, name):
  class WorkerInfo (line 43) | class WorkerInfo:
  function heart_beat_controller (line 51) | def heart_beat_controller(controller):
  class Controller (line 57) | class Controller:
    method __init__ (line 58) | def __init__(self, dispatch_method: str):
    method register_worker (line 69) | def register_worker(self, worker_name: str, check_heart_beat: bool,
    method get_worker_status (line 88) | def get_worker_status(self, worker_name: str):
    method remove_worker (line 101) | def remove_worker(self, worker_name: str):
    method refresh_all_workers (line 104) | def refresh_all_workers(self):
    method list_models (line 112) | def list_models(self):
    method get_worker_address (line 120) | def get_worker_address(self, model_name: str):
    method receive_heart_beat (line 173) | def receive_heart_beat(self, worker_name: str, queue_length: int):
    method remove_stable_workers_by_expiration (line 183) | def remove_stable_workers_by_expiration(self):
    method worker_api_generate_stream (line 193) | def worker_api_generate_stream(self, params):
    method worker_api_get_status (line 220) | def worker_api_get_status(self):
  function register_worker (line 243) | async def register_worker(request: Request):
  function refresh_all_workers (line 251) | async def refresh_all_workers():
  function list_models (line 256) | async def list_models():
  function get_worker_address (line 262) | async def get_worker_address(request: Request):
  function receive_heart_beat (line 269) | async def receive_heart_beat(request: Request):
  function worker_api_generate_stream (line 277) | async def worker_api_generate_stream(request: Request):
  function worker_api_get_status (line 284) | async def worker_api_get_status(request: Request):

FILE: llava/serve/gradio_web_server.py
  function get_conv_log_filename (line 32) | def get_conv_log_filename():
  function get_model_list (line 38) | def get_model_list():
  function load_demo (line 58) | def load_demo(url_params, request: gr.Request):
  function load_demo_refresh_model_list (line 71) | def load_demo_refresh_model_list(request: gr.Request):
  function vote_last_response (line 82) | def vote_last_response(state, vote_type, model_selector, request: gr.Req...
  function upvote_last_response (line 94) | def upvote_last_response(state, model_selector, request: gr.Request):
  function downvote_last_response (line 100) | def downvote_last_response(state, model_selector, request: gr.Request):
  function flag_last_response (line 106) | def flag_last_response(state, model_selector, request: gr.Request):
  function regenerate (line 112) | def regenerate(state, image_process_mode, request: gr.Request):
  function clear_history (line 122) | def clear_history(request: gr.Request):
  function add_text (line 128) | def add_text(state, text, image, image_process_mode, request: gr.Request):
  function http_bot (line 154) | def http_bot(state, model_selector, temperature, top_p, max_new_tokens, ...
  function build_demo (line 315) | def build_demo(embed_mode, cur_dir=None, concurrency_count=10):

FILE: llava/serve/model_worker.py
  function heart_beat_worker (line 37) | def heart_beat_worker(controller):
  class ModelWorker (line 44) | class ModelWorker:
    method __init__ (line 45) | def __init__(self, controller_addr, worker_addr,
    method register_to_controller (line 75) | def register_to_controller(self):
    method send_heart_beat (line 87) | def send_heart_beat(self):
    method get_queue_length (line 108) | def get_queue_length(self):
    method get_status (line 115) | def get_status(self):
    method generate_stream (line 123) | def generate_stream(self, params):
    method generate_stream_gate (line 195) | def generate_stream_gate(self, params):
  function release_model_semaphore (line 225) | def release_model_semaphore(fn=None):
  function generate_stream (line 232) | async def generate_stream(request: Request):
  function get_status (line 248) | async def get_status(request: Request):

FILE: llava/serve/sglang_worker.py
  function heart_beat_worker (line 38) | def heart_beat_worker(controller):
  function pipeline (line 45) | def pipeline(s, prompt, max_tokens):
  class ModelWorker (line 54) | class ModelWorker:
    method __init__ (line 55) | def __init__(self, controller_addr, worker_addr, sgl_endpoint,
    method register_to_controller (line 85) | def register_to_controller(self):
    method send_heart_beat (line 97) | def send_heart_beat(self):
    method get_queue_length (line 118) | def get_queue_length(self):
    method get_status (line 125) | def get_status(self):
    method generate_stream (line 132) | async def generate_stream(self, params):
    method generate_stream_gate (line 172) | async def generate_stream_gate(self, params):
  function release_model_semaphore (line 195) | def release_model_semaphore(fn=None):
  function generate_stream (line 202) | async def generate_stream(request: Request):
  function get_status (line 218) | async def get_status(request: Request):

FILE: llava/serve/test_message.py
  function main (line 9) | def main():

FILE: llava/train/llama_flash_attn_monkey_patch.py
  function forward (line 16) | def forward(
  function _prepare_decoder_attention_mask (line 98) | def _prepare_decoder_attention_mask(
  function replace_llama_attn_with_flash_attn (line 105) | def replace_llama_attn_with_flash_attn():

FILE: llava/train/llama_xformers_attn_monkey_patch.py
  function replace_llama_attn_with_xformers_attn (line 19) | def replace_llama_attn_with_xformers_attn():
  function xformers_forward (line 23) | def xformers_forward(

FILE: llava/train/llava_trainer.py
  function maybe_zero_3 (line 18) | def maybe_zero_3(param, ignore_status=False, name=None):
  function get_mm_adapter_state_maybe_zero_3 (line 32) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
  function split_to_even_chunks (line 38) | def split_to_even_chunks(indices, lengths, num_chunks):
  function get_modality_length_grouped_indices (line 60) | def get_modality_length_grouped_indices(lengths, batch_size, world_size,...
  function get_length_grouped_indices (line 88) | def get_length_grouped_indices(lengths, batch_size, world_size, generato...
  class LengthGroupedSampler (line 99) | class LengthGroupedSampler(Sampler):
    method __init__ (line 105) | def __init__(
    method __len__ (line 122) | def __len__(self):
    method __iter__ (line 125) | def __iter__(self):
  class LLaVATrainer (line 133) | class LLaVATrainer(Trainer):
    method _get_train_sampler (line 135) | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
    method create_optimizer (line 150) | def create_optimizer(self):
    method _save_checkpoint (line 230) | def _save_checkpoint(self, model, trial, metrics=None):
    method _save (line 251) | def _save(self, output_dir: Optional[str] = None, state_dict=None):

FILE: llava/train/train.py
  function rank0_print (line 44) | def rank0_print(*args):
  class ModelArguments (line 54) | class ModelArguments:
  class DataArguments (line 70) | class DataArguments:
  class TrainingArguments (line 80) | class TrainingArguments(transformers.TrainingArguments):
  function maybe_zero_3 (line 115) | def maybe_zero_3(param, ignore_status=False, name=None):
  function get_peft_state_maybe_zero_3 (line 130) | def get_peft_state_maybe_zero_3(named_params, bias):
  function get_peft_state_non_lora_maybe_zero_3 (line 155) | def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only...
  function get_mm_adapter_state_maybe_zero_3 (line 163) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
  function find_all_linear_names (line 169) | def find_all_linear_names(model):
  function safe_save_model_for_hf_trainer (line 185) | def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
  function smart_tokenizer_and_embedding_resize (line 224) | def smart_tokenizer_and_embedding_resize(
  function _tokenize_fn (line 249) | def _tokenize_fn(strings: Sequence[str],
  function _mask_targets (line 276) | def _mask_targets(target, tokenized_lens, speakers):
  function _add_speaker_and_signal (line 287) | def _add_speaker_and_signal(header, source, get_conversation=True):
  function preprocess_multimodal (line 308) | def preprocess_multimodal(
  function preprocess_llama_2 (line 332) | def preprocess_llama_2(
  function preprocess_v1 (line 414) | def preprocess_v1(
  function preprocess_mpt (line 500) | def preprocess_mpt(
  function preprocess_plain (line 588) | def preprocess_plain(
  function preprocess (line 610) | def preprocess(
  class LazySupervisedDataset (line 658) | class LazySupervisedDataset(Dataset):
    method __init__ (line 661) | def __init__(self, data_path: str,
    method __len__ (line 672) | def __len__(self):
    method lengths (line 676) | def lengths(self):
    method modality_lengths (line 684) | def modality_lengths(self):
    method __getitem__ (line 692) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
  class DataCollatorForSupervisedDataset (line 743) | class DataCollatorForSupervisedDataset(object):
    method __call__ (line 748) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
  function make_supervised_data_module (line 776) | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokeni...
  function train (line 788) | def train(attn_implementation=None):

FILE: llava/utils.py
  function build_logger (line 17) | def build_logger(logger_name, logger_filename):
  class StreamToLogger (line 60) | class StreamToLogger(object):
    method __init__ (line 64) | def __init__(self, logger, log_level=logging.INFO):
    method __getattr__ (line 70) | def __getattr__(self, attr):
    method write (line 73) | def write(self, buf):
    method flush (line 87) | def flush(self):
  function disable_torch_init (line 93) | def disable_torch_init():
  function violates_moderation (line 102) | def violates_moderation(text):
  function pretty_print_semaphore (line 123) | def pretty_print_semaphore(semaphore):

FILE: predict.py
  function download_json (line 54) | def download_json(url: str, dest: Path):
  function download_weights (line 62) | def download_weights(baseurl: str, basedest: str, files: list[str]):
  class Predictor (line 78) | class Predictor(BasePredictor):
    method setup (line 79) | def setup(self) -> None:
    method predict (line 87) | def predict(
  function load_image (line 148) | def load_image(image_file):

FILE: scripts/convert_mmbench_for_submission.py
  function get_args (line 6) | def get_args():

FILE: scripts/convert_seed_for_submission.py
  function get_args (line 6) | def get_args():
  function eval_single (line 14) | def eval_single(result_file, eval_only_type=None):

FILE: scripts/convert_sqa_to_llava.py
  function convert_to_llava (line 8) | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"):
  function convert_to_jsonl (line 49) | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"):
  function main (line 83) | def main(task, **kwargs):

FILE: scripts/convert_sqa_to_llava_base_prompt.py
  function get_question_text (line 1) | def get_question_text(problem):
  function get_context_text (line 6) | def get_context_text(problem, use_caption):
  function get_choice_text (line 15) | def get_choice_text(probelm, options):
  function get_answer (line 25) | def get_answer(problem, options):
  function get_lecture_text (line 29) | def get_lecture_text(problem):
  function get_solution_text (line 35) | def get_solution_text(problem):
  function create_one_example_chatbot (line 41) | def create_one_example_chatbot(format, question, context, choice, answer...
  function create_one_example (line 106) | def create_one_example(format, question, context, choice, answer, lectur...
  function create_one_example_gpt4 (line 162) | def create_one_example_gpt4(format, question, context, choice, answer, l...
  function build_prompt_chatbot (line 221) | def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption...
  function build_prompt (line 244) | def build_prompt(problems, shot_qids, test_qid, args):
  function build_prompt_gpt4 (line 291) | def build_prompt_gpt4(problems, shot_qids, test_qid, args):

FILE: scripts/convert_vizwiz_for_submission.py
  function parse_args (line 8) | def parse_args():

FILE: scripts/convert_vqav2_for_submission.py
  function parse_args (line 8) | def parse_args():

FILE: scripts/extract_mm_projector.py
  function parse_args (line 15) | def parse_args():

FILE: scripts/merge_lora_weights.py
  function merge_lora (line 6) | def merge_lora(args):
Condensed preview — 140 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (496K chars).
[
  {
    "path": ".devcontainer/Dockerfile",
    "chars": 1566,
    "preview": "FROM mcr.microsoft.com/devcontainers/base:ubuntu-20.04\n\nSHELL [ \"bash\", \"-c\" ]\n\n# update apt and install packages\nRUN ap"
  },
  {
    "path": ".devcontainer/devcontainer.env",
    "chars": 60,
    "preview": "SAMPLE_ENV_VAR1=\"Sample Value\"\nSAMPLE_ENV_VAR2=332431bf-68bf"
  },
  {
    "path": ".devcontainer/devcontainer.json",
    "chars": 2484,
    "preview": "{\n    \"name\": \"LLaVA\",\n    \"build\": {\n        \"dockerfile\": \"Dockerfile\",\n        \"context\": \"..\",\n        \"args\": {}\n  "
  },
  {
    "path": ".devcontainer/postCreateCommand.sh",
    "chars": 1466,
    "preview": "git config --global safe.directory '*'\ngit config --global core.editor \"code --wait\"\ngit config --global pager.branch fa"
  },
  {
    "path": ".dockerignore",
    "chars": 359,
    "preview": "# The .dockerignore file excludes files from the container build process.\n#\n# https://docs.docker.com/engine/reference/b"
  },
  {
    "path": ".editorconfig",
    "chars": 319,
    "preview": "root = true\n\n# Unix-style newlines with a newline ending every file\n[*]\nend_of_line = lf\ninsert_final_newline = true\ntri"
  },
  {
    "path": ".gitattributes",
    "chars": 772,
    "preview": "# https://git-scm.com/docs/gitattributes\n\n# Set the default behavior, in case people don't have core.autocrlf set.\n# htt"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/1-usage.yaml",
    "chars": 968,
    "preview": "name: Usage issues\ndescription: Report issues in usage.\ntitle: \"[Usage] \"\nbody:\n  - type: markdown\n    attributes:\n     "
  },
  {
    "path": ".github/ISSUE_TEMPLATE/2-feature-request.yaml",
    "chars": 364,
    "preview": "name: Feature Request\ndescription: Request for a new feature\ntitle: \"[Feature request] \"\nbody:\n  - type: markdown\n    at"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/3-question.yaml",
    "chars": 528,
    "preview": "name: Questions\ndescription: General questions about the work\ntitle: \"[Question] \"\nbody:\n  - type: markdown\n    attribut"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/4-discussion.yaml",
    "chars": 540,
    "preview": "name: Discussions\ndescription: General discussions about the work\ntitle: \"[Discussion] \"\nbody:\n  - type: markdown\n    at"
  },
  {
    "path": ".gitignore",
    "chars": 278,
    "preview": "# Python\n__pycache__\n*.pyc\n*.egg-info\ndist\n\n# Log\n*.log\n*.log.*\n*.json\n*.jsonl\n\n# Data\n!**/alpaca-data-conversation.json"
  },
  {
    "path": "LICENSE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README.md",
    "chars": 30057,
    "preview": "# 🌋 LLaVA: Large Language and Vision Assistant\n\n*Visual instruction tuning towards large language and vision models with"
  },
  {
    "path": "cog.yaml",
    "chars": 977,
    "preview": "# Configuration for Cog ⚙️\n# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md\n\nbuild:\n  gpu: true\n\n  p"
  },
  {
    "path": "docs/Customize_Component.md",
    "chars": 1325,
    "preview": "# Customize Components in LLaVA\n\nThis is an initial guide on how to replace the LLMs, visual encoders, etc. with your ch"
  },
  {
    "path": "docs/Data.md",
    "chars": 3219,
    "preview": "## Data\n\n| Data file name | Size |\n| --- | ---: |\n| [llava_instruct_150k.json](https://huggingface.co/datasets/liuhaotia"
  },
  {
    "path": "docs/Evaluation.md",
    "chars": 8574,
    "preview": "# Evaluation\n\nIn LLaVA-1.5, we evaluate models on a diverse set of 12 benchmarks. To ensure the reproducibility, we eval"
  },
  {
    "path": "docs/Finetune_Custom_Data.md",
    "chars": 1520,
    "preview": "# Finetune LLaVA on Custom Datasets\n\n## Dataset Format\n\nConvert your data to a JSON file of a List of all samples. Sampl"
  },
  {
    "path": "docs/Intel.md",
    "chars": 508,
    "preview": "# Intel Platforms \n\n* Support [Intel GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-g"
  },
  {
    "path": "docs/LLaVA_Bench.md",
    "chars": 3484,
    "preview": "# LLaVA-Bench [[Download](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild)]\n\n**-Introduction-**  Larg"
  },
  {
    "path": "docs/LLaVA_from_LLaMA2.md",
    "chars": 2325,
    "preview": "# LLaVA (based on Llama 2 LLM, Preview)\n\n*NOTE: This is a technical preview. We are still running hyperparameter search,"
  },
  {
    "path": "docs/LoRA.md",
    "chars": 3656,
    "preview": "# LLaVA (LoRA, Preview)\n\nNOTE: This is a technical preview, and is not yet ready for production use. We are still runnin"
  },
  {
    "path": "docs/MODEL_ZOO.md",
    "chars": 12047,
    "preview": "# Model Zoo\n\n**To Use LLaVA-1.6 checkpoints, your llava package version must be newer than 1.2.0. [Instructions](https:/"
  },
  {
    "path": "docs/ScienceQA.md",
    "chars": 2329,
    "preview": "### ScienceQA\n\n#### Prepare Data\n1. Please see ScienceQA [repo](https://github.com/lupantech/ScienceQA) for setting up t"
  },
  {
    "path": "docs/Windows.md",
    "chars": 957,
    "preview": "# Run LLaVA on Windows\n\n*NOTE: LLaVA on Windows is not fully supported. Currently we only support 16-bit inference. For "
  },
  {
    "path": "docs/macOS.md",
    "chars": 814,
    "preview": "# Run LLaVA on macOS\n\n*NOTE: LLaVA on macOS is not fully supported. Currently we only support 16-bit inference. More fun"
  },
  {
    "path": "llava/__init__.py",
    "chars": 41,
    "preview": "from .model import LlavaLlamaForCausalLM\n"
  },
  {
    "path": "llava/constants.py",
    "chars": 335,
    "preview": "CONTROLLER_HEART_BEAT_EXPIRATION = 30\nWORKER_HEART_BEAT_INTERVAL = 15\n\nLOGDIR = \".\"\n\n# Model Constants\nIGNORE_INDEX = -1"
  },
  {
    "path": "llava/conversation.py",
    "chars": 15022,
    "preview": "import dataclasses\nfrom enum import auto, Enum\nfrom typing import List, Tuple\nimport base64\nfrom io import BytesIO\nfrom "
  },
  {
    "path": "llava/eval/eval_gpt_review.py",
    "chars": 3620,
    "preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport tqdm\nimport ray\nimport time\n\nNUM_SECONDS_TO_SLEEP = 3\n\n@ray."
  },
  {
    "path": "llava/eval/eval_gpt_review_bench.py",
    "chars": 4172,
    "preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport time\n\nNUM_SECONDS_TO_SLEEP = 0.5\n\n\ndef get_eval(content: str"
  },
  {
    "path": "llava/eval/eval_gpt_review_visual.py",
    "chars": 4177,
    "preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport time\n\nNUM_SECONDS_TO_SLEEP = 0.5\n\n\ndef get_eval(content: str"
  },
  {
    "path": "llava/eval/eval_pope.py",
    "chars": 2732,
    "preview": "import os\nimport json\nimport argparse\n\ndef eval_pope(answers, label_file):\n    label_list = [json.loads(q)['label'] for "
  },
  {
    "path": "llava/eval/eval_science_qa.py",
    "chars": 3920,
    "preview": "import argparse\nimport json\nimport os\nimport re\nimport random\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n "
  },
  {
    "path": "llava/eval/eval_science_qa_gpt4.py",
    "chars": 3675,
    "preview": "import argparse\nimport json\nimport os\nimport re\nimport random\nfrom collections import defaultdict\n\n\ndef get_args():\n    "
  },
  {
    "path": "llava/eval/eval_science_qa_gpt4_requery.py",
    "chars": 5774,
    "preview": "import argparse\nimport json\nimport os\nimport re\nimport random\nfrom collections import defaultdict\n\n\ndef get_args():\n    "
  },
  {
    "path": "llava/eval/eval_textvqa.py",
    "chars": 2226,
    "preview": "import os\nimport argparse\nimport json\nimport re\n\nfrom llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator\n\n\ndef get"
  },
  {
    "path": "llava/eval/generate_webpage_data_from_table.py",
    "chars": 4088,
    "preview": "\"\"\"Generate json file for webpage.\"\"\"\nimport json\nimport os\nimport re\n\n# models = ['llama', 'alpaca', 'gpt35', 'bard']\nm"
  },
  {
    "path": "llava/eval/m4c_evaluator.py",
    "chars": 10265,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport re\n\nfrom tqdm import tqdm\n\n\nclass EvalAIAnswerProcessor:\n    \""
  },
  {
    "path": "llava/eval/model_qa.py",
    "chars": 2430,
    "preview": "import argparse\nfrom transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria\nimport torch\nimport os\nim"
  },
  {
    "path": "llava/eval/model_vqa.py",
    "chars": 4115,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llava.constants import I"
  },
  {
    "path": "llava/eval/model_vqa_loader.py",
    "chars": 5975,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llava.constants import I"
  },
  {
    "path": "llava/eval/model_vqa_mmbench.py",
    "chars": 6408,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nimport pandas as pd\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llav"
  },
  {
    "path": "llava/eval/model_vqa_science.py",
    "chars": 4592,
    "preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llava.constants import I"
  },
  {
    "path": "llava/eval/qa_baseline_gpt35.py",
    "chars": 2345,
    "preview": "\"\"\"Generate answers with GPT-3.5\"\"\"\n# Note: you need to be using OpenAI Python v0.27.0 for the code below to work\nimport"
  },
  {
    "path": "llava/eval/run_llava.py",
    "chars": 4443,
    "preview": "import argparse\nimport torch\n\nfrom llava.constants import (\n    IMAGE_TOKEN_INDEX,\n    DEFAULT_IMAGE_TOKEN,\n    DEFAULT_"
  },
  {
    "path": "llava/eval/summarize_gpt_review.py",
    "chars": 2438,
    "preview": "import json\nimport os\nfrom collections import defaultdict\n\nimport numpy as np\n\nimport argparse\n\ndef parse_args():\n    pa"
  },
  {
    "path": "llava/eval/webpage/index.html",
    "chars": 7664,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n    <meta charset=\"UTF-8\">\n    <meta name=\"viewport\" content=\"width=device-width"
  },
  {
    "path": "llava/eval/webpage/script.js",
    "chars": 9967,
    "preview": "// Description: Script for the evaluation webpage.\n\nlet currentQuestionIndex = 1;\n\n// Store the model name mapping for l"
  },
  {
    "path": "llava/eval/webpage/styles.css",
    "chars": 1822,
    "preview": "body {\n    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;\n    background-color: #f8f9fa;\n}\n\n.navbar-dark "
  },
  {
    "path": "llava/mm_utils.py",
    "chars": 9555,
    "preview": "from PIL import Image\nfrom io import BytesIO\nimport base64\nimport torch\nimport math\nimport ast\n\nfrom transformers import"
  },
  {
    "path": "llava/model/__init__.py",
    "chars": 269,
    "preview": "try:\n    from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig\n    from .language_model.llava_mpt i"
  },
  {
    "path": "llava/model/apply_delta.py",
    "chars": 1956,
    "preview": "\"\"\"\nUsage:\npython3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --de"
  },
  {
    "path": "llava/model/builder.py",
    "chars": 8761,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llava/model/consolidate.py",
    "chars": 914,
    "preview": "\"\"\"\nUsage:\npython3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate\n"
  },
  {
    "path": "llava/model/language_model/llava_llama.py",
    "chars": 5378,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llava/model/language_model/llava_mistral.py",
    "chars": 5386,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llava/model/language_model/llava_mpt.py",
    "chars": 3487,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llava/model/llava_arch.py",
    "chars": 18110,
    "preview": "#    Copyright 2023 Haotian Liu\n#\n#    Licensed under the Apache License, Version 2.0 (the \"License\");\n#    you may not "
  },
  {
    "path": "llava/model/make_delta.py",
    "chars": 2257,
    "preview": "\"\"\"\nUsage:\npython3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~"
  },
  {
    "path": "llava/model/multimodal_encoder/builder.py",
    "chars": 748,
    "preview": "import os\nfrom .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2\n\n\ndef build_vision_tower(vision_tower_cfg, **kwar"
  },
  {
    "path": "llava/model/multimodal_encoder/clip_encoder.py",
    "chars": 5775,
    "preview": "import torch\nimport torch.nn as nn\n\nfrom transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig\n\n\ncla"
  },
  {
    "path": "llava/model/multimodal_projector/builder.py",
    "chars": 1437,
    "preview": "import torch\nimport torch.nn as nn\nimport re\n\n\nclass IdentityMap(nn.Module):\n    def __init__(self):\n        super().__i"
  },
  {
    "path": "llava/model/utils.py",
    "chars": 927,
    "preview": "from transformers import AutoConfig\n\n\ndef auto_upgrade(config):\n    cfg = AutoConfig.from_pretrained(config)\n    if 'lla"
  },
  {
    "path": "llava/serve/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "llava/serve/cli.py",
    "chars": 4718,
    "preview": "import argparse\nimport torch\n\nfrom llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN"
  },
  {
    "path": "llava/serve/controller.py",
    "chars": 9949,
    "preview": "\"\"\"\nA controller manages distributed workers.\nIt sends worker addresses to clients.\n\"\"\"\nimport argparse\nimport asyncio\ni"
  },
  {
    "path": "llava/serve/gradio_web_server.py",
    "chars": 18841,
    "preview": "import argparse\nimport datetime\nimport json\nimport os\nimport time\n\nimport gradio as gr\nimport requests\n\nfrom llava.conve"
  },
  {
    "path": "llava/serve/model_worker.py",
    "chars": 11176,
    "preview": "\"\"\"\nA model worker executes the model.\n\"\"\"\nimport argparse\nimport asyncio\nimport json\nimport time\nimport threading\nimpor"
  },
  {
    "path": "llava/serve/register_worker.py",
    "chars": 734,
    "preview": "\"\"\"\nManually register workers.\n\nUsage:\npython3 -m fastchat.serve.register_worker --controller http://localhost:21001 --w"
  },
  {
    "path": "llava/serve/sglang_worker.py",
    "chars": 8678,
    "preview": "\"\"\"\nA model worker executes the model.\n\"\"\"\nimport argparse\nimport asyncio\nfrom concurrent.futures import ThreadPoolExecu"
  },
  {
    "path": "llava/serve/test_message.py",
    "chars": 2022,
    "preview": "import argparse\nimport json\n\nimport requests\n\nfrom llava.conversation import default_conversation\n\n\ndef main():\n    if a"
  },
  {
    "path": "llava/train/llama_flash_attn_monkey_patch.py",
    "chars": 4404,
    "preview": "from typing import Optional, Tuple\nimport warnings\n\nimport torch\n\nimport transformers\nfrom transformers.models.llama.mod"
  },
  {
    "path": "llava/train/llama_xformers_attn_monkey_patch.py",
    "chars": 4916,
    "preview": "\"\"\"\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_a"
  },
  {
    "path": "llava/train/llava_trainer.py",
    "chars": 11076,
    "preview": "import os\nimport torch\nimport torch.nn as nn\n\nfrom torch.utils.data import Sampler\n\nfrom transformers import Trainer\nfro"
  },
  {
    "path": "llava/train/train.py",
    "chars": 38414,
    "preview": "# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:\n# Adopted from tatsu-lab@stanford_al"
  },
  {
    "path": "llava/train/train_mem.py",
    "chars": 115,
    "preview": "from llava.train.train import train\n\nif __name__ == \"__main__\":\n    train(attn_implementation=\"flash_attention_2\")\n"
  },
  {
    "path": "llava/train/train_xformers.py",
    "chars": 366,
    "preview": "# Make it more memory efficient by monkey patching the LLaMA model with xformers attention.\n\n# Need to call this before "
  },
  {
    "path": "llava/utils.py",
    "chars": 4003,
    "preview": "import datetime\nimport logging\nimport logging.handlers\nimport os\nimport sys\n\nimport requests\n\nfrom llava.constants impor"
  },
  {
    "path": "playground/data/prompts/complex_reasoning/000_caps.txt",
    "chars": 633,
    "preview": "A man wearing multiple neck ties making a goofy face.\nA man in a white shirt wearing very many ties.\na man with ties on "
  },
  {
    "path": "playground/data/prompts/complex_reasoning/000_conv.txt",
    "chars": 376,
    "preview": "Question:\nWhat is unusual about this photo?\n===\nAnswer:\nIn the photo, the man is wearing a total of ten ties around his "
  },
  {
    "path": "playground/data/prompts/complex_reasoning/001_caps.txt",
    "chars": 759,
    "preview": "A group of people standing outside of a black vehicle with various luggage.\nLuggage surrounds a vehicle in an undergroun"
  },
  {
    "path": "playground/data/prompts/complex_reasoning/001_conv.txt",
    "chars": 781,
    "preview": "Question:\nWhat challenges do these people face?\n===\nAnswer:\nIn the image, a group of people is standing outside a black "
  },
  {
    "path": "playground/data/prompts/complex_reasoning/002_caps.txt",
    "chars": 306,
    "preview": "There is a movie theater that displays the show times above the doors.\nA red fire hydrant is deep in the snow.\nThe fire "
  },
  {
    "path": "playground/data/prompts/complex_reasoning/002_conv.txt",
    "chars": 977,
    "preview": "Question:\nWhat challenges might this city face?\n===\nAnswer:\nThe city faces challenges due to the harsh winter conditions"
  },
  {
    "path": "playground/data/prompts/complex_reasoning/system_message.txt",
    "chars": 1505,
    "preview": "You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same ima"
  },
  {
    "path": "playground/data/prompts/conversation/000_caps.txt",
    "chars": 263,
    "preview": "There is a movie theater that displays the show times above the doors.\nA red fire hydrant is deep in the snow.\nThe fire "
  },
  {
    "path": "playground/data/prompts/conversation/000_conv.txt",
    "chars": 2052,
    "preview": "Question:\nWhat color is the fire hydrant in the image?\n===\nAnswer:\nThe fire hydrant in the image is red.\n===\nQuestion:\nI"
  },
  {
    "path": "playground/data/prompts/conversation/001_caps.txt",
    "chars": 254,
    "preview": "A man is skiing in the open snow covered hills\nA skier is making his way into the snow.\nA skier on the foothills of a hu"
  },
  {
    "path": "playground/data/prompts/conversation/001_conv.txt",
    "chars": 3452,
    "preview": "Question:\nWhat is the skier doing?\n===\nAnswer:\nThe skier is skiing in the snow-covered hills, making his way through the"
  },
  {
    "path": "playground/data/prompts/conversation/system_message.txt",
    "chars": 1400,
    "preview": "You are an AI visual assistant, and you are seeing a single image. What you see are provided with five sentences, descri"
  },
  {
    "path": "playground/data/prompts/detail_description/000_caps.txt",
    "chars": 723,
    "preview": "A harbor filled with lots of boats next to a building.\nA bicycle parked in front of several boats at a dock.\nA red bicyc"
  },
  {
    "path": "playground/data/prompts/detail_description/000_conv.txt",
    "chars": 614,
    "preview": "It is a harbor filled with numerous boats of various sizes docked next to a long building. Among the boats, there are a "
  },
  {
    "path": "playground/data/prompts/detail_description/001_caps.txt",
    "chars": 759,
    "preview": "A group of people standing outside of a black vehicle with various luggage.\nLuggage surrounds a vehicle in an undergroun"
  },
  {
    "path": "playground/data/prompts/detail_description/001_conv.txt",
    "chars": 927,
    "preview": "The image is an underground parking area with a black sport utility vehicle (SUV) parked. There are three people in the "
  },
  {
    "path": "playground/data/prompts/detail_description/002_caps.txt",
    "chars": 610,
    "preview": "A man holds a Wii-mote above his head while another looks on.\nA guy and his friend are playing Nintendo Wii.\nA young man"
  },
  {
    "path": "playground/data/prompts/detail_description/002_conv.txt",
    "chars": 552,
    "preview": "The image shows two men standing in a room, engaged in playing a video game on a Nintendo Wii console. One of the men is"
  },
  {
    "path": "playground/data/prompts/detail_description/system_message.txt",
    "chars": 1017,
    "preview": "You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same ima"
  },
  {
    "path": "predict.py",
    "chars": 6080,
    "preview": "import torch\n\nfrom llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN\nfrom llava.conversation import conv_tem"
  },
  {
    "path": "pyproject.toml",
    "chars": 1256,
    "preview": "[build-system]\nrequires = [\"setuptools>=61.0\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"llava\"\nversion"
  },
  {
    "path": "scripts/convert_gqa_for_eval.py",
    "chars": 487,
    "preview": "import os\nimport json\nimport argparse\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\"--src\", type=str)\nparser."
  },
  {
    "path": "scripts/convert_mmbench_for_submission.py",
    "chars": 980,
    "preview": "import os\nimport json\nimport argparse\nimport pandas as pd\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    pa"
  },
  {
    "path": "scripts/convert_mmvet_for_eval.py",
    "chars": 397,
    "preview": "import os\nimport json\nimport argparse\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\"--src\", type=str)\nparser."
  },
  {
    "path": "scripts/convert_seed_for_submission.py",
    "chars": 2571,
    "preview": "import os\nimport json\nimport argparse\n\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\""
  },
  {
    "path": "scripts/convert_sqa_to_llava.py",
    "chars": 2971,
    "preview": "import json\nimport os\nimport fire\nimport re\nfrom convert_sqa_to_llava_base_prompt import build_prompt_chatbot\n\n\ndef conv"
  },
  {
    "path": "scripts/convert_sqa_to_llava_base_prompt.py",
    "chars": 13795,
    "preview": "def get_question_text(problem):\n    question = problem['question']\n    return question\n\n\ndef get_context_text(problem, u"
  },
  {
    "path": "scripts/convert_vizwiz_for_submission.py",
    "chars": 1387,
    "preview": "import os\nimport argparse\nimport json\n\nfrom llava.eval.m4c_evaluator import EvalAIAnswerProcessor\n\n\ndef parse_args():\n  "
  },
  {
    "path": "scripts/convert_vqav2_for_submission.py",
    "chars": 1743,
    "preview": "import os\nimport argparse\nimport json\n\nfrom llava.eval.m4c_evaluator import EvalAIAnswerProcessor\n\n\ndef parse_args():\n  "
  },
  {
    "path": "scripts/extract_mm_projector.py",
    "chars": 1606,
    "preview": "\"\"\"\nThis is just a utility that I use to extract the projector for quantized models.\nIt is NOT necessary at all to train"
  },
  {
    "path": "scripts/finetune.sh",
    "chars": 1642,
    "preview": "#!/bin/bash\n\n# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!\n\n# Uncomment and set t"
  },
  {
    "path": "scripts/finetune_full_schedule.sh",
    "chars": 1643,
    "preview": "#!/bin/bash\n\n# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!\n\n# Uncomment and set t"
  },
  {
    "path": "scripts/finetune_lora.sh",
    "chars": 1672,
    "preview": "#!/bin/bash\n\n# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!\n\n# Uncomment and set t"
  },
  {
    "path": "scripts/finetune_qlora.sh",
    "chars": 1687,
    "preview": "#!/bin/bash\n\n# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!\n\n# Uncomment and set t"
  },
  {
    "path": "scripts/finetune_sqa.sh",
    "chars": 1346,
    "preview": "#!/bin/bash\n\n# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!\n\ndeepspeed llava/train"
  },
  {
    "path": "scripts/merge_lora_weights.py",
    "chars": 767,
    "preview": "import argparse\nfrom llava.model.builder import load_pretrained_model\nfrom llava.mm_utils import get_model_name_from_pat"
  },
  {
    "path": "scripts/pretrain.sh",
    "chars": 1460,
    "preview": "#!/bin/bash\n\n# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!\n\n# Uncomment and set t"
  },
  {
    "path": "scripts/pretrain_xformers.sh",
    "chars": 1380,
    "preview": "#!/bin/bash\n\n# Uncomment and set the following variables correspondingly to run this script:\n\n# MODEL_VERSION=vicuna-v1-"
  },
  {
    "path": "scripts/sqa_eval_batch.sh",
    "chars": 524,
    "preview": "#!/bin/bash\n\nCHUNKS=8\nfor IDX in {0..7}; do\n    CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \\\n     "
  },
  {
    "path": "scripts/sqa_eval_gather.sh",
    "chars": 518,
    "preview": "#!/bin/bash\n\nCHUNKS=8\noutput_file=\"test_llava-13b.jsonl\"\n\n# Clear out the output file if it exists.\n> \"$output_file\"\n\n# "
  },
  {
    "path": "scripts/upload_pypi.sh",
    "chars": 387,
    "preview": "#!/bin/bash\n\n# Step 0: Clean up\nrm -rf dist\n\n# Step 1: Change the package name to \"llava-torch\"\nsed -i 's/name = \"llava\""
  },
  {
    "path": "scripts/v1_5/eval/gqa.sh",
    "chars": 1228,
    "preview": "#!/bin/bash\n\ngpu_list=\"${CUDA_VISIBLE_DEVICES:-0}\"\nIFS=',' read -ra GPULIST <<< \"$gpu_list\"\n\nCHUNKS=${#GPULIST[@]}\n\nCKPT"
  },
  {
    "path": "scripts/v1_5/eval/llavabench.sh",
    "chars": 1093,
    "preview": "#!/bin/bash\n\npython -m llava.eval.model_vqa \\\n    --model-path liuhaotian/llava-v1.5-13b \\\n    --question-file ./playgro"
  },
  {
    "path": "scripts/v1_5/eval/mmbench.sh",
    "chars": 704,
    "preview": "#!/bin/bash\n\nSPLIT=\"mmbench_dev_20230712\"\n\npython -m llava.eval.model_vqa_mmbench \\\n    --model-path liuhaotian/llava-v1"
  },
  {
    "path": "scripts/v1_5/eval/mmbench_cn.sh",
    "chars": 738,
    "preview": "#!/bin/bash\n\nSPLIT=\"mmbench_dev_cn_20231003\"\n\npython -m llava.eval.model_vqa_mmbench \\\n    --model-path liuhaotian/llava"
  },
  {
    "path": "scripts/v1_5/eval/mme.sh",
    "chars": 532,
    "preview": "#!/bin/bash\n\npython -m llava.eval.model_vqa_loader \\\n    --model-path liuhaotian/llava-v1.5-13b \\\n    --question-file ./"
  },
  {
    "path": "scripts/v1_5/eval/mmvet.sh",
    "chars": 580,
    "preview": "#!/bin/bash\n\npython -m llava.eval.model_vqa \\\n    --model-path liuhaotian/llava-v1.5-13b \\\n    --question-file ./playgro"
  },
  {
    "path": "scripts/v1_5/eval/pope.sh",
    "chars": 590,
    "preview": "#!/bin/bash\n\npython -m llava.eval.model_vqa_loader \\\n    --model-path liuhaotian/llava-v1.5-13b \\\n    --question-file ./"
  },
  {
    "path": "scripts/v1_5/eval/qbench.sh",
    "chars": 578,
    "preview": "#!/bin/bash\n\nif [ \"$1\" = \"dev\" ]; then\n    echo \"Evaluating in 'dev' split.\"\nelif [ \"$1\" = \"test\" ]; then\n    echo \"Eval"
  },
  {
    "path": "scripts/v1_5/eval/qbench_zh.sh",
    "chars": 621,
    "preview": "#!/bin/bash\n\nif [ \"$1\" = \"dev\" ]; then\n    ZH_SPLIT=\"验证集\"\n    echo \"Evaluating in 'dev' split.\"\nelif [ \"$1\" = \"test\" ]; "
  },
  {
    "path": "scripts/v1_5/eval/seed.sh",
    "chars": 1264,
    "preview": "#!/bin/bash\n\ngpu_list=\"${CUDA_VISIBLE_DEVICES:-0}\"\nIFS=',' read -ra GPULIST <<< \"$gpu_list\"\n\nCHUNKS=${#GPULIST[@]}\n\nCKPT"
  },
  {
    "path": "scripts/v1_5/eval/sqa.sh",
    "chars": 749,
    "preview": "#!/bin/bash\n\npython -m llava.eval.model_vqa_science \\\n    --model-path liuhaotian/llava-v1.5-13b \\\n    --question-file ."
  },
  {
    "path": "scripts/v1_5/eval/textvqa.sh",
    "chars": 571,
    "preview": "#!/bin/bash\n\npython -m llava.eval.model_vqa_loader \\\n    --model-path liuhaotian/llava-v1.5-13b \\\n    --question-file ./"
  },
  {
    "path": "scripts/v1_5/eval/vizwiz.sh",
    "chars": 642,
    "preview": "#!/bin/bash\n\npython -m llava.eval.model_vqa_loader \\\n    --model-path liuhaotian/llava-v1.5-13b \\\n    --question-file ./"
  },
  {
    "path": "scripts/v1_5/eval/vqav2.sh",
    "chars": 1113,
    "preview": "#!/bin/bash\n\ngpu_list=\"${CUDA_VISIBLE_DEVICES:-0}\"\nIFS=',' read -ra GPULIST <<< \"$gpu_list\"\n\nCHUNKS=${#GPULIST[@]}\n\nCKPT"
  },
  {
    "path": "scripts/v1_5/finetune.sh",
    "chars": 1234,
    "preview": "#!/bin/bash\n\ndeepspeed llava/train/train_mem.py \\\n    --deepspeed ./scripts/zero3.json \\\n    --model_name_or_path lmsys/"
  },
  {
    "path": "scripts/v1_5/finetune_lora.sh",
    "chars": 1317,
    "preview": "#!/bin/bash\n\ndeepspeed llava/train/train_mem.py \\\n    --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr"
  },
  {
    "path": "scripts/v1_5/finetune_task.sh",
    "chars": 1156,
    "preview": "#!/bin/bash\n\ndeepspeed llava/train/train_mem.py \\\n    --deepspeed ./scripts/zero3.json \\\n    --model_name_or_path liuhao"
  },
  {
    "path": "scripts/v1_5/finetune_task_lora.sh",
    "chars": 1239,
    "preview": "#!/bin/bash\n\ndeepspeed llava/train/train_mem.py \\\n    --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr"
  },
  {
    "path": "scripts/v1_5/pretrain.sh",
    "chars": 1164,
    "preview": "#!/bin/bash\n\ndeepspeed llava/train/train_mem.py \\\n    --deepspeed ./scripts/zero2.json \\\n    --model_name_or_path lmsys/"
  }
]

About this extraction

This page contains the full source code of the haotian-liu/LLaVA GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 140 files (459.8 KB), approximately 119.2k tokens, and a symbol index with 322 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!