gitextract_tizp684z/

├── .gitignore
├── LICENSE
├── README.md
├── data/
│   ├── test/
│   │   ├── report/
│   │   │   ├── harvard_test.json
│   │   │   ├── iuxray_test.json
│   │   │   ├── mimic_test.json
│   │   │   ├── pmc-oa_test.json
│   │   │   └── quilt-1m_test.json
│   │   └── vqa/
│   │       ├── harvard_test.jsonl
│   │       ├── iuxray_test.jsonl
│   │       ├── mimic_test.jsonl
│   │       ├── pmc-oa_test.jsonl
│   │       └── quilt-1m_test.jsonl
│   └── training/
│       ├── alignment/
│       │   ├── ophthalmology/
│       │   │   ├── harvard_report.json
│       │   │   └── harvard_vqa.json
│       │   ├── pathology/
│       │   │   └── pathology_vqa.json
│       │   └── radiology/
│       │       ├── radiology_report.json
│       │       └── radiology_vqa.json
│       └── retriever/
│           ├── ophthalmology/
│           │   ├── harvard_train_7000.json
│           │   └── harvard_val_1000.json
│           ├── pathology/
│           │   ├── pathology_train.json
│           │   └── pathology_val.json
│           └── radiology/
│               ├── radiology_train.json
│               └── radiology_val.json
├── requirements.txt
├── scripts/
│   ├── finetune_clip.sh
│   ├── retrieve_clip_VQA.sh
│   ├── retrieve_clip_report.sh
│   └── train_dpo_2stages.sh
└── train/
    ├── dpo/
    │   ├── LICENSE
    │   ├── cog.yaml
    │   ├── dpo_trainer_2stages.py
    │   ├── llava/
    │   │   ├── __init__.py
    │   │   ├── constants.py
    │   │   ├── conversation.py
    │   │   ├── conversation_new.py
    │   │   ├── eval/
    │   │   │   ├── eval_gpt_review.py
    │   │   │   ├── eval_gpt_review_bench.py
    │   │   │   ├── eval_gpt_review_visual.py
    │   │   │   ├── eval_pope.py
    │   │   │   ├── eval_science_qa.py
    │   │   │   ├── eval_science_qa_gpt4.py
    │   │   │   ├── eval_science_qa_gpt4_requery.py
    │   │   │   ├── eval_textvqa.py
    │   │   │   ├── generate_webpage_data_from_table.py
    │   │   │   ├── m4c_evaluator.py
    │   │   │   ├── model_qa.py
    │   │   │   ├── model_vqa.py
    │   │   │   ├── model_vqa_loader.py
    │   │   │   ├── model_vqa_mmbench.py
    │   │   │   ├── model_vqa_science.py
    │   │   │   ├── qa_baseline_gpt35.py
    │   │   │   ├── run_llava.py
    │   │   │   ├── summarize_gpt_review.py
    │   │   │   ├── table/
    │   │   │   │   ├── answer/
    │   │   │   │   │   ├── answer_alpaca-13b.jsonl
    │   │   │   │   │   ├── answer_bard.jsonl
    │   │   │   │   │   ├── answer_gpt35.jsonl
    │   │   │   │   │   ├── answer_llama-13b.jsonl
    │   │   │   │   │   └── answer_vicuna-13b.jsonl
    │   │   │   │   ├── caps_boxes_coco2014_val_80.jsonl
    │   │   │   │   ├── model.jsonl
    │   │   │   │   ├── prompt.jsonl
    │   │   │   │   ├── question.jsonl
    │   │   │   │   ├── results/
    │   │   │   │   │   ├── test_sqa_llava_13b_v0.json
    │   │   │   │   │   └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json
    │   │   │   │   ├── review/
    │   │   │   │   │   ├── review_alpaca-13b_vicuna-13b.jsonl
    │   │   │   │   │   ├── review_bard_vicuna-13b.jsonl
    │   │   │   │   │   ├── review_gpt35_vicuna-13b.jsonl
    │   │   │   │   │   └── review_llama-13b_vicuna-13b.jsonl
    │   │   │   │   ├── reviewer.jsonl
    │   │   │   │   └── rule.json
    │   │   │   └── webpage/
    │   │   │       └── styles.css
    │   │   ├── mm_utils.py
    │   │   ├── model/
    │   │   │   ├── __init__.py
    │   │   │   ├── apply_delta.py
    │   │   │   ├── builder.py
    │   │   │   ├── consolidate.py
    │   │   │   ├── language_model/
    │   │   │   │   ├── llava_llama.py
    │   │   │   │   ├── llava_mistral.py
    │   │   │   │   └── llava_mpt.py
    │   │   │   ├── llava_arch.py
    │   │   │   ├── make_delta.py
    │   │   │   ├── multimodal_encoder/
    │   │   │   │   ├── builder.py
    │   │   │   │   └── clip_encoder.py
    │   │   │   ├── multimodal_projector/
    │   │   │   │   └── builder.py
    │   │   │   └── utils.py
    │   │   ├── serve/
    │   │   │   ├── __init__.py
    │   │   │   ├── cli.py
    │   │   │   ├── controller.py
    │   │   │   ├── gradio_web_server.py
    │   │   │   ├── model_worker.py
    │   │   │   ├── register_worker.py
    │   │   │   ├── sglang_worker.py
    │   │   │   └── test_message.py
    │   │   ├── train/
    │   │   │   ├── llama_flash_attn_monkey_patch.py
    │   │   │   ├── llama_xformers_attn_monkey_patch.py
    │   │   │   ├── llava_trainer.py
    │   │   │   ├── train.py
    │   │   │   ├── train_dpo.py
    │   │   │   ├── train_dpo_inherent.py
    │   │   │   ├── train_mem.py
    │   │   │   └── train_xformers.py
    │   │   └── utils.py
    │   ├── llava_trainer_2stages.py
    │   ├── povid_infer.py
    │   ├── predict.py
    │   ├── pyproject.toml
    │   ├── scripts/
    │   │   ├── convert_gqa_for_eval.py
    │   │   ├── convert_mmbench_for_submission.py
    │   │   ├── convert_mmvet_for_eval.py
    │   │   ├── convert_seed_for_submission.py
    │   │   ├── convert_sqa_to_llava.py
    │   │   ├── convert_sqa_to_llava_base_prompt.py
    │   │   ├── convert_vizwiz_for_submission.py
    │   │   ├── convert_vqav2_for_submission.py
    │   │   ├── extract_mm_projector.py
    │   │   ├── finetune.sh
    │   │   ├── finetune_full_schedule.sh
    │   │   ├── finetune_lora.sh
    │   │   ├── finetune_qlora.sh
    │   │   ├── finetune_sqa.sh
    │   │   ├── merge_lora_weights.py
    │   │   ├── pretrain.sh
    │   │   ├── pretrain_xformers.sh
    │   │   ├── run_povid.sh
    │   │   ├── sqa_eval_batch.sh
    │   │   ├── sqa_eval_gather.sh
    │   │   ├── upload_pypi.sh
    │   │   ├── v1_5/
    │   │   │   ├── eval/
    │   │   │   │   ├── gqa.sh
    │   │   │   │   ├── llavabench.sh
    │   │   │   │   ├── mmbench.sh
    │   │   │   │   ├── mmbench_cn.sh
    │   │   │   │   ├── mme.sh
    │   │   │   │   ├── mmvet.sh
    │   │   │   │   ├── pope.sh
    │   │   │   │   ├── qbench.sh
    │   │   │   │   ├── qbench_zh.sh
    │   │   │   │   ├── seed.sh
    │   │   │   │   ├── sqa.sh
    │   │   │   │   ├── textvqa.sh
    │   │   │   │   ├── vizwiz.sh
    │   │   │   │   └── vqav2.sh
    │   │   │   ├── finetune.sh
    │   │   │   ├── finetune_lora.sh
    │   │   │   ├── finetune_task.sh
    │   │   │   ├── finetune_task_lora.sh
    │   │   │   └── pretrain.sh
    │   │   ├── zero2.json
    │   │   ├── zero3.json
    │   │   └── zero3_offload.json
    │   ├── tool/
    │   │   ├── dpo_trainer.py
    │   │   └── dpo_trainer_inherent.py
    │   └── train_dpo_2stages.py
    └── open_clip/
        ├── CITATION.cff
        ├── HISTORY.md
        ├── LICENSE
        ├── MANIFEST.in
        ├── setup.py
        └── src/
            ├── open_clip/
            │   ├── __init__.py
            │   ├── big_vision.py
            │   ├── coca_model.py
            │   ├── constants.py
            │   ├── factory.py
            │   ├── hf_configs.py
            │   ├── hf_model.py
            │   ├── loss.py
            │   ├── model.py
            │   ├── model_configs/
            │   │   ├── EVA01-g-14-plus.json
            │   │   ├── EVA01-g-14.json
            │   │   ├── EVA02-B-16.json
            │   │   ├── EVA02-E-14-plus.json
            │   │   ├── EVA02-E-14.json
            │   │   ├── EVA02-L-14-336.json
            │   │   ├── EVA02-L-14.json
            │   │   ├── RN101-quickgelu.json
            │   │   ├── RN101.json
            │   │   ├── RN50-quickgelu.json
            │   │   ├── RN50.json
            │   │   ├── RN50x16.json
            │   │   ├── RN50x4.json
            │   │   ├── RN50x64.json
            │   │   ├── ViT-B-16-SigLIP-256.json
            │   │   ├── ViT-B-16-SigLIP-384.json
            │   │   ├── ViT-B-16-SigLIP-512.json
            │   │   ├── ViT-B-16-SigLIP-i18n-256.json
            │   │   ├── ViT-B-16-SigLIP.json
            │   │   ├── ViT-B-16-plus-240.json
            │   │   ├── ViT-B-16-plus.json
            │   │   ├── ViT-B-16-quickgelu.json
            │   │   ├── ViT-B-16.json
            │   │   ├── ViT-B-32-256.json
            │   │   ├── ViT-B-32-plus-256.json
            │   │   ├── ViT-B-32-quickgelu.json
            │   │   ├── ViT-B-32.json
            │   │   ├── ViT-H-14-378-quickgelu.json
            │   │   ├── ViT-H-14-CLIPA-336.json
            │   │   ├── ViT-H-14-CLIPA.json
            │   │   ├── ViT-H-14-quickgelu.json
            │   │   ├── ViT-H-14.json
            │   │   ├── ViT-H-16.json
            │   │   ├── ViT-L-14-280.json
            │   │   ├── ViT-L-14-336.json
            │   │   ├── ViT-L-14-CLIPA-336.json
            │   │   ├── ViT-L-14-CLIPA.json
            │   │   ├── ViT-L-14-quickgelu.json
            │   │   ├── ViT-L-14.json
            │   │   ├── ViT-L-16-320.json
            │   │   ├── ViT-L-16-SigLIP-256.json
            │   │   ├── ViT-L-16-SigLIP-384.json
            │   │   ├── ViT-L-16.json
            │   │   ├── ViT-M-16-alt.json
            │   │   ├── ViT-M-16.json
            │   │   ├── ViT-M-32-alt.json
            │   │   ├── ViT-M-32.json
            │   │   ├── ViT-S-16-alt.json
            │   │   ├── ViT-S-16.json
            │   │   ├── ViT-S-32-alt.json
            │   │   ├── ViT-S-32.json
            │   │   ├── ViT-SO400M-14-SigLIP-384.json
            │   │   ├── ViT-SO400M-14-SigLIP.json
            │   │   ├── ViT-bigG-14-CLIPA-336.json
            │   │   ├── ViT-bigG-14-CLIPA.json
            │   │   ├── ViT-bigG-14.json
            │   │   ├── ViT-e-14.json
            │   │   ├── ViT-g-14.json
            │   │   ├── coca_ViT-B-32.json
            │   │   ├── coca_ViT-L-14.json
            │   │   ├── coca_base.json
            │   │   ├── coca_roberta-ViT-B-32.json
            │   │   ├── convnext_base.json
            │   │   ├── convnext_base_w.json
            │   │   ├── convnext_base_w_320.json
            │   │   ├── convnext_large.json
            │   │   ├── convnext_large_d.json
            │   │   ├── convnext_large_d_320.json
            │   │   ├── convnext_small.json
            │   │   ├── convnext_tiny.json
            │   │   ├── convnext_xlarge.json
            │   │   ├── convnext_xxlarge.json
            │   │   ├── convnext_xxlarge_320.json
            │   │   ├── mt5-base-ViT-B-32.json
            │   │   ├── mt5-xl-ViT-H-14.json
            │   │   ├── nllb-clip-base-siglip.json
            │   │   ├── nllb-clip-base.json
            │   │   ├── nllb-clip-large-siglip.json
            │   │   ├── nllb-clip-large.json
            │   │   ├── roberta-ViT-B-32.json
            │   │   ├── swin_base_patch4_window7_224.json
            │   │   ├── vit_medium_patch16_gap_256.json
            │   │   ├── vit_relpos_medium_patch16_cls_224.json
            │   │   ├── xlm-roberta-base-ViT-B-32.json
            │   │   └── xlm-roberta-large-ViT-H-14.json
            │   ├── modified_resnet.py
            │   ├── openai.py
            │   ├── pos_embed.py
            │   ├── pretrained.py
            │   ├── push_to_hf_hub.py
            │   ├── timm_model.py
            │   ├── tokenizer.py
            │   ├── transform.py
            │   ├── transformer.py
            │   ├── utils.py
            │   ├── version.py
            │   ├── zero_shot_classifier.py
            │   └── zero_shot_metadata.py
            ├── retrieve_clip_VQA.py
            ├── retrieve_clip_report.py
            └── training/
                ├── .gitignore
                ├── __init__.py
                ├── data.py
                ├── distributed.py
                ├── file_utils.py
                ├── logger.py
                ├── main.py
                ├── main_feature-work.py
                ├── main_retrieve_report.py
                ├── main_retrieve_report_harvard.py
                ├── params.py
                ├── precision.py
                ├── profiler.py
                ├── scheduler.py
                ├── train.py
                └── zero_shot.py