Repository: Yui010206/SeViLA
Branch: main
Commit: 419e7281da60
Files: 386
Total size: 1.5 MB
Directory structure:
gitextract_g7qpcjx_/
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── app/
│ ├── __init__.py
│ ├── calculate_coco_features.py
│ ├── caption.py
│ ├── classification.py
│ ├── dataset_browser.py
│ ├── image_text_match.py
│ ├── main.py
│ ├── multimodal_search.py
│ ├── multipage.py
│ ├── text_localization.py
│ ├── utils.py
│ └── vqa.py
├── app.py
├── docs/
│ ├── Makefile
│ ├── benchmark.rst
│ ├── build_docs.sh
│ ├── conf.py
│ ├── getting_started.rst
│ ├── index.rst
│ ├── intro.rst
│ ├── make.bat
│ ├── requirements.txt
│ ├── tutorial.configs.rst
│ ├── tutorial.datasets.rst
│ ├── tutorial.evaluation.rst
│ ├── tutorial.models.rst
│ ├── tutorial.processors.rst
│ ├── tutorial.rst
│ ├── tutorial.tasks.rst
│ └── tutorial.training-example.rst
├── evaluate.py
├── lavis/
│ ├── __init__.py
│ ├── common/
│ │ ├── config.py
│ │ ├── dist_utils.py
│ │ ├── gradcam.py
│ │ ├── logger.py
│ │ ├── optims.py
│ │ ├── registry.py
│ │ ├── utils.py
│ │ └── vqa_tools/
│ │ ├── __init__.py
│ │ ├── vqa.py
│ │ └── vqa_eval.py
│ ├── configs/
│ │ ├── datasets/
│ │ │ ├── aokvqa/
│ │ │ │ └── defaults.yaml
│ │ │ ├── avsd/
│ │ │ │ └── defaults_dial.yaml
│ │ │ ├── coco/
│ │ │ │ ├── defaults_cap.yaml
│ │ │ │ ├── defaults_ret.yaml
│ │ │ │ ├── defaults_vqa.yaml
│ │ │ │ └── eval_vqa.yaml
│ │ │ ├── conceptual_caption/
│ │ │ │ ├── defaults_12m.yaml
│ │ │ │ └── defaults_3m.yaml
│ │ │ ├── didemo/
│ │ │ │ └── defaults_ret.yaml
│ │ │ ├── flickr30k/
│ │ │ │ └── defaults.yaml
│ │ │ ├── gqa/
│ │ │ │ ├── balanced_testdev.yaml
│ │ │ │ ├── balanced_val.yaml
│ │ │ │ └── defaults.yaml
│ │ │ ├── how2qa/
│ │ │ │ └── defaults_qa.yaml
│ │ │ ├── imagenet/
│ │ │ │ └── defaults.yaml
│ │ │ ├── laion/
│ │ │ │ └── defaults_2B_multi.yaml
│ │ │ ├── msrvtt/
│ │ │ │ ├── defaults_cap.yaml
│ │ │ │ ├── defaults_qa.yaml
│ │ │ │ └── defaults_ret.yaml
│ │ │ ├── msrvttmc/
│ │ │ │ └── defaults_qa.yaml
│ │ │ ├── msvd/
│ │ │ │ ├── defaults_cap.yaml
│ │ │ │ └── defaults_qa.yaml
│ │ │ ├── nextqa/
│ │ │ │ └── defaults_qa.yaml
│ │ │ ├── nlvr/
│ │ │ │ └── defaults.yaml
│ │ │ ├── nocaps/
│ │ │ │ └── defaults.yaml
│ │ │ ├── okvqa/
│ │ │ │ └── defaults.yaml
│ │ │ ├── qvh/
│ │ │ │ └── defaults.yaml
│ │ │ ├── sbu_caption/
│ │ │ │ └── defaults.yaml
│ │ │ ├── snli_ve/
│ │ │ │ └── defaults.yaml
│ │ │ ├── star/
│ │ │ │ └── defaults_qa.yaml
│ │ │ ├── tvqa/
│ │ │ │ └── defaults_qa.yaml
│ │ │ ├── vatex/
│ │ │ │ └── defaults_cap.yaml
│ │ │ ├── vg/
│ │ │ │ ├── defaults_caption.yaml
│ │ │ │ └── defaults_vqa.yaml
│ │ │ └── vlep/
│ │ │ └── defaults_qa.yaml
│ │ ├── default.yaml
│ │ └── models/
│ │ ├── albef_classification_ve.yaml
│ │ ├── albef_feature_extractor.yaml
│ │ ├── albef_nlvr.yaml
│ │ ├── albef_pretrain_base.yaml
│ │ ├── albef_retrieval_coco.yaml
│ │ ├── albef_retrieval_flickr.yaml
│ │ ├── albef_vqav2.yaml
│ │ ├── alpro_qa_msrvtt.yaml
│ │ ├── alpro_qa_msvd.yaml
│ │ ├── alpro_retrieval_didemo.yaml
│ │ ├── alpro_retrieval_msrvtt.yaml
│ │ ├── bert_config.json
│ │ ├── bert_config_alpro.json
│ │ ├── blip2/
│ │ │ ├── blip2_caption_flant5xl.yaml
│ │ │ ├── blip2_caption_opt2.7b.yaml
│ │ │ ├── blip2_caption_opt6.7b.yaml
│ │ │ ├── blip2_coco.yaml
│ │ │ ├── blip2_pretrain.yaml
│ │ │ ├── blip2_pretrain_flant5xl.yaml
│ │ │ ├── blip2_pretrain_flant5xxl.yaml
│ │ │ ├── blip2_pretrain_opt2.7b.yaml
│ │ │ └── blip2_pretrain_opt6.7b.yaml
│ │ ├── blip_caption_base_coco.yaml
│ │ ├── blip_caption_large_coco.yaml
│ │ ├── blip_classification_base.yaml
│ │ ├── blip_feature_extractor_base.yaml
│ │ ├── blip_itm_base.yaml
│ │ ├── blip_itm_large.yaml
│ │ ├── blip_nlvr.yaml
│ │ ├── blip_pretrain_base.yaml
│ │ ├── blip_pretrain_large.yaml
│ │ ├── blip_retrieval_coco.yaml
│ │ ├── blip_retrieval_flickr.yaml
│ │ ├── blip_vqa_aokvqa.yaml
│ │ ├── blip_vqa_okvqa.yaml
│ │ ├── blip_vqav2.yaml
│ │ ├── clip/
│ │ │ ├── RN101-quickgelu.json
│ │ │ ├── RN101.json
│ │ │ ├── RN50-quickgelu.json
│ │ │ ├── RN50.json
│ │ │ ├── RN50x16.json
│ │ │ ├── RN50x4.json
│ │ │ ├── ViT-B-16-plus-240.json
│ │ │ ├── ViT-B-16-plus.json
│ │ │ ├── ViT-B-16.json
│ │ │ ├── ViT-B-32-plus-256.json
│ │ │ ├── ViT-B-32-quickgelu.json
│ │ │ ├── ViT-B-32.json
│ │ │ ├── ViT-H-14.json
│ │ │ ├── ViT-H-16.json
│ │ │ ├── ViT-L-14-280.json
│ │ │ ├── ViT-L-14-336.json
│ │ │ ├── ViT-L-14.json
│ │ │ ├── ViT-L-16-320.json
│ │ │ ├── ViT-L-16.json
│ │ │ ├── ViT-g-14.json
│ │ │ ├── timm-efficientnetv2_rw_s.json
│ │ │ ├── timm-resnet50d.json
│ │ │ ├── timm-resnetaa50d.json
│ │ │ ├── timm-resnetblur50.json
│ │ │ ├── timm-swin_base_patch4_window7_224.json
│ │ │ ├── timm-vit_base_patch16_224.json
│ │ │ ├── timm-vit_base_patch32_224.json
│ │ │ └── timm-vit_small_patch16_224.json
│ │ ├── clip_resnet50.yaml
│ │ ├── clip_vit_base16.yaml
│ │ ├── clip_vit_base32.yaml
│ │ ├── clip_vit_large14.yaml
│ │ ├── clip_vit_large14_336.yaml
│ │ ├── gpt_dialogue_base.yaml
│ │ ├── img2prompt-vqa/
│ │ │ └── img2prompt_vqa_base.yaml
│ │ ├── med_config.json
│ │ ├── med_config_albef.json
│ │ ├── med_large_config.json
│ │ ├── pnp-vqa/
│ │ │ ├── pnp_vqa_3b.yaml
│ │ │ ├── pnp_vqa_base.yaml
│ │ │ ├── pnp_vqa_large.yaml
│ │ │ ├── unifiedqav2_3b_config.json
│ │ │ ├── unifiedqav2_base_config.json
│ │ │ └── unifiedqav2_large_config.json
│ │ └── sevila.yaml
│ ├── datasets/
│ │ ├── builders/
│ │ │ ├── __init__.py
│ │ │ ├── base_dataset_builder.py
│ │ │ ├── caption_builder.py
│ │ │ ├── classification_builder.py
│ │ │ ├── dialogue_builder.py
│ │ │ ├── image_text_pair_builder.py
│ │ │ ├── imagefolder_builder.py
│ │ │ ├── retrieval_builder.py
│ │ │ ├── video_qa_builder.py
│ │ │ └── vqa_builder.py
│ │ ├── data_utils.py
│ │ ├── datasets/
│ │ │ ├── aok_vqa_datasets.py
│ │ │ ├── avsd_dialogue_datasets.py
│ │ │ ├── base_dataset.py
│ │ │ ├── caption_datasets.py
│ │ │ ├── coco_caption_datasets.py
│ │ │ ├── coco_vqa_datasets.py
│ │ │ ├── dataloader_utils.py
│ │ │ ├── dialogue_datasets.py
│ │ │ ├── gqa_datasets.py
│ │ │ ├── image_text_pair_datasets.py
│ │ │ ├── imagefolder_dataset.py
│ │ │ ├── laion_dataset.py
│ │ │ ├── mc_video_vqa_datasets.py
│ │ │ ├── multimodal_classification_datasets.py
│ │ │ ├── nlvr_datasets.py
│ │ │ ├── retrieval_datasets.py
│ │ │ ├── snli_ve_datasets.py
│ │ │ ├── vg_vqa_datasets.py
│ │ │ ├── video_caption_datasets.py
│ │ │ ├── video_vqa_datasets.py
│ │ │ └── vqa_datasets.py
│ │ └── download_scripts/
│ │ ├── DownloadConceptualCaptions/
│ │ │ ├── LICENSE
│ │ │ ├── README.md
│ │ │ ├── create_annotation_12m.ipynb
│ │ │ ├── create_annotation_3m.ipynb
│ │ │ ├── download_data_cc12m.py
│ │ │ └── download_data_cc3m.py
│ │ ├── download_coco.py
│ │ ├── download_didemo.py
│ │ ├── download_flickr.py
│ │ ├── download_gqa.py
│ │ ├── download_msrvtt.py
│ │ ├── download_msvd.py
│ │ ├── download_nocaps.py
│ │ ├── download_sbu.py
│ │ └── download_vg.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── albef_models/
│ │ │ ├── __init__.py
│ │ │ ├── albef_classification.py
│ │ │ ├── albef_feature_extractor.py
│ │ │ ├── albef_nlvr.py
│ │ │ ├── albef_outputs.py
│ │ │ ├── albef_pretrain.py
│ │ │ ├── albef_retrieval.py
│ │ │ └── albef_vqa.py
│ │ ├── alpro_models/
│ │ │ ├── __init__.py
│ │ │ ├── alpro_outputs.py
│ │ │ ├── alpro_qa.py
│ │ │ └── alpro_retrieval.py
│ │ ├── base_model.py
│ │ ├── blip2_models/
│ │ │ ├── Qformer.py
│ │ │ ├── __init__.py
│ │ │ ├── blip2.py
│ │ │ ├── blip2_fmr.py
│ │ │ ├── blip2_image_text_matching.py
│ │ │ ├── blip2_opt.py
│ │ │ ├── blip2_qformer.py
│ │ │ ├── blip2_t5.py
│ │ │ ├── modeling_opt.py
│ │ │ └── modeling_t5.py
│ │ ├── blip_models/
│ │ │ ├── __init__.py
│ │ │ ├── blip.py
│ │ │ ├── blip_caption.py
│ │ │ ├── blip_classification.py
│ │ │ ├── blip_feature_extractor.py
│ │ │ ├── blip_image_text_matching.py
│ │ │ ├── blip_nlvr.py
│ │ │ ├── blip_outputs.py
│ │ │ ├── blip_pretrain.py
│ │ │ ├── blip_retrieval.py
│ │ │ ├── blip_vqa.py
│ │ │ └── nlvr_encoder.py
│ │ ├── clip_models/
│ │ │ ├── __init__.py
│ │ │ ├── clip_outputs.py
│ │ │ ├── loss.py
│ │ │ ├── model.py
│ │ │ ├── pretrained.py
│ │ │ ├── timm_model.py
│ │ │ ├── tokenizer.py
│ │ │ ├── transform.py
│ │ │ └── utils.py
│ │ ├── eva_vit.py
│ │ ├── gpt_models/
│ │ │ └── gpt_dialogue.py
│ │ ├── img2prompt_models/
│ │ │ ├── __init__.py
│ │ │ └── img2prompt_vqa.py
│ │ ├── med.py
│ │ ├── pnp_vqa_models/
│ │ │ ├── __init__.py
│ │ │ ├── pnp_unifiedqav2_fid.py
│ │ │ └── pnp_vqa.py
│ │ ├── sevila_models/
│ │ │ ├── __init__.py
│ │ │ └── sevila.py
│ │ ├── timesformer/
│ │ │ ├── __init__.py
│ │ │ ├── conv2d_same.py
│ │ │ ├── features.py
│ │ │ ├── helpers.py
│ │ │ ├── linear.py
│ │ │ ├── vit.py
│ │ │ └── vit_utils.py
│ │ ├── topk.py
│ │ └── vit.py
│ ├── processors/
│ │ ├── __init__.py
│ │ ├── alpro_processors.py
│ │ ├── base_processor.py
│ │ ├── blip_processors.py
│ │ ├── clip_processors.py
│ │ ├── functional_video.py
│ │ ├── gpt_processors.py
│ │ ├── randaugment.py
│ │ └── transforms_video.py
│ ├── projects/
│ │ ├── albef/
│ │ │ ├── eval/
│ │ │ │ ├── nlvr_eval.yaml
│ │ │ │ ├── ret_coco_eval.yaml
│ │ │ │ ├── ret_flickr30k_eval.yaml
│ │ │ │ ├── snli_ve_eval.yaml
│ │ │ │ ├── vqa_test.yaml
│ │ │ │ └── vqa_val.yaml
│ │ │ └── train/
│ │ │ ├── aokvqa_ft.yaml
│ │ │ ├── nlvr_ft.yaml
│ │ │ ├── okvqa_ft.yaml
│ │ │ ├── pretrain.yaml
│ │ │ ├── ret_coco_ft.yaml
│ │ │ ├── ret_flickr30k_ft.yaml
│ │ │ ├── snli_ve_ft.yaml
│ │ │ └── vqa_ft.yaml
│ │ ├── alpro/
│ │ │ ├── eval/
│ │ │ │ ├── didemo_ret_eval.yaml
│ │ │ │ ├── msrvtt_qa_eval.yaml
│ │ │ │ ├── msrvtt_ret_eval.yaml
│ │ │ │ └── msvd_qa_eval.yaml
│ │ │ └── train/
│ │ │ ├── didemo_ret_ft.yaml
│ │ │ ├── msrvtt_qa_ft.yaml
│ │ │ ├── msrvtt_retrieval_ft.yaml
│ │ │ └── msvd_qa_ft.yaml
│ │ ├── blip/
│ │ │ ├── coco_cap_ft_iter.yaml
│ │ │ ├── eval/
│ │ │ │ ├── aokvqa_eval.yaml
│ │ │ │ ├── caption_coco_eval.yaml
│ │ │ │ ├── caption_coco_eval_large.yaml
│ │ │ │ ├── nlvr_eval.yaml
│ │ │ │ ├── nocaps_eval.yaml
│ │ │ │ ├── okvqa_eval.yaml
│ │ │ │ ├── ret_coco_eval.yaml
│ │ │ │ ├── ret_flickr_eval.yaml
│ │ │ │ └── vqav2_eval.yaml
│ │ │ └── train/
│ │ │ ├── aokvqa_ft.yaml
│ │ │ ├── caption_coco_ft.yaml
│ │ │ ├── caption_coco_large_ft.yaml
│ │ │ ├── nlvr_ft.yaml
│ │ │ ├── okvqa_ft.yaml
│ │ │ ├── pretrain_14m.yaml
│ │ │ ├── retrieval_coco_ft.yaml
│ │ │ ├── retrieval_flickr_ft.yaml
│ │ │ └── vqav2_ft.yaml
│ │ ├── blip2/
│ │ │ ├── eval/
│ │ │ │ ├── caption_coco_flant5xl_eval.yaml
│ │ │ │ ├── caption_coco_opt2.7b_eval.yaml
│ │ │ │ ├── caption_coco_opt6.7b_eval.yaml
│ │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml
│ │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml
│ │ │ │ ├── ret_coco_eval.yaml
│ │ │ │ ├── ret_flickr_eval.yaml
│ │ │ │ └── vqav2_zeroshot_flant5xl_eval.yaml
│ │ │ └── train/
│ │ │ ├── caption_coco_ft.yaml
│ │ │ ├── pretrain_stage1.yaml
│ │ │ └── pretrain_stage2.yaml
│ │ ├── clip/
│ │ │ ├── exp_coco_ret_eval.yaml
│ │ │ ├── exp_flickr_ret_eval.yaml
│ │ │ └── exp_imnet_zs_eval.yaml
│ │ ├── gpt/
│ │ │ ├── eval/
│ │ │ │ └── dialogue_avsd_eval.yaml
│ │ │ └── train/
│ │ │ └── dialogue_avsd_ft.yaml
│ │ ├── pnp-vqa/
│ │ │ └── eval/
│ │ │ ├── gqa_eval.yaml
│ │ │ ├── gqa_eval_3b.yaml
│ │ │ ├── gqa_eval_large.yaml
│ │ │ ├── okvqa_eval.yaml
│ │ │ ├── okvqa_eval_3b.yaml
│ │ │ ├── okvqa_eval_large.yaml
│ │ │ ├── vqav2_eval.yaml
│ │ │ ├── vqav2_eval_3b.yaml
│ │ │ ├── vqav2_eval_large.yaml
│ │ │ ├── vqav2_test_eval.yaml
│ │ │ ├── vqav2_test_eval_3b.yaml
│ │ │ └── vqav2_test_eval_large.yaml
│ │ └── sevila/
│ │ ├── eval/
│ │ │ ├── how2qa_eval.yaml
│ │ │ ├── nextqa_eval.yaml
│ │ │ ├── qvh_eval.yaml
│ │ │ ├── star_eval.yaml
│ │ │ ├── tvqa_eval.yaml
│ │ │ └── vlep_eval.yaml
│ │ └── train/
│ │ ├── how2qa.yaml
│ │ ├── nextqa.yaml
│ │ ├── qvh.yaml
│ │ ├── star.yaml
│ │ ├── tvqa.yaml
│ │ └── vlep.yaml
│ ├── runners/
│ │ ├── __init__.py
│ │ ├── runner_base.py
│ │ └── runner_iter.py
│ └── tasks/
│ ├── __init__.py
│ ├── base_task.py
│ ├── captioning.py
│ ├── dialogue.py
│ ├── image_text_pretrain.py
│ ├── multimodal_classification.py
│ ├── retrieval.py
│ ├── vqa.py
│ └── vqa_reading_comprehension.py
├── pyproject.toml
├── requirements.txt
├── run_scripts/
│ └── sevila/
│ ├── finetune/
│ │ └── nexqa_ft.sh
│ ├── inference/
│ │ └── nexqa_infer.sh
│ ├── pre-train/
│ │ └── pretrain_qvh.sh
│ └── refinement/
│ └── nexqa_sr.sh
├── setup.py
├── sevila_checkpoints/
│ └── __init__.py
├── sevila_data/
│ ├── Data Preprocess.ipynb
│ └── README.md
└── train.py
================================================
FILE CONTENTS
================================================
================================================
FILE: LICENSE.txt
================================================
BSD 3-Clause License
Copyright (c) 2022 Salesforce, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: MANIFEST.in
================================================
recursive-include lavis/configs *.yaml *.json
recursive-include lavis/projects *.yaml *.json
recursive-exclude lavis/datasets/download_scripts *
recursive-exclude lavis/output *
include requirements.txt
================================================
FILE: README.md
================================================
# [NeurIPS 2023] Self-Chained Image-Language Model for Video Localization and Question Answering
* Authors: [Shoubin Yu](https://yui010206.github.io/), [Jaemin Cho](https://j-min.io), [Prateek Yadav](https://prateek-yadav.github.io/), [Mohit Bansal](https://www.cs.unc.edu/~mbansal/)
* Paper: [arXiv](https://arxiv.org/abs/2305.06988)
* Online Demo: Try our Gradio demo on [Hugging Face](https://huggingface.co/spaces/Shoubin/SeViLA)
<img src="./assets/teaser.png" alt="teaser image" width="800"/>
<img src="./assets/model.png" alt="teaser image" width="800"/>
<img src="./assets/chain.png" alt="teaser image" width="800"/>
# Code structure
```bash
# data & data preprocessing
./sevila_data
# pretrained checkpoints
./sevila_checkpoints
# SeViLA code
./lavis/
# running scripts for SeViLA localizer/answerer training/inference
./run_scripts
```
# Setup
## Install Dependencies
1. (Optional) Create a conda environment
```bash
conda create -n sevila python=3.8
conda activate sevila
```
2. Build from source
```bash
pip install -e .
```
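You can optionally sanity-check the install with a quick import (a minimal check; it only verifies that the `lavis` package resolves):
```bash
python -c "import lavis"
```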
## Download Pretrained Models
We pre-train the SeViLA localizer on QVHighlights and host the checkpoint on [Hugging Face](https://huggingface.co/Shoubin/SeViLA/resolve/main/sevila_pretrained.pth).
Download the checkpoint and put it under ./sevila_checkpoints.
The checkpoint (814.55M) contains the pre-trained localizer and the zero-shot answerer.
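For example, from the repo root (a minimal sketch; any download tool works in place of wget):
```bash
mkdir -p sevila_checkpoints
wget -P sevila_checkpoints https://huggingface.co/Shoubin/SeViLA/resolve/main/sevila_pretrained.pth
```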
If you want to pre-train your own localizer, you can download [qformer_loc.pth](https://drive.google.com/file/d/13hE_BQDflkzYrHVmVGddRSt8VMa0ouGB/view?usp=sharing), a copy of the original BLIP-2 Q-Former (with renamed model keys) used to initialize the localizer.
# Run Gradio Demo Locally
We also provide a UI built with Gradio for testing SeViLA locally.
Running the demo locally requires about 12GB of memory.
* Install Gradio:
```bash
pip install gradio==3.30.0
```
* Run the following command in a terminal to launch the demo:
```bash
python app.py
```
# Dataset Preparation
We test our model on:
+ [NExT-QA](https://doc-doc.github.io/docs/nextqa.html)
+ [STAR](https://star.csail.mit.edu/)
+ [How2QA](https://value-benchmark.github.io/index.html)
+ [TVQA](https://tvqa.cs.unc.edu/)
+ [VLEP](https://value-benchmark.github.io/index.html)
+ [QVHighlights](https://github.com/jayleicn/moment_detr)
Please download the original QA data and preprocess it with our [scripts](sevila_data/).
# Training and Inference
We provide example SeViLA training and inference scripts below.
Please refer to the [dataset page](sevila_data/) to customize your data paths.
## 1) Localizer Pre-training
```bash
sh run_scripts/sevila/pre-train/pretrain_qvh.sh
```
## 2) Answerer Fine-tuning
```bash
sh run_scripts/sevila/finetune/nexqa_ft.sh
```
## 3) Localizer Self-refinement
```bash
sh run_scripts/sevila/refinement/nexqa_sr.sh
```
## 4) Inference
```bash
sh run_scripts/sevila/inference/nexqa_infer.sh
```
# Acknowledgments
We thank the developers of [LAVIS](https://github.com/salesforce/LAVIS), [BLIP-2](https://github.com/salesforce/LAVIS/tree/main/projects/blip2), [CLIP](https://github.com/openai/CLIP), and [All-in-One](https://github.com/showlab/all-in-one) for their public code releases.
# Reference
Please cite our paper if you use our models in your work:
```bibtex
@inproceedings{yu2023self,
title = {Self-Chained Image-Language Model for Video Localization and Question Answering},
author = {Yu, Shoubin and Cho, Jaemin and Yadav, Prateek and Bansal, Mohit},
booktitle = {NeurIPS},
year = {2023}
}
```
================================================
FILE: app/__init__.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
from PIL import Image
import requests
import streamlit as st
import torch
@st.cache()
def load_demo_image():
img_url = (
"https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
)
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
return raw_image
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cache_root = "/export/home/.cache/lavis/"
================================================
FILE: app/calculate_coco_features.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
from PIL import Image
import requests
import torch
import os
from lavis.common.registry import registry
from lavis.processors import *
from lavis.models import *
from lavis.common.utils import build_default_model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_demo_image():
img_url = (
"https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
)
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
return raw_image
def read_img(filepath):
raw_image = Image.open(filepath).convert("RGB")
return raw_image
# model
model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth"
feature_extractor = BlipFeatureExtractor(pretrained=model_url)
feature_extractor.eval()
feature_extractor = feature_extractor.to(device)
# preprocessors
vis_processor = BlipImageEvalProcessor(image_size=224)
text_processor = BlipCaptionProcessor()
# files to process
# file_root = "/export/home/.cache/lavis/coco/images/val2014"
file_root = "/export/home/.cache/lavis/coco/images/train2014"
filepaths = os.listdir(file_root)
print(len(filepaths))
caption = "dummy"
path2feat = dict()
bsz = 256
images_in_batch = []
filepaths_in_batch = []
def encode_batch():
    # encode the queued images and cache their [CLS] features, keyed by filename
    batch = torch.cat(images_in_batch, dim=0).to(device)
    with torch.no_grad():
        image_features = feature_extractor(
            batch, caption, mode="image", normalized=True
        )[:, 0]
    for filepath, image_feat in zip(filepaths_in_batch, image_features):
        path2feat[os.path.basename(filepath)] = image_feat.detach().cpu()
    images_in_batch.clear()
    filepaths_in_batch.clear()
    print(len(path2feat), image_features.shape)
for filename in filepaths:
    filepath = os.path.join(file_root, filename)
    image = read_img(filepath)
    image = vis_processor(image).unsqueeze(0)
    images_in_batch.append(image)
    filepaths_in_batch.append(filepath)
    # flush once a full batch has accumulated
    if len(images_in_batch) == bsz:
        encode_batch()
# encode any remaining images in the final partial batch
if images_in_batch:
    encode_batch()
torch.save(path2feat, "path2feat_coco_train2014.pth")
================================================
FILE: app/caption.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import streamlit as st
from app import device, load_demo_image
from app.utils import load_model_cache
from lavis.processors import load_processor
from PIL import Image
def app():
# ===== layout =====
model_type = st.sidebar.selectbox("Model:", ["BLIP_base", "BLIP_large"])
sampling_method = st.sidebar.selectbox(
"Sampling method:", ["Beam search", "Nucleus sampling"]
)
st.markdown(
"<h1 style='text-align: center;'>Image Description Generation</h1>",
unsafe_allow_html=True,
)
instructions = """Try the provided image or upload your own:"""
file = st.file_uploader(instructions)
use_beam = sampling_method == "Beam search"
col1, col2 = st.columns(2)
if file:
raw_img = Image.open(file).convert("RGB")
else:
raw_img = load_demo_image()
col1.header("Image")
w, h = raw_img.size
scaling_factor = 720 / w
resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))
col1.image(resized_image, use_column_width=True)
col2.header("Description")
cap_button = st.button("Generate")
# ==== event ====
vis_processor = load_processor("blip_image_eval").build(image_size=384)
if cap_button:
if model_type.startswith("BLIP"):
blip_type = model_type.split("_")[1].lower()
model = load_model_cache(
"blip_caption",
model_type=f"{blip_type}_coco",
is_eval=True,
device=device,
)
img = vis_processor(raw_img).unsqueeze(0).to(device)
captions = generate_caption(
model=model, image=img, use_nucleus_sampling=not use_beam
)
col2.write("\n\n".join(captions), use_column_width=True)
def generate_caption(
model, image, use_nucleus_sampling=False, num_beams=3, max_length=40, min_length=5
):
samples = {"image": image}
captions = []
if use_nucleus_sampling:
for _ in range(5):
caption = model.generate(
samples,
use_nucleus_sampling=True,
max_length=max_length,
min_length=min_length,
top_p=0.9,
)
captions.append(caption[0])
else:
caption = model.generate(
samples,
use_nucleus_sampling=False,
num_beams=num_beams,
max_length=max_length,
min_length=min_length,
)
captions.append(caption[0])
return captions
================================================
FILE: app/classification.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import plotly.graph_objects as go
import requests
import streamlit as st
import torch
from lavis.models import load_model
from lavis.processors import load_processor
from lavis.processors.blip_processors import BlipCaptionProcessor
from PIL import Image
from app import device, load_demo_image
from app.utils import load_blip_itm_model
from lavis.processors.clip_processors import ClipImageEvalProcessor
@st.cache()
def load_demo_image(img_url=None):
if not img_url:
img_url = "https://img.atlasobscura.com/yDJ86L8Ou6aIjBsxnlAy5f164w1rjTgcHZcx2yUs4mo/rt:fit/w:1200/q:81/sm:1/scp:1/ar:1/aHR0cHM6Ly9hdGxh/cy1kZXYuczMuYW1h/em9uYXdzLmNvbS91/cGxvYWRzL3BsYWNl/X2ltYWdlcy85MDll/MDRjOS00NTJjLTQx/NzQtYTY4MS02NmQw/MzI2YWIzNjk1ZGVk/MGZhMTJiMTM5MmZi/NGFfUmVhcl92aWV3/X29mX3RoZV9NZXJs/aW9uX3N0YXR1ZV9h/dF9NZXJsaW9uX1Bh/cmssX1NpbmdhcG9y/ZSxfd2l0aF9NYXJp/bmFfQmF5X1NhbmRz/X2luX3RoZV9kaXN0/YW5jZV8tXzIwMTQw/MzA3LmpwZw.jpg"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
return raw_image
@st.cache(
hash_funcs={
torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
.cpu()
.numpy()
},
allow_output_mutation=True,
)
def load_model_cache(model_type, device):
if model_type == "blip":
model = load_model(
"blip_feature_extractor", model_type="base", is_eval=True, device=device
)
elif model_type == "albef":
model = load_model(
"albef_feature_extractor", model_type="base", is_eval=True, device=device
)
elif model_type == "CLIP_ViT-B-32":
model = load_model(
"clip_feature_extractor", "ViT-B-32", is_eval=True, device=device
)
elif model_type == "CLIP_ViT-B-16":
model = load_model(
"clip_feature_extractor", "ViT-B-16", is_eval=True, device=device
)
elif model_type == "CLIP_ViT-L-14":
model = load_model(
"clip_feature_extractor", "ViT-L-14", is_eval=True, device=device
)
return model
def app():
model_type = st.sidebar.selectbox(
"Model:",
["ALBEF", "BLIP_Base", "CLIP_ViT-B-32", "CLIP_ViT-B-16", "CLIP_ViT-L-14"],
)
score_type = st.sidebar.selectbox("Score type:", ["Cosine", "Multimodal"])
# ===== layout =====
st.markdown(
"<h1 style='text-align: center;'>Zero-shot Classification</h1>",
unsafe_allow_html=True,
)
instructions = """Try the provided image or upload your own:"""
file = st.file_uploader(instructions)
st.header("Image")
if file:
raw_img = Image.open(file).convert("RGB")
else:
raw_img = load_demo_image()
st.image(raw_img) # , use_column_width=True)
col1, col2 = st.columns(2)
col1.header("Categories")
cls_0 = col1.text_input("category 1", value="merlion")
cls_1 = col1.text_input("category 2", value="sky")
cls_2 = col1.text_input("category 3", value="giraffe")
cls_3 = col1.text_input("category 4", value="fountain")
cls_4 = col1.text_input("category 5", value="marina bay")
cls_names = [cls_0, cls_1, cls_2, cls_3, cls_4]
cls_names = [cls_nm for cls_nm in cls_names if len(cls_nm) > 0]
if len(cls_names) != len(set(cls_names)):
st.error("Please provide unique class names")
return
button = st.button("Submit")
col2.header("Prediction")
# ===== event =====
if button:
if model_type.startswith("BLIP"):
text_processor = BlipCaptionProcessor(prompt="A picture of ")
cls_prompt = [text_processor(cls_nm) for cls_nm in cls_names]
if score_type == "Cosine":
vis_processor = load_processor("blip_image_eval").build(image_size=224)
img = vis_processor(raw_img).unsqueeze(0).to(device)
feature_extractor = load_model_cache(model_type="blip", device=device)
sample = {"image": img, "text_input": cls_prompt}
with torch.no_grad():
image_features = feature_extractor.extract_features(
sample, mode="image"
).image_embeds_proj[:, 0]
text_features = feature_extractor.extract_features(
sample, mode="text"
).text_embeds_proj[:, 0]
sims = (image_features @ text_features.t())[
0
] / feature_extractor.temp
else:
vis_processor = load_processor("blip_image_eval").build(image_size=384)
img = vis_processor(raw_img).unsqueeze(0).to(device)
model = load_blip_itm_model(device)
output = model(img, cls_prompt, match_head="itm")
sims = output[:, 1]
sims = torch.nn.Softmax(dim=0)(sims)
inv_sims = [sim * 100 for sim in sims.tolist()[::-1]]
elif model_type.startswith("ALBEF"):
vis_processor = load_processor("blip_image_eval").build(image_size=224)
img = vis_processor(raw_img).unsqueeze(0).to(device)
text_processor = BlipCaptionProcessor(prompt="A picture of ")
cls_prompt = [text_processor(cls_nm) for cls_nm in cls_names]
feature_extractor = load_model_cache(model_type="albef", device=device)
sample = {"image": img, "text_input": cls_prompt}
with torch.no_grad():
image_features = feature_extractor.extract_features(
sample, mode="image"
).image_embeds_proj[:, 0]
text_features = feature_extractor.extract_features(
sample, mode="text"
).text_embeds_proj[:, 0]
st.write(image_features.shape)
st.write(text_features.shape)
sims = (image_features @ text_features.t())[0] / feature_extractor.temp
sims = torch.nn.Softmax(dim=0)(sims)
inv_sims = [sim * 100 for sim in sims.tolist()[::-1]]
elif model_type.startswith("CLIP"):
if model_type == "CLIP_ViT-B-32":
model = load_model_cache(model_type="CLIP_ViT-B-32", device=device)
elif model_type == "CLIP_ViT-B-16":
model = load_model_cache(model_type="CLIP_ViT-B-16", device=device)
elif model_type == "CLIP_ViT-L-14":
model = load_model_cache(model_type="CLIP_ViT-L-14", device=device)
else:
raise ValueError(f"Unknown model type {model_type}")
if score_type == "Cosine":
# image_preprocess = ClipImageEvalProcessor(image_size=336)
image_preprocess = ClipImageEvalProcessor(image_size=224)
img = image_preprocess(raw_img).unsqueeze(0).to(device)
sample = {"image": img, "text_input": cls_names}
with torch.no_grad():
clip_features = model.extract_features(sample)
image_features = clip_features.image_embeds_proj
text_features = clip_features.text_embeds_proj
sims = (100.0 * image_features @ text_features.T)[0].softmax(dim=-1)
inv_sims = sims.tolist()[::-1]
else:
st.warning("CLIP does not support multimodal scoring.")
return
fig = go.Figure(
go.Bar(
x=inv_sims,
y=cls_names[::-1],
text=["{:.2f}".format(s) for s in inv_sims],
orientation="h",
)
)
fig.update_traces(
textfont_size=12,
textangle=0,
textposition="outside",
cliponaxis=False,
)
col2.plotly_chart(fig, use_container_width=True)
================================================
FILE: app/dataset_browser.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import random
from collections import OrderedDict
from functools import reduce
import streamlit as st
from lavis.common.registry import registry
from lavis.datasets.builders import dataset_zoo, load_dataset
from lavis.datasets.builders.base_dataset_builder import load_dataset_config
from PIL import Image
IMAGE_LAYOUT = 3, 4
VIDEO_LAYOUT = 1, 2
PREV_STR = "Prev"
NEXT_STR = "Next"
def sample_dataset(dataset, indices):
samples = [dataset.displ_item(idx) for idx in indices]
return samples
def get_concat_v(im1, im2):
margin = 5
canvas_size = (im1.width + im2.width + margin, max(im1.height, im2.height))
canvas = Image.new("RGB", canvas_size, "White")
canvas.paste(im1, (0, 0))
canvas.paste(im2, (im1.width + margin, 0))
return canvas
def resize_img_w(raw_img, new_w=224):
if isinstance(raw_img, list):
resized_imgs = [resize_img_w(img, 196) for img in raw_img]
# concatenate images
resized_image = reduce(get_concat_v, resized_imgs)
else:
w, h = raw_img.size
scaling_factor = new_w / w
resized_image = raw_img.resize(
(int(w * scaling_factor), int(h * scaling_factor))
)
return resized_image
def get_visual_key(dataset):
if "image" in dataset[0]:
return "image"
elif "image0" in dataset[0]: # NLVR2 dataset
return "image"
elif "video" in dataset[0]:
return "video"
else:
raise ValueError("Visual key not found.")
def gather_items(samples, exclude=[]):
gathered = []
for s in samples:
ns = OrderedDict()
for k in s.keys():
if k not in exclude:
ns[k] = s[k]
gathered.append(ns)
return gathered
@st.cache(allow_output_mutation=True)
def load_dataset_cache(name):
return load_dataset(name)
def format_text(text):
md = "\n\n".join([f"**{k}**: {v}" for k, v in text.items()])
return md
def show_samples(dataset, offset=0, is_next=False):
visual_key = get_visual_key(dataset)
num_rows, num_cols = IMAGE_LAYOUT if visual_key == "image" else VIDEO_LAYOUT
n_samples = num_rows * num_cols
if not shuffle:
if is_next:
start = min(int(start_idx) + offset + n_samples, len(dataset) - n_samples)
else:
start = max(0, int(start_idx) + offset - n_samples)
st.session_state.last_start = start
end = min(start + n_samples, len(dataset))
indices = list(range(start, end))
else:
indices = random.sample(range(len(dataset)), n_samples)
samples = sample_dataset(dataset, indices)
visual_info = (
iter([resize_img_w(s[visual_key]) for s in samples])
if visual_key == "image"
# else iter([s[visual_key] for s in samples])
else iter([s["file"] for s in samples])
)
text_info = gather_items(samples, exclude=["image", "video"])
text_info = iter([format_text(s) for s in text_info])
st.markdown(
"""<hr style="height:1px;border:none;color:#c7ccd4;background-color:#c7ccd4;"/> """,
unsafe_allow_html=True,
)
for _ in range(num_rows):
with st.container():
for col in st.columns(num_cols):
# col.text(next(text_info))
# col.caption(next(text_info))
try:
col.markdown(next(text_info))
if visual_key == "image":
col.image(next(visual_info), use_column_width=True, clamp=True)
elif visual_key == "video":
col.markdown(
""
)
except StopIteration:
break
st.markdown(
"""<hr style="height:1px;border:none;color:#c7ccd4;background-color:#c7ccd4;"/> """,
unsafe_allow_html=True,
)
st.session_state.n_display = n_samples
if __name__ == "__main__":
st.set_page_config(
page_title="LAVIS Dataset Explorer",
# layout="wide",
initial_sidebar_state="expanded",
)
dataset_name = st.sidebar.selectbox("Dataset:", dataset_zoo.get_names())
function = st.sidebar.selectbox("Function:", ["Browser"], index=0)
if function == "Browser":
shuffle = st.sidebar.selectbox("Shuffled:", [True, False], index=0)
dataset = load_dataset_cache(dataset_name)
split = st.sidebar.selectbox("Split:", dataset.keys())
dataset_len = len(dataset[split])
st.success(
f"Loaded {dataset_name}/{split} with **{dataset_len}** records. **Image/video directory**: {dataset[split].vis_root}"
)
if "last_dataset" not in st.session_state:
st.session_state.last_dataset = dataset_name
st.session_state.last_split = split
if "last_start" not in st.session_state:
st.session_state.last_start = 0
if "start_idx" not in st.session_state:
st.session_state.start_idx = 0
if "shuffle" not in st.session_state:
st.session_state.shuffle = shuffle
if "first_run" not in st.session_state:
st.session_state.first_run = True
elif (
st.session_state.last_dataset != dataset_name
or st.session_state.last_split != split
):
st.session_state.first_run = True
st.session_state.last_dataset = dataset_name
st.session_state.last_split = split
elif st.session_state.shuffle != shuffle:
st.session_state.shuffle = shuffle
st.session_state.first_run = True
if not shuffle:
n_col, p_col = st.columns([0.05, 1])
prev_button = n_col.button(PREV_STR)
next_button = p_col.button(NEXT_STR)
else:
next_button = st.button(NEXT_STR)
if not shuffle:
start_idx = st.sidebar.text_input(f"Begin from (total {dataset_len})", 0)
if not start_idx.isdigit():
st.error(f"Input to 'Begin from' must be digits, found {start_idx}.")
else:
if int(start_idx) != st.session_state.start_idx:
st.session_state.start_idx = int(start_idx)
st.session_state.last_start = int(start_idx)
if prev_button:
show_samples(
dataset[split],
offset=st.session_state.last_start - st.session_state.start_idx,
is_next=False,
)
if next_button:
show_samples(
dataset[split],
offset=st.session_state.last_start - st.session_state.start_idx,
is_next=True,
)
if st.session_state.first_run:
st.session_state.first_run = False
show_samples(
dataset[split],
offset=st.session_state.last_start - st.session_state.start_idx,
is_next=True,
)
================================================
FILE: app/image_text_match.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import numpy as np
import streamlit as st
import torch
from lavis.models.blip_models.blip_image_text_matching import compute_gradcam
from lavis.processors import load_processor
from PIL import Image
from app import device, load_demo_image
from app.utils import getAttMap, init_bert_tokenizer, load_blip_itm_model
def app():
model_type = st.sidebar.selectbox("Model:", ["BLIP_base", "BLIP_large"])
if model_type.startswith("BLIP"):
blip_type = model_type.split("_")[1]
model = load_blip_itm_model(device, model_type=blip_type)
vis_processor = load_processor("blip_image_eval").build(image_size=384)
st.markdown(
"<h1 style='text-align: center;'>Image Text Matching</h1>",
unsafe_allow_html=True,
)
values = list(range(1, 12))
default_layer_num = values.index(7)
layer_num = (
st.sidebar.selectbox("Layer number", values, index=default_layer_num) - 1
)
instructions = """Try the provided image or upload your own:"""
file = st.file_uploader(instructions)
col1, col2 = st.columns(2)
col1.header("Image")
col2.header("GradCam")
if file:
raw_img = Image.open(file).convert("RGB")
else:
raw_img = load_demo_image()
w, h = raw_img.size
scaling_factor = 720 / w
resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))
col1.image(resized_image, use_column_width=True)
col3, col4 = st.columns(2)
col3.header("Text")
user_question = col3.text_input(
"Input your sentence!", "a woman sitting on the beach with a dog"
)
submit_button = col3.button("Submit")
col4.header("Matching score")
if submit_button:
tokenizer = init_bert_tokenizer()
img = vis_processor(raw_img).unsqueeze(0).to(device)
text_processor = load_processor("blip_caption").build()
qry = text_processor(user_question)
norm_img = np.float32(resized_image) / 255
qry_tok = tokenizer(qry, return_tensors="pt").to(device)
gradcam, output = compute_gradcam(model, img, qry, qry_tok, block_num=layer_num)
avg_gradcam = getAttMap(norm_img, gradcam[0][1], blur=True)
col2.image(avg_gradcam, use_column_width=True, clamp=True)
# output = model(img, question)
itm_score = torch.nn.functional.softmax(output, dim=1)
new_title = (
'<p style="text-align: left; font-size: 25px;">\n{:.3f}%</p>'.format(
itm_score[0][1].item() * 100
)
)
col4.markdown(new_title, unsafe_allow_html=True)
================================================
FILE: app/main.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
from app.multipage import MultiPage
from app import vqa, caption
from app import image_text_match as itm
from app import text_localization as tl
from app import multimodal_search as ms
from app import classification as cl
if __name__ == "__main__":
app = MultiPage()
app.add_page("Image Description Generation", caption.app)
app.add_page("Multimodal Search", ms.app)
app.add_page("Visual Question Answering", vqa.app)
app.add_page("Image Text Matching", itm.app)
app.add_page("Text Localization", tl.app)
app.add_page("Classification", cl.app)
app.run()
================================================
FILE: app/multimodal_search.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import numpy as np
import streamlit as st
import torch
import torch.nn.functional as F
from app import cache_root, device
from app.utils import (
getAttMap,
init_bert_tokenizer,
load_blip_itm_model,
read_img,
resize_img,
)
from lavis.models import load_model
from lavis.processors import load_processor
@st.cache(
hash_funcs={
torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
.cpu()
.numpy()
},
allow_output_mutation=True,
)
def load_feat():
from lavis.common.utils import download_url
dirname = os.path.join(os.path.dirname(__file__), "assets")
filename = "path2feat_coco_train2014.pth"
filepath = os.path.join(dirname, filename)
url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/path2feat_coco_train2014.pth"
if not os.path.exists(filepath):
download_url(url=url, root=dirname, filename="path2feat_coco_train2014.pth")
path2feat = torch.load(filepath)
paths = sorted(path2feat.keys())
all_img_feats = torch.stack([path2feat[k] for k in paths], dim=0).to(device)
return path2feat, paths, all_img_feats
@st.cache(
hash_funcs={
torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
.cpu()
.numpy()
},
allow_output_mutation=True,
)
def load_feature_extractor_model(device):
model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth"
model = load_model(
"blip_feature_extractor", model_type="base", is_eval=True, device=device
)
model.load_from_pretrained(model_url)
return model
def app():
# === layout ===
model_type = st.sidebar.selectbox("Model:", ["BLIP_base", "BLIP_large"])
file_root = os.path.join(cache_root, "coco/images/train2014/")
values = [12, 24, 48]
default_layer_num = values.index(24)
num_display = st.sidebar.selectbox(
"Number of images:", values, index=default_layer_num
)
show_gradcam = st.sidebar.selectbox("Show GradCam:", [True, False], index=1)
itm_ranking = st.sidebar.selectbox("Multimodal re-ranking:", [True, False], index=0)
# st.title('Multimodal Search')
st.markdown(
"<h1 style='text-align: center;'>Multimodal Search</h1>", unsafe_allow_html=True
)
# === event ===
vis_processor = load_processor("blip_image_eval").build(image_size=384)
text_processor = load_processor("blip_caption")
user_question = st.text_input(
"Search query", "A dog running on the grass.", help="Type something to search."
)
user_question = text_processor(user_question)
feature_extractor = load_feature_extractor_model(device)
# ======= ITC =========
sample = {"text_input": user_question}
with torch.no_grad():
text_feature = feature_extractor.extract_features(
sample, mode="text"
).text_embeds_proj[0, 0]
path2feat, paths, all_img_feats = load_feat()
all_img_feats = all_img_feats.to(device)
all_img_feats = F.normalize(all_img_feats, dim=1)
num_cols = 4
num_rows = int(num_display / num_cols)
similarities = text_feature @ all_img_feats.T
indices = torch.argsort(similarities, descending=True)[:num_display]
top_paths = [paths[ind.detach().cpu().item()] for ind in indices]
sorted_similarities = [similarities[idx] for idx in indices]
filenames = [os.path.join(file_root, p) for p in top_paths]
# ========= ITM and GradCam ==========
bsz = 4 # max number of images to avoid cuda oom
if model_type.startswith("BLIP"):
blip_type = model_type.split("_")[1]
itm_model = load_blip_itm_model(device, model_type=blip_type)
tokenizer = init_bert_tokenizer()
queries_batch = [user_question] * bsz
queries_tok_batch = tokenizer(queries_batch, return_tensors="pt").to(device)
num_batches = int(num_display / bsz)
avg_gradcams = []
all_raw_images = []
itm_scores = []
for i in range(num_batches):
filenames_in_batch = filenames[i * bsz : (i + 1) * bsz]
raw_images, images = read_and_process_images(filenames_in_batch, vis_processor)
gradcam, itm_output = compute_gradcam_batch(
itm_model, images, queries_batch, queries_tok_batch
)
all_raw_images.extend([resize_img(r_img) for r_img in raw_images])
norm_imgs = [np.float32(r_img) / 255 for r_img in raw_images]
for norm_img, grad_cam in zip(norm_imgs, gradcam):
avg_gradcam = getAttMap(norm_img, grad_cam[0], blur=True)
avg_gradcams.append(avg_gradcam)
with torch.no_grad():
itm_score = torch.nn.functional.softmax(itm_output, dim=1)
itm_scores.append(itm_score)
# ========= ITM re-ranking =========
itm_scores = torch.cat(itm_scores)[:, 1]
if itm_ranking:
itm_scores_sorted, indices = torch.sort(itm_scores, descending=True)
avg_gradcams_sorted = []
all_raw_images_sorted = []
for idx in indices:
avg_gradcams_sorted.append(avg_gradcams[idx])
all_raw_images_sorted.append(all_raw_images[idx])
avg_gradcams = avg_gradcams_sorted
all_raw_images = all_raw_images_sorted
if show_gradcam:
images_to_show = iter(avg_gradcams)
else:
images_to_show = iter(all_raw_images)
for _ in range(num_rows):
with st.container():
for col in st.columns(num_cols):
col.image(next(images_to_show), use_column_width=True, clamp=True)
def read_and_process_images(image_paths, vis_processor):
raw_images = [read_img(path) for path in image_paths]
images = [vis_processor(r_img) for r_img in raw_images]
images_tensors = torch.stack(images).to(device)
return raw_images, images_tensors
def compute_gradcam_batch(model, visual_input, text_input, tokenized_text, block_num=6):
model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.save_attention = True
output = model({"image": visual_input, "text_input": text_input}, match_head="itm")
loss = output[:, 1].sum()
model.zero_grad()
loss.backward()
with torch.no_grad():
mask = tokenized_text.attention_mask.view(
tokenized_text.attention_mask.size(0), 1, -1, 1, 1
) # (bsz,1,token_len, 1,1)
token_length = mask.sum() - 2
token_length = token_length.cpu()
# grads and cams [bsz, num_head, seq_len, image_patch]
grads = model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.get_attn_gradients()
cams = model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.get_attention_map()
# assume using vit large with 576 num image patch
cams = cams[:, :, :, 1:].reshape(visual_input.size(0), 12, -1, 24, 24) * mask
grads = (
grads[:, :, :, 1:].clamp(0).reshape(visual_input.size(0), 12, -1, 24, 24)
* mask
)
gradcam = cams * grads
# [enc token gradcam, average gradcam across token, gradcam for individual token]
# gradcam = torch.cat((gradcam[0:1,:], gradcam[1:token_length+1, :].sum(dim=0, keepdim=True)/token_length, gradcam[1:, :]))
gradcam = gradcam.mean(1).cpu().detach()
gradcam = (
gradcam[:, 1 : token_length + 1, :].sum(dim=1, keepdim=True) / token_length
)
return gradcam, output
================================================
FILE: app/multipage.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
"""
This file is the framework for generating multiple Streamlit applications
through an object oriented framework.
"""
# Import necessary libraries
import streamlit as st
# Define the multipage class to manage the multiple apps in our program
class MultiPage:
"""Framework for combining multiple streamlit applications."""
def __init__(self) -> None:
"""Constructor class to generate a list which will store all our applications as an instance variable."""
self.pages = []
def add_page(self, title, func) -> None:
"""Class Method to Add pages to the project
Args:
title ([str]): The title of page which we are adding to the list of apps
func: Python function to render this page in Streamlit
"""
self.pages.append({"title": title, "function": func})
def run(self):
# Dropdown to select the page to run
page = st.sidebar.selectbox(
"Navigation", self.pages, format_func=lambda page: page["title"]
)
# run the app function
page["function"]()
================================================
FILE: app/text_localization.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import math
import numpy as np
import streamlit as st
from lavis.models.blip_models.blip_image_text_matching import compute_gradcam
from lavis.processors import load_processor
from PIL import Image
from app import device, load_demo_image
from app.utils import getAttMap, init_bert_tokenizer, load_blip_itm_model
def app():
model_type = st.sidebar.selectbox("Model:", ["BLIP_base", "BLIP_large"])
values = list(range(1, 12))
default_layer_num = values.index(7)
layer_num = (
st.sidebar.selectbox("Layer number", values, index=default_layer_num) - 1
)
st.markdown(
"<h1 style='text-align: center;'>Text Localization</h1>", unsafe_allow_html=True
)
vis_processor = load_processor("blip_image_eval").build(image_size=384)
text_processor = load_processor("blip_caption")
tokenizer = init_bert_tokenizer()
instructions = "Try the provided image and text or use your own ones."
file = st.file_uploader(instructions)
query = st.text_input(
"Try a different input.", "A girl playing with her dog on the beach."
)
submit_button = st.button("Submit")
col1, col2 = st.columns(2)
if file:
raw_img = Image.open(file).convert("RGB")
else:
raw_img = load_demo_image()
col1.header("Image")
w, h = raw_img.size
scaling_factor = 720 / w
resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))
col1.image(resized_image, use_column_width=True)
col2.header("GradCam")
if submit_button:
if model_type.startswith("BLIP"):
blip_type = model_type.split("_")[1]
model = load_blip_itm_model(device, model_type=blip_type)
img = vis_processor(raw_img).unsqueeze(0).to(device)
qry = text_processor(query)
qry_tok = tokenizer(qry, return_tensors="pt").to(device)
norm_img = np.float32(resized_image) / 255
gradcam, _ = compute_gradcam(model, img, qry, qry_tok, block_num=layer_num)
avg_gradcam = getAttMap(norm_img, gradcam[0][1], blur=True)
col2.image(avg_gradcam, use_column_width=True, clamp=True)
num_cols = 4.0
num_tokens = len(qry_tok.input_ids[0]) - 2
num_rows = int(math.ceil(num_tokens / num_cols))
gradcam_iter = iter(gradcam[0][2:-1])
token_id_iter = iter(qry_tok.input_ids[0][1:-1])
for _ in range(num_rows):
with st.container():
for col in st.columns(int(num_cols)):
token_id = next(token_id_iter, None)
if not token_id:
break
gradcam_img = next(gradcam_iter)
word = tokenizer.decode([token_id])
gradcam_todraw = getAttMap(norm_img, gradcam_img, blur=True)
new_title = (
'<p style="text-align: center; font-size: 25px;">{}</p>'.format(
word
)
)
col.markdown(new_title, unsafe_allow_html=True)
# st.image(image, channels="BGR")
col.image(gradcam_todraw, use_column_width=True, clamp=True)
================================================
FILE: app/utils.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import numpy as np
import streamlit as st
import torch
from lavis.models import BlipBase, load_model
from matplotlib import pyplot as plt
from PIL import Image
from scipy.ndimage import filters
from skimage import transform as skimage_transform
def resize_img(raw_img):
w, h = raw_img.size
scaling_factor = 240 / w
resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))
return resized_image
def read_img(filepath):
raw_image = Image.open(filepath).convert("RGB")
return raw_image
@st.cache(
hash_funcs={
torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
.cpu()
.numpy()
},
allow_output_mutation=True,
)
def load_model_cache(name, model_type, is_eval, device):
return load_model(name, model_type, is_eval, device)
@st.cache(allow_output_mutation=True)
def init_bert_tokenizer():
tokenizer = BlipBase.init_tokenizer()
return tokenizer
def getAttMap(img, attMap, blur=True, overlap=True):
    # normalize the attention map to [0, 1]
    attMap -= attMap.min()
    if attMap.max() > 0:
        attMap /= attMap.max()
    # upsample the map to the image resolution (order-3 spline interpolation)
    attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
    if blur:
        attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
        attMap -= attMap.min()
        attMap /= attMap.max()
    # colorize with the jet colormap and drop the alpha channel
    cmap = plt.get_cmap("jet")
    attMapV = cmap(attMap)
    attMapV = np.delete(attMapV, 3, 2)
    if overlap:
        # alpha-blend the heatmap over the image, weighting by attention strength
        attMap = (
            1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
            + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
        )
    return attMap
@st.cache(
hash_funcs={
torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
.cpu()
.numpy()
},
allow_output_mutation=True,
)
def load_blip_itm_model(device, model_type="base"):
model = load_model(
"blip_image_text_matching", model_type, is_eval=True, device=device
)
return model
================================================
FILE: app/vqa.py
================================================
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import streamlit as st
from app import load_demo_image, device
from app.utils import load_model_cache
from lavis.processors import load_processor
from PIL import Image
def app():
model_type = st.sidebar.selectbox("Model:", ["BLIP"])
# ===== layout =====
st.markdown(
"<h1 style='text-align: center;'>Visual Question Answering</h1>",
unsafe_allow_html=True,
)
instructions = """Try the provided image or upload your own:"""
file = st.file_uploader(instructions)
col1, col2 = st.columns(2)
col1.header("Image")
if file:
raw_img = Image.open(file).convert("RGB")
else:
raw_img = load_demo_image()
w, h = raw_img.size
scaling_factor = 720 / w
resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))
col1.image(resized_image, use_column_width=True)
col2.header("Question")
user_question = col2.text_input("Input your question!", "What are objects there?")
qa_button = st.button("Submit")
col2.header("Answer")
# ===== event =====
vis_processor = load_processor("blip_image_eval").build(image_size=480)
text_processor = load_processor("blip_question").build()
if qa_button:
if model_type.startswith("BLIP"):
model = load_model_cache(
"blip_vqa", model_type="vqav2", is_eval=True, device=device
)
img = vis_processor(raw_img).unsqueeze(0).to(device)
question = text_processor(user_question)
vqa_samples = {"image": img, "text_input": [question]}
answers = model.predict_answers(vqa_samples, inference_method="generate")
col2.write("\n".join(answers), use_column_width=True)
================================================
FILE: app.py
================================================
import gradio as gr
import os
import torch
from torchvision import transforms
from lavis.processors import transforms_video
from lavis.datasets.data_utils import load_video_demo
from lavis.processors.blip_processors import ToUint8, ToTHWC
from lavis.models.sevila_models.sevila import SeViLA
from typing import Optional
import warnings
# model config
img_size = 224
num_query_token = 32
t5_model = 'google/flan-t5-xl'
drop_path_rate = 0
use_grad_checkpoint = False
vit_precision = "fp16"
freeze_vit = True
prompt = ''
max_txt_len = 77
answer_num = 5
apply_lemmatizer = False
task = 'freeze_loc_freeze_qa_vid'
# prompt
LOC_propmpt = 'Does the information within the frame provide the necessary details to accurately answer the given question?'
QA_prompt = 'Considering the information presented in the frame, select the correct answer from the options.'
# processors config
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
normalize = transforms.Normalize(mean, std)
image_size = img_size
transform = transforms.Compose([ToUint8(), ToTHWC(), transforms_video.ToTensorVideo(), normalize])
print('Model Loading \nLoading the SeViLA model can take a few minutes (typically 2-3).')
sevila = SeViLA(
img_size=img_size,
drop_path_rate=drop_path_rate,
use_grad_checkpoint=use_grad_checkpoint,
vit_precision=vit_precision,
freeze_vit=freeze_vit,
num_query_token=num_query_token,
t5_model=t5_model,
prompt=prompt,
max_txt_len=max_txt_len,
apply_lemmatizer=apply_lemmatizer,
frame_num=4,
answer_num=answer_num,
task=task,
)
sevila.load_checkpoint(url_or_filename='https://huggingface.co/Shoubin/SeViLA/resolve/main/sevila_pretrained.pth')
print('Model Loaded')
ANS_MAPPING = {0 : 'A', 1 : 'B', 2 : 'C', 3 : 'D', 4 : 'E'}
# os.mkdir('video')
def sevila_demo(video,
question,
option1, option2, option3,
video_frame_num,
keyframe_num):
if torch.cuda.is_available():
device = 0
else:
device = 'cpu'
global sevila
if device == "cpu":
sevila = sevila.float()
else:
sevila = sevila.to(int(device))
vpath = video
raw_clip, indice, fps, vlen = load_video_demo(
video_path=vpath,
n_frms=int(video_frame_num),
height=image_size,
width=image_size,
sampling="uniform",
clip_proposal=None
)
clip = transform(raw_clip.permute(1,0,2,3))
clip = clip.float().to(int(device))
clip = clip.unsqueeze(0)
# check
if option1[-1] != '.':
option1 += '.'
if option2[-1] != '.':
option2 += '.'
if option3[-1] != '.':
option3 += '.'
option_dict = {0:option1, 1:option2, 2:option3}
options = 'Option A:{} Option B:{} Option C:{}'.format(option1, option2, option3)
text_input_qa = 'Question: ' + question + ' ' + options + ' ' + QA_prompt
text_input_loc = 'Question: ' + question + ' ' + options + ' ' + LOC_prompt
out = sevila.generate_demo(clip, text_input_qa, text_input_loc, int(keyframe_num))
# print(out)
answer_id = out['output_text'][0]
answer = option_dict[answer_id]
select_index = out['frame_idx'][0]
# images = []
keyframes = []
timestamps = []
# print('raw_clip', len(raw_clip))
# for j in range(int(video_frame_num)):
# image = raw_clip[:, j, :, :].int()
# image = image.permute(1, 2, 0).numpy()
# images.append(image)
video_len = vlen/fps # seconds
for i in select_index:
image = raw_clip[:, i, :, :].int()
image = image.permute(1, 2, 0).numpy()
keyframes.append(image)
select_i = indice[i]
time = round((select_i / vlen) * video_len, 2)
timestamps.append(str(time)+'s')
# the returned keyframes list populates the Gallery output declared in the Blocks UI below
timestamps_des = ''
for i in range(len(select_index)):
timestamps_des += 'Keyframe {}: {} \n'.format(str(i+1), timestamps[i])
return keyframes, timestamps_des, answer
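# Example of invoking the handler directly, outside the UI (hypothetical local path):
#   frames, stamps, ans = sevila_demo('videos/demo1.mp4',
#       'Why did the two ladies put their hands above their eyes while staring out?',
#       'practicing cheer', 'to place wreaths', 'to see better', 32, 4)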
with gr.Blocks(title="SeViLA demo") as demo:
description = """<p style="text-align: center; font-weight: bold;">
<span style="font-size: 28px">Self-Chained Image-Language Model for Video Localization and Question Answering</span>
<br>
<span style="font-size: 18px" id="author-info">
<a href="https://yui010206.github.io/" target="_blank">Shoubin Yu</a>,
<a href="https://j-min.io/" target="_blank">Jaemin Cho</a>,
<a href="https://prateek-yadav.github.io/" target="_blank">Prateek Yadav</a>,
<a href="https://www.cs.unc.edu/~mbansal/" target="_blank">Mohit Bansal</a>
</span>
<br>
<span style="font-size: 18px" id="paper-info">
[<a href="https://github.com/Yui010206/SeViLA" target="_blank">GitHub</a>]
[<a href="https://arxiv.org/abs/2305.06988" target="_blank">Paper</a>]
</span>
</p>
<p>
To locate keyframes in a video and answer a question, please:
<br>
(1) upload your video; (2) write your question/options and set # Video Frame/# Keyframe; (3) click Locate and Answer!
<br>
Just a heads up - loading the SeViLA model can take a few minutes (typically 2-3), and running examples requires about 12GB of memory.
<br>
To help you get started, we provide some example videos and questions below. Feel free to try out SeViLA with these!
</p>
"""
gr.HTML(description)
with gr.Row():
with gr.Column(scale=1, min_width=600):
video = gr.Video(label='Video')
question = gr.Textbox(placeholder="Why did the two ladies put their hands above their eyes while staring out?", label='Question')
with gr.Row():
option1 = gr.Textbox(placeholder="practicing cheer", label='Option 1')
option2 = gr.Textbox(placeholder="posing for photo", label='Option 2')
option3 = gr.Textbox(placeholder="to see better", label='Option 3')
with gr.Row():
# prefill defaults so an empty submission does not crash the int() conversion
video_frame_num = gr.Textbox(value='32', label='# Video Frame')
keyframe_num = gr.Textbox(value='4', label='# Keyframe')
# device = gr.Textbox(placeholder=0, label='Device')
gen_btn = gr.Button(value='Locate and Answer!')
with gr.Column(scale=1, min_width=600):
keyframes = gr.Gallery(
label="Keyframes", show_label=False, elem_id="gallery",
).style(columns=[4], rows=[1], object_fit="contain", max_width=100, max_height=100)
#keyframes = gr.Gallery(label='Keyframes')
timestamps = gr.outputs.Textbox(label="Keyframe Timestamps")
answer = gr.outputs.Textbox(label="Output Answer")
gen_btn.click(
sevila_demo,
inputs=[video, question, option1, option2, option3, video_frame_num, keyframe_num],
outputs=[keyframes, timestamps, answer],
queue=True
)
#demo = gr.Interface(sevila_demo,
# inputs=[gr.Video(), question, option1, option2, option3, video_frame_num, keyframe_num, device],
# outputs=['gallery', timestamps, answer],
# examples=[['videos/demo1.mp4', 'Why did the two ladies put their hands above their eyes while staring out?', 'practicing cheer.', 'play ball.', 'to see better.', 32, 4, 0],
# ['videos/demo2.mp4', 'What did both of them do after completing skiing?', 'jump and pose.' , 'bend down.','raised their hands.', 32, 4, 0],
# ['videos/demo3.mp4', 'What room was Wilson breaking into when House found him?', 'the kitchen.' , 'the dining room.','the bathroom.', 32, 4, 0]]
# )
with gr.Column():
gr.Examples(
inputs=[video, question, option1, option2, option3, video_frame_num, keyframe_num],
outputs=[keyframes, timestamps, answer],
fn=sevila_demo,
examples=[['videos/demo1.mp4', 'Why did the two ladies put their hands above their eyes while staring out?', 'practicing cheer', 'to place wreaths', 'to see better', 32, 4],
['videos/demo2.mp4', 'What did both of them do after completing skiing?', 'jump and pose' , 'bend down','raised their hands', 32, 4],
['videos/demo3.mp4', 'What room was Wilson breaking into when House found him?', 'the bedroom' , 'the bathroom','the kitchen', 32, 4]],
cache_examples=False,
)
demo.queue(concurrency_count=1, api_open=False)
demo.launch(share=False)
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/benchmark.rst
================================================
Benchmark
############
We provide scripts for evaluating and training models on task datasets. The following benchmark results are included for reference.
ALBEF
*******
.. list-table::
:widths: 30 80 20
* - **Pretraining**
- COCO (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/pretrain.sh>`__
* -
- Visual Genome (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_vg.py>`__)
-
* -
- SBU (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_sbu.py>`__)
-
* -
- CC3M (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py>`__)
-
* -
- CC12M (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py>`__)
-
.. list-table::
:widths: 30 40 20 20 20 30 30
:header-rows: 1
* -
- **Retrieval**
- **R1**
- **R5**
- **R10**
- **Training**
- **Evaluation**
* - TR
- COCO (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 77.6
- 94.1
- 97.2
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_coco_retrieval_albef.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/eval/eval_coco_retrieval.sh>`__
* - IR
- COCO (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 61.0
- 84.5
- 90.7
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_coco_retrieval_albef.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/eval/eval_coco_retrieval.sh>`__
* - TR
- Flickr30k (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_flickr.py>`__)
- 77.6
- 94.1
- 97.2
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_flickr30k_retrieval_albef.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/eval/eval_flickr30k_retrieval.sh>`__
* - IR
- Flickr30k (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_flickr.py>`__)
- 61.0
- 84.5
- 90.7
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_flickr30k_retrieval_albef.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/eval/eval_flickr30k_retrieval.sh>`__
.. list-table::
:widths: 20 20 20 20 20
:header-rows: 1
* - **VQA**
- **test-dev**
- **test-std/test**
- **Training**
- **Evaluation**
* - VQAv2 (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 76.35
- 76.54
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_vqa_albef.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/eval/test_albef_vqa.sh>`__
* - OKVQA (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- NA
- 54.7
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_okvqa_albef.sh>`__
- NA
* - AOKVQA (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 54.5
- NA
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_aokvqa_albef.sh>`__
- NA
.. list-table::
:widths: 20 20 20 20 20
:header-rows: 1
* - **Multimodal Classification**
- **val**
- **test**
- **Training**
- **Evaluation**
* - SNLI-VE (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 80.60
- 81.04
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_ve_albef.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/eval/eval_albef_ve.sh>`__
* - NLVR2 (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 82.47
- 82.91
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_nlvr_albef.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/eval/eval_albef_nlvr.sh>`__
BLIP
*******
.. list-table::
:widths: 30 80 20
* - **Pretraining (14M)**
- COCO (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/train/pretrain.sh>`__
* -
- Visual Genome (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_vg.py>`__)
-
* -
- SBU (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_sbu.py>`__)
-
* -
- CC3M (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py>`__)
-
* -
- CC12M (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py>`__)
-
.. list-table::
:widths: 30 40 20 20 20 30 30
:header-rows: 1
* - **Tasks**
- **Retrieval**
- **R1**
- **R5**
- **R10**
- **Training**
- **Evaluation**
* - TR
- COCO (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 82.0
- 95.8
- 98.1
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/train/train_retrieval_coco.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/eval/eval_ret_coco.sh>`__
* - IR
- COCO (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 64.5
- 86.0
- 91.7
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/train/train_retrieval_coco.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/eval/eval_ret_coco.sh>`__
* - TR
- Flickr30k (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_flickr.py>`__)
- 96.9
- 99.9
- 100.0
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/train/train_retrieval_flickr.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/eval/eval_ret_flickr.sh>`__
* - IR
- Flickr30k (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_flickr.py>`__)
- 87.5
- 97.6
- 98.9
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/train/train_retrieval_flickr.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/eval/eval_ret_flickr.sh>`__
.. list-table::
:widths: 20 20 20 20 20
:header-rows: 1
* - **VQA**
- **test-dev**
- **test-std/test**
- **Training**
- **Evaluation**
* - VQAv2 (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 78.23
- 78.29
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/train/train_vqa_albef.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/albef/eval/test_albef_vqa.sh>`__
* - OKVQA (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- NA
- 55.4
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/train/train_okvqa.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/eval/eval_okvqa.sh>`__
* - AOKVQA (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 56.2
- 50.1
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/train/train_aokvqa.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/eval/eval_aokvqa.sh>`__
.. list-table::
:widths: 20 20 20 20 20 20
:header-rows: 1
* - **Image Captioning**
- **BLEU@4**
- **CIDEr**
- **SPICE**
- **Training**
- **Evaluation**
* - COCO (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 39.9
- 133.5
- 23.7
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/train/train_caption_coco.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/eval/eval_coco_cap.sh>`__
* - NoCaps (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_nocaps.py>`__)
- 31.9
- 109.1
- 14.7
- NA
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/eval/eval_nocaps.sh>`__
.. list-table::
:widths: 20 20 20 20 20
:header-rows: 1
* - **Multimodal Classification**
- **val**
- **test**
- **Training**
- **Evaluation**
* - NLVR2 (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 82.48
- 83.25
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/train/train_nlvr.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/blip/eval/eval_nlvr.sh>`__
CLIP
*******
.. list-table::
:widths: 30 40 20 20 20 30
:header-rows: 1
* - **Tasks**
- **Retrieval (Zero-shot)**
- **R1**
- **R5**
- **R10**
- **Evaluation**
* - TR
- COCO (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 57.2
- 80.5
- 87.8
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/clip/eval/eval_clip_ret_coco.sh>`__
* - IR
- COCO (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_coco.py>`__)
- 36.5
- 60.8
- 71.0
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/clip/eval/eval_clip_ret_coco.sh>`__
* - TR
- Flickr30k (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_flickr.py>`__)
- 86.5
- 98.0
- 99.1
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/clip/eval/eval_clip_ret_flickr.sh>`__
* - IR
- Flickr30k (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_flickr.py>`__)
- 67.0
- 88.9
- 93.3
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/clip/eval/eval_clip_ret_flickr.sh>`__
.. list-table::
:widths: 20 20 20
:header-rows: 1
* - **Multimodal Classification**
- **val**
- **Evaluation**
* - ImageNet
- 76.5
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/clip/eval/eval_clip_zs_imnet.sh>`__
ALPRO
*******
.. list-table::
:widths: 30 40 20 20 20 20 30
:header-rows: 1
* - **Tasks**
- **Retrieval**
- **R1**
- **R5**
- **R10**
- **Training**
- **Evaluation**
* - TR
- MSRVTT (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_msrvtt.py>`__)
- 33.2
- 60.5
- 71.7
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/train/train_msrvtt_ret.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/eval/eval_msrvtt_ret.sh>`__
* - VR
- MSRVTT (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_msrvtt.py>`__)
- 33.8
- 61.4
- 72.7
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/train/train_msrvtt_ret.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/eval/eval_msrvtt_ret.sh>`__
* - TR
- DiDeMo (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_didemo.py>`__)
- 38.8
- 66.4
- 76.8
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/train/train_didemo_ret.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/eval/eval_didemo_ret.sh>`__
* - VR
- DiDeMo (`download <https://github.com/salesforce/LAVIS/blob/main/lavis/datasets/download_scripts/download_didemo.py>`__)
- 36.6
- 67.5
- 77.9
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/train/train_didemo_ret.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/eval/eval_didemo_ret.sh>`__
.. list-table::
:widths: 20 20 20 20
:header-rows: 1
* - **Video QA**
- **test**
- **Training**
- **Evaluation**
* - MSRVTT
- 42.1
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/train/train_msrvtt_qa.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/eval/eval_msrvtt_qa.sh>`__
* - MSVD
- 46.0
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/train/train_msvd_qa.sh>`__
- `script <https://github.com/salesforce/LAVIS/blob/main/run_scripts/alpro/eval/eval_msvd_qa.sh>`__
================================================
FILE: docs/build_docs.sh
================================================
#!/bin/bash
set -euo pipefail
# Change to root directory of repo
DIRNAME=$(cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
cd "${DIRNAME}/.."
# # Set up virtual environment
pip3 install setuptools wheel virtualenv
if [ ! -d venv ]; then
rm -f venv
virtualenv venv
fi
source venv/bin/activate
# # Get current git branch & stash unsaved changes
GIT_BRANCH=$(git branch --show-current)
if [ -z "${GIT_BRANCH}" ]; then
GIT_BRANCH="main"
fi
git stash
# Set up exit handler to restore git state & delete temp branches
# function exit_handler {
# git reset --hard
# git checkout "${GIT_BRANCH}" --
# git stash pop || true
# for version in $(git tag --list 'v[0-9]*'); do
# branch="${version}_local_docs_only"
# if git show-ref --verify --quiet "refs/heads/$branch"; then
# git branch -D "$branch"
# fi
# done
# }
# trap exit_handler EXIT
# Clean up build directory and install Sphinx requirements
pip3 install -r "${DIRNAME}/requirements.txt"
sphinx-build -M clean "${DIRNAME}" "${DIRNAME}/_build"
# Build API docs for current head
export current_version="latest"
pip3 install "."
sphinx-build -b html "${DIRNAME}" "${DIRNAME}/_build/html/${current_version}" -W --keep-going
rm -rf "${DIRNAME}/_build/html/${current_version}/.doctrees"
#pip3 uninstall -y omnixai
# Install all previous released versions
# and use them to build the appropriate API docs.
# Uninstall after we're done with each one.
# versions=()
# checkout_files=("${DIRNAME}/*.rst" "lavis" "tutorials" "setup.py")
# for version in $(git tag --list 'v[0-9]*'); do
# versions+=("$version")
# git checkout -b "${version}_local_docs_only"
# for f in $(git diff --name-only --diff-filter=A "tags/${version}" "${DIRNAME}/*.rst"); do
# git rm "$f"
# done
# git checkout "tags/${version}" -- "${checkout_files[@]}"
# export current_version=${version}
# pip3 install ".[all]"
# sphinx-build -b html "${DIRNAME}" "${DIRNAME}/_build/html/${current_version}" -W --keep-going
# rm -rf "${DIRNAME}/_build/html/${current_version}/.doctrees"
# #pip3 uninstall -y omnixai
# git reset --hard
# git checkout "${GIT_BRANCH}" --
# done
# Determine the latest stable version if there is one
# if (( ${#versions[@]} > 0 )); then
# stable_hash=$(git rev-list --tags --max-count=1)
# stable_version=$(git describe --tags "$stable_hash")
# export stable_version
# else
export stable_version="latest"
# fi
# Create dummy HTML's for the stable version in the base directory
while read -r filename; do
filename=$(echo "$filename" | sed "s/\.\///")
n_sub=$(echo "$filename" | (grep -o "/" || true) | wc -l)
prefix=""
for (( i=0; i<n_sub; i++ )); do
prefix+="../"
done
url="${prefix}${stable_version}/$filename"
mkdir -p "${DIRNAME}/_build/html/$(dirname "$filename")"
cat > "${DIRNAME}/_build/html/$filename" <<EOF
<!DOCTYPE html>
<html>
<head>
<title>LAVIS Documentation</title>
<meta http-equiv = "refresh" content="0; url='$url'" />
</head>
<body>
<p>Please wait while you're redirected to our <a href="$url">documentation</a>.</p>
</body>
</html>
EOF
done < <(cd "${DIRNAME}/_build/html/$stable_version" && find . -name "*.html")
echo "Finished writing to _build/html."
================================================
FILE: docs/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = "LAVIS"
copyright = "2022, salesforce.com inc."
author = (
"Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven C.H. Hoi"
)
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ["nbsphinx"]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# html_theme = "alabaster"
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
# pygments_style = "sphinx"
================================================
FILE: docs/getting_started.rst
================================================
Dataset Zoo
##################
LAVIS inherently supports a wide variety of common language-vision datasets: it provides automatic download scripts to help download and organize these datasets,
and implements PyTorch datasets for them. To view the supported datasets, use the following code:
.. code-block:: python
from lavis.datasets.builders import dataset_zoo
dataset_names = dataset_zoo.get_names()
print(dataset_names)
# ['aok_vqa', 'coco_caption', 'coco_retrieval', 'coco_vqa', 'conceptual_caption_12m',
# 'conceptual_caption_3m', 'didemo_retrieval', 'flickr30k', 'imagenet', 'laion2B_multi',
# 'msrvtt_caption', 'msrvtt_qa', 'msrvtt_retrieval', 'msvd_caption', 'msvd_qa', 'nlvr',
# 'nocaps', 'ok_vqa', 'sbu_caption', 'snli_ve', 'vatex_caption', 'vg_caption', 'vg_vqa']
print(len(dataset_names))
# 23
Auto-Downloading and Loading Datasets
######################################
We now take the COCO caption dataset as an example to demonstrate how to download and prepare the dataset.
In ``lavis/datasets/download_scripts/``, we provide tools to download most common public language-vision datasets supported by LAVIS.
The COCO caption dataset uses images from the COCO dataset. Therefore, we first download the COCO images via:
.. code-block:: bash
cd lavis/datasets/download_scripts/ && python download_coco.py
This will automatically download and extract COCO images to the default LAVIS cache location.
The default cache location is ``~/.cache/lavis``, defined in ``lavis/configs/default.yaml``.
After downloading the images, we can use ``load_dataset()`` to obtain the dataset. On the first run, this will automatically download and cache annotation files.
.. code-block:: python
from lavis.datasets.builders import load_dataset
coco_dataset = load_dataset("coco_caption")
print(coco_dataset.keys())
# dict_keys(['train', 'val', 'test'])
print(len(coco_dataset["train"]))
# 566747
print(coco_dataset["train"][0])
# {'image': <PIL.Image.Image image mode=RGB size=640x480>,
# 'text_input': 'A woman wearing a net on her head cutting a cake. ',
# 'image_id': 0}
If you already host a local copy of the dataset, you can pass in the ``vis_path`` argument to change the default location to load images.
.. code-block:: python
coco_dataset = load_dataset("coco_caption", vis_path=YOUR_LOCAL_PATH)
Model Zoo
####################################
LAVIS supports a growing list of pre-trained models for different tasks and
datasets, at varying sizes. Let's get started by viewing the supported models.
.. code-block:: python
from lavis.models import model_zoo
print(model_zoo)
# ==================================================
# Architectures Types
# ==================================================
# albef_classification base, ve
# albef_nlvr base
# albef_pretrain base
# albef_retrieval base, coco, flickr
# albef_vqa base, vqav2
# alpro_qa base, msrvtt, msvd
# alpro_retrieval base, msrvtt, didemo
# blip_caption base, base_coco, large, large_coco
# blip_classification base
# blip_feature_extractor base
# blip_nlvr base
# blip_pretrain base
# blip_retrieval base, coco, flickr
# blip_vqa base, vqav2
# clip ViT-B-32, ViT-B-16, ViT-L-14, ViT-L-14-336, RN50
# show the total number of supported model variants
len(model_zoo)
# 33
Inference with Pre-trained Models
####################################
Now let's see how to use models in LAVIS to perform inference on example data. We first
load a sample image from a local path.
.. code-block:: python
import torch
from PIL import Image
# setup device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# load sample image
raw_image = Image.open("docs/_static/merlion.png").convert("RGB")
This example image shows `Merlion park <https://en.wikipedia.org/wiki/Merlion>`_ (`image credit <https://theculturetrip.com/asia/singapore/articles/what-exactly-is-singapores-merlion-anyway/>`_), a landmark in Singapore.
.. image:: _static/merlion.png
Image Captioning
*******************************
We now use the BLIP model to generate a caption for the image. To make inference even easier, we also associate each
pre-trained model with its preprocessors (transforms). We use ``load_model_and_preprocess()`` with the following arguments:
- ``name``: The name of the model to load. This could be a pre-trained model, task model, or feature extractor. See ``model_zoo`` for a full list of model names.
- ``model_type``: Each architecture has variants trained on different datasets and at different scales. See the Types column in ``model_zoo`` for a full list of model types.
- ``is_eval``: if ``True``, set the model to evaluation mode. This is desired for inference or feature extraction.
- ``device``: device to load the model to.
.. code-block:: python
from lavis.models import load_model_and_preprocess
# loads BLIP caption base model, with finetuned checkpoints on MSCOCO captioning dataset.
# this also loads the associated image processors
model, vis_processors, _ = load_model_and_preprocess(name="blip_caption", model_type="base_coco", is_eval=True, device=device)
# preprocess the image
# vis_processors stores image transforms for "train" and "eval" (validation / testing / inference)
image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
# generate caption
model.generate({"image": image})
# ['a large fountain spewing water into the air']
You may also load models and their preprocessors separately via ``load_model()`` and ``load_processor()``.
In BLIP, you can also generate diverse captions by turning nucleus sampling on.
.. code-block:: python
from lavis.processors import load_processor
from lavis.models import load_model
# load the image preprocessor used for BLIP
vis_processor = load_processor("blip_image_eval").build(image_size=384)
model = load_model(name="blip_caption", model_type="base_coco", is_eval=True, device=device)
image = vis_processor(raw_image).unsqueeze(0).to(device)
model.generate({"image": image}, use_nucleus_sampling=True)
# one generated random sample: ['some very pretty buildings and some water jets']
Visual question answering (VQA)
*******************************
The BLIP model is able to answer free-form questions about images in natural language.
To access the VQA model, simply replace the ``name`` and ``model_type`` arguments
passed to ``load_model_and_preprocess()``.
.. code-block:: python
from lavis.models import load_model_and_preprocess
model, vis_processors, txt_processors = load_model_and_preprocess(name="blip_vqa", model_type="vqav2", is_eval=True, device=device)
# ask a random question.
question = "Which city is this photo taken?"
image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
question = txt_processors["eval"](question)
model.predict_answers(samples={"image": image, "text_input": question}, inference_method="generate")
# ['singapore']
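Besides open-ended generation, ``predict_answers()`` also supports a ranking mode that scores a fixed list of candidate answers, which is how VQAv2-style evaluation is commonly run. A minimal sketch, reusing ``model``, ``image`` and ``question`` from above; the three-answer candidate list is a toy placeholder, so pass your task's full answer vocabulary in practice:
.. code-block:: python
answer_candidates = ["singapore", "beijing", "new york"]  # hypothetical toy list
model.predict_answers(
samples={"image": image, "text_input": question},
answer_list=answer_candidates,
inference_method="rank",
num_ans_candidates=3,
)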
Unified Feature Extraction Interface
####################################
LAVIS provides a unified interface to extract multimodal features from each architecture.
To extract features, we load the feature extractor variants of each model.
The multimodal feature can be used for multimodal classification. The low-dimensional unimodal features can be used to compute cross-modal similarity.
.. code-block:: python
from lavis.models import load_model_and_preprocess
model, vis_processors, txt_processors = load_model_and_preprocess(name="blip_feature_extractor", model_type="base", is_eval=True, device=device)
caption = "a large fountain spewing water into the air"
image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
text_input = txt_processors["eval"](caption)
sample = {"image": image, "text_input": [text_input]}
features_multimodal = model.extract_features(sample)
print(features_multimodal.keys())
# odict_keys(['image_embeds', 'multimodal_embeds'])
print(features_multimodal.multimodal_embeds.shape)
# torch.Size([1, 12, 768]), use features_multimodal[:, 0, :] for multimodal classification tasks
features_image = model.extract_features(sample, mode="image")
print(features_image.keys())
# odict_keys(['image_embeds', 'image_embeds_proj'])
print(features_image.image_embeds.shape)
# torch.Size([1, 197, 768])
print(features_image.image_embeds_proj.shape)
# torch.Size([1, 197, 256])
features_text = model.extract_features(sample, mode="text")
print(features_text.keys())
# odict_keys(['text_embeds', 'text_embeds_proj'])
print(features_text.text_embeds.shape)
# torch.Size([1, 12, 768])
print(features_text.text_embeds_proj.shape)
# torch.Size([1, 12, 256])
similarity = features_image.image_embeds_proj[:, 0, :] @ features_text.text_embeds_proj[:, 0, :].t()
print(similarity)
# tensor([[0.2622]])
Since LAVIS supports a unified feature extraction interface, minimal changes are necessary to use a different model as feature extractor. For example,
to use ALBEF as the feature extractor, one only needs to change the following line:
.. code-block:: python
model, vis_processors, txt_processors = load_model_and_preprocess(name="albef_feature_extractor", model_type="base", is_eval=True, device=device)
Similarly, to use CLIP as feature extractor:
.. code-block:: python
model, vis_processors, txt_processors = load_model_and_preprocess(name="clip_feature_extractor", model_type="base", is_eval=True, device=device)
# model, vis_processors, txt_processors = load_model_and_preprocess(name="clip_feature_extractor", model_type="RN50", is_eval=True, device=device)
# model, vis_processors, txt_processors = load_model_and_preprocess(name="clip_feature_extractor", model_type="ViT-L-14", is_eval=True, device=device)
================================================
FILE: docs/index.rst
================================================
.. LAVIS documentation master file, created by
sphinx-quickstart on Sun Jul 31 10:32:27 2022.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to LAVIS's documentation!
=================================
.. toctree::
:maxdepth: 1
:caption: Introduction
intro
.. toctree::
:maxdepth: 1
:caption: Getting Started
getting_started
.. :maxdepth: 1
.. :caption: Advanced Training
.. advanced_training
.. toctree::
:maxdepth: 2
:caption: Advanced Usage
benchmark
tutorial
.. Documentations
.. ===================
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
================================================
FILE: docs/intro.rst
================================================
What is LAVIS?
####################################
LAVIS is a Python deep learning library for LAnguage-and-VISion research and applications.
It features a unified design to access state-of-the-art foundation language-vision models (`ALBEF <https://arxiv.org/pdf/2107.07651.pdf>`_,
`BLIP <https://arxiv.org/pdf/2201.12086.pdf>`_, `ALPRO <https://arxiv.org/pdf/2112.09583.pdf>`_, `CLIP <https://arxiv.org/pdf/2103.00020.pdf>`_), common tasks
(retrieval, captioning, visual question answering, multimodal classification, etc.) and datasets (COCO, Flickr, NoCaps, Conceptual
Captions, SBU, etc.).
This library aims to provide engineers and researchers with a one-stop solution to rapidly develop models for their specific multimodal
scenarios, and benchmark them across standard and customized datasets.
Key features of LAVIS include:
- **Modular and Extensible Library Design**: makes it easy to utilize and repurpose existing modules (datasets, models, preprocessors), and to add new modules.
- **Easy Off-the-shelf Inference and Feature Extraction**: readily available pre-trained models let you take advantage of state-of-the-art multimodal understanding and generation capabilities on your own data.
- **Reproducible Model Zoo**: provided training/pre-training recipes to easily replicate and extend state-of-the-art models.
- **Dataset Zoo and Automatic Downloading Tools**: it can be a hassle to prepare the many language-vision datasets. LAVIS provides automatic downloading scripts to help prepare a large variety of datasets and their annotations.
Other features include:
- **Distributed Training** using multiple GPUs on one machine or across multiple machines.
- **Web Demo**: try supported models on your own pictures, questions etc.
- **Leaderboard**: comparing state-of-the-art models across standard datasets.
- **Dataset Explorer**: help browse and understand language-vision datasets.
Supported Tasks, Models and Datasets
####################################
The following table shows the models and language-vision tasks supported by LAVIS. Adapting existing models to more tasks is possible, with support for additional tasks planned in future releases.
======================================== =========================== ============================================= ============
Tasks Supported Models Supported Datasets Modalities
======================================== =========================== ============================================= ============
Image-text Pre-training ALBEF, BLIP COCO, VisualGenome, SBU, ConceptualCaptions image, text
Image-text Retrieval ALBEF, BLIP, CLIP COCO, Flickr30k image, text
Text-image Retrieval ALBEF, BLIP, CLIP COCO, Flickr30k image, text
Visual Question Answering ALBEF, BLIP VQAv2, OKVQA, A-OKVQA image, text
Image Captioning BLIP COCO, NoCaps image, text
Image Classification CLIP ImageNet image
Natural Language Visual Reasoning (NLVR) ALBEF, BLIP NLVR2 image, text
Visual Entailment (VE) ALBEF SNLI-VE image, text
Visual Dialogue BLIP VisDial image, text
Video-text Retrieval BLIP, ALPRO MSRVTT, DiDeMo video, text
Text-video Retrieval BLIP, ALPRO MSRVTT, DiDeMo video, text
Video Question Answering (VideoQA) BLIP, ALPRO MSRVTT, MSVD video, text
Video Dialogue VGD-GPT AVSD video, text
Multimodal Feature Extraction ALBEF, CLIP, BLIP, ALPRO customized image, text
======================================== =========================== ============================================= ============
Library Design
####################################
.. image:: _static/architecture.png
:width: 550
LAVIS has six key modules.
- ``lavis.runners`` manages the overall training and evaluation lifecycle. It is also responsible for creating required components lazily as per demand, such as optimizers, learning rate schedulers and dataloaders. Currently ``RunnerBase`` implements epoch-based training and ``RunnerIter`` implements iteration-based training.
- ``lavis.tasks`` implements concrete training and evaluation logic per task. A task could be, for example, retrieval, captioning, or pre-training. The rationale for a task abstraction is to accommodate task-specific training and evaluation. For example, evaluating a retrieval model is different from evaluating a classification model.
- ``lavis.datasets`` is responsible for creating datasets, where ``lavis.datasets.builders`` loads dataset configurations, downloads annotations and returns a dataset object; ``lavis.datasets.datasets`` defines the supported datasets, each of which is a ``torch.utils.data.Dataset`` instance. We also provide `automatic dataset downloading tools` in ``datasets/download_scripts`` to help prepare common public datasets.
- ``lavis.models`` holds definitions for the supported models and shared model layers.
- ``lavis.processors`` handles preprocessing of text and images/videos before feeding them to the model. For images and videos, a processor can be thought of as the transforms in torchvision; for text input, this may include lowercasing, truncation, etc.
- ``lavis.common`` module contains shared classes and methods used by multiple other modules. For example,
- ``lavis.common.config`` contains classes to store and manipulate configuration files used by LAVIS. In particular, we use a hierarchical configuration design to allow highly customizable training and evaluation.
- ``lavis.common.registry`` serves as a centralized place to manage modules that share the same functionalities. It allows building datasets, models, tasks, and learning rate schedulers at runtime, by specifying their names as strings in the configuration file (see the sketch after this list).
- ``lavis.common.optims`` contains definitions of learning rate schedulers.
- ``lavis.common.dist_utils`` contains utilities for distributed training and evaluation.
- ``lavis.common.utils`` contains miscellaneous utilities, mostly IO-related helper functions.
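For example, the registry lets you look up a registered class by the same name used in configuration files. A minimal sketch (the lookup methods are defined in ``lavis.common.registry``; the printed class path is illustrative):
.. code-block:: python
from lavis.common.registry import registry
model_cls = registry.get_model_class("blip_caption")
print(model_cls)
# e.g. <class 'lavis.models.blip_models.blip_caption.BlipCaption'>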
Installation
############
1. (Optional) Creating conda environment
.. code-block:: bash
conda create -n lavis python=3.8
conda activate lavis
2. Cloning and building from source
.. code-block:: bash
git clone https://github.com/salesforce/LAVIS.git
cd LAVIS
pip install .
If you would like to develop on LAVIS, you may find it easier to build with editable mode::
pip install -e .
================================================
FILE: docs/make.bat
================================================
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
================================================
FILE: docs/requirements.txt
================================================
GitPython
ipykernel
nbsphinx==0.8.7
pandoc
sphinx
sphinx_autodoc_typehints
sphinx_rtd_theme
================================================
FILE: docs/tutorial.configs.rst
================================================
.. _config:
Training Models on Task Datasets (Commands and Configurations)
#################################################################
LAVIS provides scripts to pre-train and finetune supported models on standard language-vision tasks, stored at ``lavis/run_scripts/``.
To replicate the experiments, just run these bash scripts. For example, to train the BLIP model on the image-text retrieval task with the MSCOCO dataset, we can run
.. code-block::
bash run_scripts/lavis/blip/train/train_retrieval_coco.sh
Inside the scripts, we can see
.. code-block:: bash
python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/retrieval_coco_ft.yaml
where we start PyTorch distributed training on 8 GPUs (adjust this to your own hardware setup). The ``--cfg-path`` specifies a `runtime configuration file`, which defines
the task, model, dataset and training recipes.
Available options and their descriptions are as below.
.. LAVIS executes training and evaluation based on arguments specified in the configuration files. The default model and dataset configurations are defined in ``lavis/configs``. The task-specific configurations are defined in ``lavis/projects``. Task-specific configurations have higher priority over the default configurations.
.. The following tables provide explanations for the arguments in the configuration files.
.. list-table::
:widths: 30 40
:header-rows: 1
* - Model Configurations
- Functionalities
* - arch
- | name of the model from the model zoo
| default: task-dependent
* - model_type
- | the type of the model (e.g., base)
| default: task-dependent
* - load_pretrained
- | load pretrained weights
| default: True (for finetuning task) | False (for pretraining task)
* - load_finetuned
- | load task-specific finetuned weights
| default: False (for finetuning task) | True (for evaluation)
* - pretrained
- | URL or local path which stores the pretrained model, defined in the default model configuration file
| default: task-dependent
* - finetuned
- | URL or local path which stores the finetuned model, defined in the default model configuration file
| default: task-dependent
.. list-table::
:widths: 30 50
:header-rows: 1
* - Dataset Configurations
- Functionalities
* - vis_processor
- | pre-processing of visual input
| default: task-dependent
* - text_processor
- | pre-processing of text input
| default: task-dependent
* - build_info
- | dataset information including the storage location, defined in the default dataset configuration file
| default: task-dependent
.. list-table::
:widths: 30 50
:header-rows: 1
* - Runtime Configurations
- Functionalities
* - task
- | name of the task
| default: task-dependent
* - lr_sched
- | learning rate scheduler
| default: linear_warmup_cosine_lr
* - init_lr
- | initial learning rate (after warmup)
| default: task-dependent
* - min_lr
- | final learning rate after decay
| default: task-dependent
* - warmup_lr
- | starting learning rate for warmup
| default: init_lr (no warmup)
* - lr_decay_rate
- | learning rate decay per epoch for step_lr_schedule
| default: 0.9
* - warmup_steps
- | number of steps for learning rate warmup
| default: 0
* - max_epoch
- | total number of training epochs
| default: task-dependent
* - weight_decay
- | weight decay coefficient for the optimizer
| default: 0.05
* - batch_size_train
- | batch size during training
| default: task-dependent
* - batch_size_eval
- | batch size during evaluation
| default: task-dependent
* - seed
- | pseudo random number generator seed
| default: 42
* - output_dir
- | directory to store logs, results and checkpoints
| default: task-dependent
* - resume_ckpt_path
- | path of the checkpoint to resume training from
| default: None
* - evaluate
- | only perform evaluation without training
| default: False
* - train_splits
- | dataset splits used for training
| default: ["train"]
* - valid_splits
- | dataset splits used for validation
| default: ["val"]
* - test_splits
- | dataset splits used for test
| default: ["test"]
* - device
- | use cpu or gpu (cuda)
| default: cuda
* - world_size
- | number of processes participating in the job
| default: 1
* - dist_url
- | URL specifying how to initialize the process group
| default: "env://"
* - distributed
- | use distributed training
| default: True
* - amp
- | use automatic mixed precision training
| default: False
.. list-table::
:widths: 40 50
:header-rows: 1
* - Text Generation Configurations
- Functionalities
* - max_len
- | maximum number of text tokens to generate
| default: 20 (for image captioning)
* - min_len
- | minimum number of text tokens to generate
| default: 5 (for image captioning)
* - num_beams
- | number of beams to perform beam search
| default: 3
.. list-table::
:widths: 40 50
:header-rows: 1
* - Multimodal Retrieval Configurations
- Functionalities
* - negative_all_rank
- | collect negatives from all processes for the image-text matching loss
| default: True (for coco)
* - k_test
- | number of retrieval candidates ranked from contrastive similarity
| default: 256 (for coco)
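To see how these options come together, you can load a project configuration file and inspect or override it programmatically. A minimal sketch, assuming OmegaConf (which LAVIS uses for its configuration files) is installed; the key names follow the tables above:
.. code-block:: python
from omegaconf import OmegaConf
cfg = OmegaConf.load("lavis/projects/blip/train/retrieval_coco_ft.yaml")
print(cfg.run.init_lr, cfg.run.batch_size_train)
cfg.run.batch_size_train = 16  # e.g. shrink the batch to fit a smaller GPU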
================================================
FILE: docs/tutorial.datasets.rst
================================================
Adding Datasets
################################################
This is a tutorial on adding a new dataset using ``lavis.datasets`` module.
The LAVIS library includes a standard dataset module, which allows customization to add new datasets.
The ``lavis.datasets`` module is designed so that any new dataset class can be easily added and adapted from our code base; this includes creating a dataset configuration, and defining and associating new dataset classes.
In this tutorial, we will replicate the steps to add a dataset class for the `Audio-Visual Scene-Aware Dialogue (AVSD) <https://arxiv.org/pdf/1901.09107.pdf>`_ benchmark for the video-grounded dialogue task.
Dataset Configuration ``lavis.configs.datasets``
**************************************************************
First, we define the basic configurations for this dataset, including a new dataset class ``avsd_dialogue``, dataset card, and data types.
We can define any new dataset configuration in ``lavis.configs.datasets``. For instance, under this module, we can set up a configuration file ``avsd/defaults_dial.yaml`` as follows:
.. code-block:: yaml
datasets:
avsd_dialogue: # name of the dataset builder
dataset_card: dataset_card/avsd_dialogue.md # path to the dataset card
data_type: features # [images|videos|features] we use features in this case for extracted video features
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url: /export/home/data/avsd/train_set4DSTC7-AVSD.json
storage: avsd/annotations/train.json
val:
url: /export/home/data/avsd/valid_set4DSTC7-AVSD.json
storage: avsd/annotations/val.json
test:
url: /export/home/data/avsd/test_set4DSTC7-AVSD.json
storage: avsd/annotations/test.json
features:
storage: /export/home/data/avsd/features/
Dataset Card
===============
One optional step in setting up the dataset configuration is defining a dataset card, which contains more details about the dataset such as its description, tasks, and metrics.
For instance, we can define a dataset card for the AVSD benchmark in ``dataset_card/avsd_dialogue.md``.
Where a dataset supports it, we include in its dataset card the command for auto-downloading data (with Python code defined in ``lavis.datasets.download_scripts``) that automatically loads the data and stores it in a specific folder.
Otherwise, the dataset card should describe the external download instructions from the original data source so the dataset can be loaded properly.
One example of a dataset card for the AVSD benchmark is:
.. code-block:: md
(Samples from the AVSD dataset. Image credit: "https://arxiv.org/pdf/1901.09107.pdf")
# Audio-Visual Scene-Aware Dialogues (AVSD)
## Description
[Audio-Visual Scene-Aware Dialogues (AVSD)](https://github.com/hudaAlamri/DSTC7-Audio-Visual-Scene-Aware-Dialog-AVSD-Challenge) contains more than 10,000 dialogues, each of which is grounded on a unique video. In the test split, for each test sample, 6 reference dialogue responses are provided.
## Task
(https://github.com/hudaAlamri/DSTC7-Audio-Visual-Scene-Aware-Dialog-AVSD-Challenge)
In a **video-grounded dialogue task**, the system must generate responses to user input in the context of a given dialog.
This context consists of a dialog history (previous utterances by both user and system) in addition to video and audio information that comprise the scene. The quality of a system’s automatically generated sentences is evaluated using objective measures to determine whether or not the generated responses are natural and informative
## Metrics
Models are typically evaluated according to [BLEU](https://aclanthology.org/P02-1040/), [CIDER](https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Vedantam_CIDEr_Consensus-Based_Image_2015_CVPR_paper.pdf), [METEOR](https://aclanthology.org/W05-0909/), and [ROUGE-L](https://aclanthology.org/W04-1013/) metrics.
## Leaderboard
....
## Auto-Downloading
Please refer to the [benchmark website](https://github.com/hudaAlamri/DSTC7-Audio-Visual-Scene-Aware-Dialog-AVSD-Challenge) for instructions to download the dataset.
## References
"Audio Visual Scene-Aware Dialog", Huda Alamri, Vincent Cartillier, Abhishek Das, Jue Wang, Anoop Cherian, Irfan Essa, Dhruv Batra, Tim K. Marks, Chiori Hori, Peter Anderson, Stefan Lee, Devi Parikh
Visual Data Type
==============================
We currently limit the visual data types to one of three options: ``images``, ``videos``, and ``features``.
"Images" and "videos" refer to the raw visual data, which is appropriate for models processing visual data in their original forms (e.g. ViT models).
"Features" are visual representations extracted from pretrained models (e.g. CNN models).
In this tutorial, the AVSD benchmark consists of video features extracted from 3D-CNN models.
Build Info
==============================
Build info refers to the specific locations where data is stored and cached.
For text annotations (e.g. captioning or dialogues), by default, we include three data splits, namely "train", "val", and "test", typically used in all machine learning projects.
For each split, we specify 2 parameters: ``url`` and ``storage``.
``url`` can be either an online URL where the dataset can be loaded automatically (e.g. from *googleapis*), or a local directory where data is already downloaded beforehand.
``storage`` is the directory where the data will be cached over time, avoiding downloading data repeatedly.
For visual data annotations, ensure the field name matches the data types defined earlier (i.e. one of "images", "videos" or "features").
As visual features are usually large and should be downloaded beforehand, we maintain only a ``storage`` parameter where visual data is cached.
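Relative ``storage`` paths are resolved against the LAVIS cache root (``~/.cache/lavis`` by default, see ``lavis/configs/default.yaml``). To check where a cached file will land, a short sketch using the helper LAVIS relies on internally:
.. code-block:: python
from lavis.common.utils import get_cache_path
print(get_cache_path("avsd/annotations/train.json"))
# e.g. /home/<user>/.cache/lavis/avsd/annotations/train.json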
Dataset ``lavis.datasets.datasets``
**************************************************************
Base Dataset ``lavis.datasets.datasets.base_dataset``
=======================================================
In this step, we want to define new dataset classes that inherit our base dataset class ``lavis.datasets.datasets.base_dataset``. This base dataset class already defines standard methods such as ``collater`` which uses the default collator from Pytorch.
.. code-block:: python
import json
from typing import Iterable
from torch.utils.data import Dataset, ConcatDataset
from torch.utils.data.dataloader import default_collate
class BaseDataset(Dataset):
def __init__(
self, vis_processor=None, text_processor=None, vis_root=None, ann_paths=[]
):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.annotation = []
for ann_path in ann_paths:
self.annotation.extend(json.load(open(ann_path, "r")))
self.vis_processor = vis_processor
self.text_processor = text_processor
self._add_instance_ids()
def __len__(self):
return len(self.annotation)
def collater(self, samples):
return default_collate(samples)
def set_processors(self, vis_processor, text_processor):
self.vis_processor = vis_processor
self.text_processor = text_processor
def _add_instance_ids(self, key="instance_id"):
for idx, ann in enumerate(self.annotation):
ann[key] = str(idx)
Any dataset subclass will inherit these methods, and it is optional to override them according to the specifications of the dataset.
We encourage users not to modify the base dataset class, as any modification will have cascading impacts on all other dataset classes that inherit from it.
Instead, users should create new dataset classes to cater to their specific requirements.
Dialogue Datasets ``lavis.datasets.datasets.dialogue_datasets``
======================================================================
For example, for the AVSD dataset, we want to define a new dataset subclass ``DialogueDataset`` for dialogue tasks. We can define this dataset class in ``lavis.datasets.datasets.dialogue_datasets`` as following:
.. code-block:: python
import os
from collections import OrderedDict
from lavis.datasets.datasets.base_dataset import BaseDataset
import json
import copy
class DialogueDataset(BaseDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_processor: visual processor applied to each video feature input
text_processor: text processor applied to each dialogue annotation
vis_root (string): Root directory of visual inputs (e.g. coco/images/)
ann_paths (list): paths to the annotation files
"""
self.vis_root = vis_root
self.annotation = []
for ann_path in ann_paths:
dialogs = json.load(open(ann_path, "r"))['dialogs']
for dialog in dialogs:
all_turns = dialog['dialog']
dialogue_context = []
for turn in all_turns:
dialog_instance = copy.deepcopy(dialog)
question = turn['question']
answer = turn['answer']
dialog_instance['dialog'] = copy.deepcopy(dialogue_context)
dialog_instance['question'] = question
dialog_instance['answer'] = answer
self.annotation.append(dialog_instance)
dialogue_context.append(turn)
self.vis_processor = vis_processor
self.text_processor = text_processor
self._add_instance_ids()
self.img_ids = {}
n = 0
for ann in self.annotation:
img_id = ann["image_id"]
if img_id not in self.img_ids.keys():
self.img_ids[img_id] = n
n += 1
Class inheritance allows us to define multiple subclasses. For instance, we may want another dialogue dataset class that is defined only for the test split. We can define another dataset class ``DialogueEvalDataset``, constructed similarly to the one above, except that the annotations are processed differently.
Typically, in dialogue tasks at test time, only a single test sample is constructed per dialogue (rather than decomposing all dialogue turns into samples, as at training time).
The dataset class can then be defined as:
.. code-block:: python
class DialogueEvalDataset(BaseDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
# ...
# defined similarly as DialogueDataset above
# except for the loading of dialogue annotation data
self.annotation = []
for ann_path in ann_paths:
dialogs = json.load(open(ann_path, "r"))['dialogs']
for dialog in dialogs:
all_turns = dialog['dialog']
dialogue_context = all_turns[:-1]
last_turn = all_turns[-1]
question = last_turn['question']
answer = last_turn['answer']
dialog['dialog'] = dialogue_context
dialog['question'] = question
dialog['answer'] = answer
self.annotation.append(dialog)
Using class inheritance to define datasets also allows us to develop more fine-grained class implementations, each of which is specifically designated for a benchmark.
For instance, under the dialogue-based tasks, we can further define another dataset subclass that is specified for the AVSD dataset.
We can define a new class ``AVSDDialDataset`` that further specifies how to load individual samples and collate them according to specific requirements:
.. code-block:: python
import os
from lavis.datasets.datasets.base_dataset import BaseDataset
from lavis.datasets.datasets.dialogue_datasets import DialogueDataset, DialogueEvalDataset
import torch
class AVSDDialDataset(DialogueDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
def __getitem__(self, index):
ann = self.annotation[index]
vname = ann["image_id"]
video = self.vis_processor(self.vis_root, vname)
dialogue = self.text_processor(ann)
return {
"video_fts": video['video_fts'],
"video_token_type_ids": video['token_type_ids'],
"input_ids": dialogue['input_ids'],
"token_type_ids": dialogue['token_type_ids'],
"labels": dialogue['labels'],
"image_id": ann["image_id"],
"instance_id": ann["instance_id"]
}
def collater(self, samples):
input_ids, token_type_ids, labels, video_fts, video_token_type_ids = [], [], [], [], []
for i in samples:
input_ids.append(i['input_ids'])
token_type_ids.append(i['token_type_ids'])
labels.append(i['labels'])
video_fts.append(i['video_fts'])
video_token_type_ids.append(i['video_token_type_ids'])
input_ids = self.text_processor.padding(input_ids)
labels = self.text_processor.padding(labels, -1)
video_fts = self.vis_processor.padding(video_fts)
token_type_ids = self.text_processor.padding(token_type_ids)
video_token_type_ids = self.text_processor.padding(video_token_type_ids)
token_type_ids = torch.cat([video_token_type_ids, token_type_ids], dim=1)
attn_mask = self.text_processor.get_attention_mask(input_ids)
video_mask = self.vis_processor.get_attention_mask(video_fts)
attn_mask = torch.cat([video_mask, attn_mask], dim=1)
        video_labels = torch.ones((video_fts.size(0), video_fts.size(1))).long() * -1  # ignore token index -1 by default
labels = torch.cat([video_labels, labels], dim=1)
samples = {}
samples['input_ids'] = input_ids
samples['token_type_ids'] = token_type_ids
samples['labels'] = labels
samples['video_fts'] = video_fts
samples['attn_mask'] = attn_mask
return samples
Note that in a dataset subclass, if methods such as ``__getitem__`` and ``collater`` are not defined, the same functions from the corresponding superclass will be used.
For instance, by default, we always use the collater from the ``BaseDataset`` class to collate data samples.
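To illustrate what the default collator does, ``default_collate`` stacks the per-sample dictionaries field by field; a minimal sketch:
.. code-block:: python

    import torch
    from torch.utils.data.dataloader import default_collate

    # two samples as they would be returned by a dataset's __getitem__
    samples = [
        {"image": torch.zeros(3, 224, 224), "instance_id": "0"},
        {"image": torch.ones(3, 224, 224), "instance_id": "1"},
    ]

    batch = default_collate(samples)
    print(batch["image"].shape)  # torch.Size([2, 3, 224, 224])
    print(batch["instance_id"])  # ['0', '1']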
Dataset Builder ``lavis.datasets.builders``
**************************************************************
Dataset Builder is the data processing module that controls the dataset classes (by training or evaluation split) and associates the specific dataset configurations to these dataset classes.
Base Dataset Builder ``lavis.datasets.builders.base_dataset_builder``
======================================================================
Note that any new builder class definition should inherit the base dataset builder class ``lavis.datasets.builders.base_dataset_builder``:
.. code-block:: python
class BaseDatasetBuilder:
train_dataset_cls, eval_dataset_cls = None, None
...
This allows us to standardize the operations of dataset builders across all builder classes. We advise the users to carefully review the standard methods defined in the base builder class, including methods such as ``_download_data`` and ``build_datasets`` that will download the data and create instances of dataset classes:
.. code-block:: python
class BaseDatasetBuilder:
...
def build_datasets(self):
# download, split, etc...
# only called on 1 GPU/TPU in distributed
if is_main_process():
self._download_data()
if is_dist_avail_and_initialized():
dist.barrier()
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
datasets = self.build() # dataset['train'/'val'/'test']
return datasets
def _download_data(self):
self._download_ann()
self._download_vis()
We encourage users not to modify the implementation of the base dataset builder class as this will affect all existing dataset builder subclasses.
Dialogue Dataset Builder ``lavis.datasets.builders.dialogue_builder``
======================================================================
We can define any new builder subclass and associate this builder with the corresponding dataset classes and dataset configurations.
For instance, for the AVSD dataset, we can define a builder ``lavis.datasets.builders.dialogue_builder`` for dialogue-based datasets as follows:
.. code-block:: python
from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from lavis.datasets.datasets.avsd_dialogue_datasets import (
AVSDDialDataset,
AVSDDialEvalDataset
)
from lavis.common.registry import registry
@registry.register_builder("avsd_dialogue")
class AVSDDialBuilder(BaseDatasetBuilder):
train_dataset_cls = AVSDDialDataset
eval_dataset_cls = AVSDDialEvalDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/avsd/defaults_dial.yaml"
}
Note that we chose to separately define the parameters ``train_dataset_cls`` and ``eval_dataset_cls`` to consider cases where data is processed differently between training and test time.
For instance, in captioning tasks, during test time, each data sample often includes multiple ground-truth captions rather than just a single ground-truth during training time.
If the data processing is the same in both training and test time, the two parameters can be linked to the same dataset class.
Finally, define ``DATASET_CONFIG_DICT`` to associate the dataset configurations to the assigned dataset classes.
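As a minimal sketch of that case, a hypothetical builder (the registry name ``toy_caption`` and the config path are assumptions for illustration) can point both parameters at one dataset class, e.g. the ``ToyCaptionDataset`` sketched earlier:
.. code-block:: python

    from lavis.common.registry import registry
    from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder


    @registry.register_builder("toy_caption")
    class ToyCaptionBuilder(BaseDatasetBuilder):
        # training and evaluation data are processed identically,
        # so both splits share the hypothetical dataset class from earlier
        train_dataset_cls = ToyCaptionDataset
        eval_dataset_cls = ToyCaptionDataset

        DATASET_CONFIG_DICT = {
            "default": "configs/datasets/toy/defaults_cap.yaml"  # hypothetical path
        }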
Registering Builder ``lavis.datasets.builders.__init__``
======================================================================
To add a new builder class, make sure to first include the class within ``__init__.py``. For instance, to define a new builder for the AVSD dataset:
.. code-block:: python
from lavis.datasets.builders.dialogue_builder import (
AVSDDialBuilder
)
__all__ = [
...,
"AVSDDialBuilder"
]
Assigning Builder
======================================================================
Note that during data loading and processing, the assigned builder must be registered under the correct name so that it can be loaded properly.
For instance, the following should be specified in a configuration file e.g. ``dialogue_avsd_ft.yaml``:
.. code-block:: yaml
datasets:
avsd_dialogue: # name of the dataset builder
...
# processor configuration
...
Subsequently, any processes (e.g. training) should load this configuration file to assign the correct builder which will then associate the correct dataset classes to construct data samples.
.. code-block:: sh
python train.py --cfg-path dialogue_avsd_ft.yaml
================================================
FILE: docs/tutorial.evaluation.rst
================================================
Evaluating Pre-trained Models on Task Datasets
###############################################
LAVIS provides pre-trained and finetuned models for off-the-shelf evaluation on task datasets.
Let's now see an example to evaluate BLIP model on the captioning task, using MSCOCO dataset.
.. _prep coco:
Preparing Datasets
******************
First, let's download the dataset. LAVIS provides `automatic downloading scripts` to help prepare
most of the public datasets; to download the MSCOCO dataset, simply run
.. code-block:: bash
cd lavis/datasets/download_scripts && python download_coco.py
This will put the downloaded dataset at a default cache location ``cache`` used by LAVIS.
If you want to use a different cache location, you can specify it by updating ``cache_root`` in ``lavis/configs/default.yaml``.
If you have a local copy of the dataset, it is recommended to create a symlink from the cache location to the local copy, e.g.
.. code-block:: bash
ln -s /path/to/local/coco cache/coco
Evaluating pre-trained models
******************************
To evaluate a pre-trained model, simply run
.. code-block:: bash
bash run_scripts/lavis/blip/eval/eval_coco_cap.sh
Or to evaluate a large model:
.. code-block:: bash
bash run_scripts/lavis/blip/eval/eval_coco_cap_large.sh
================================================
FILE: docs/tutorial.models.rst
================================================
Adding Models
####################################
This is a tutorial on adding new models using ``lavis.models`` module.
The LAVIS library includes a standard model module that builds the foundation for many major language-vision models such as `ALBEF <https://arxiv.org/pdf/2107.07651.pdf>`_,
`BLIP <https://arxiv.org/pdf/2201.12086.pdf>`_, `ALPRO <https://arxiv.org/pdf/2112.09583.pdf>`_, and `CLIP <https://arxiv.org/pdf/2103.00020.pdf>`_.
The ``lavis.models`` module is designed such that any new models can be added and integrated into the LAVIS library, with minimal steps to develop training and testing procedures.
In this tutorial, we will replicate the steps to add a GPT-style model specifically for `video-grounded dialogue tasks <https://arxiv.org/pdf/1901.09107.pdf>`_.
Base Model ``lavis.models.base_model``
**************************************************************
Note that any new model definition should inherit the base model class ``BaseModel``:
.. code-block:: python
from omegaconf import OmegaConf
import numpy as np
import torch
import torch.nn as nn
from lavis.common.utils import get_abs_path
class BaseModel(nn.Module):
"""Base class for models."""
def __init__(self):
super().__init__()
def forward_features(self, *args, **kwargs):
"""Similar to *forward* but only return features."""
raise NotImplementedError
def load_from_pretrained(self, url_or_filename):
raise NotImplementedError
@classmethod
def _from_config(cls, cfg=None, model_type="base"):
if not cfg:
# useful when building model without a provided configuration file
cfg = OmegaConf.load(cls.default_config_path(model_type)).model
return cls.from_config(cfg)
@classmethod
def from_pretrained(cls, model_type="base"):
"""
Build a pretrained model from the default configuration file, specified by model_type.
"""
return cls._from_config(cfg=None, model_type=model_type)
@property
def device(self):
return list(self.parameters())[0].device
@classmethod
def default_config_path(cls, model_type="base"):
assert (
model_type in cls.PRETRAINED_MODEL_CONFIG_DICT
), "Unknown model type {}".format(model_type)
return get_abs_path(cls.PRETRAINED_MODEL_CONFIG_DICT[model_type])
def before_evaluation(self, **kwargs):
pass
def show_n_params(self, return_str=True):
tot = 0
for p in self.parameters():
w = 1
for x in p.shape:
w *= x
tot += w
if return_str:
if tot >= 1e6:
return "{:.1f}M".format(tot / 1e6)
else:
return "{:.1f}K".format(tot / 1e3)
else:
return tot
In this base model, we already declare and standardize many common methods such as ``_from_config`` and ``from_pretrained``.
Inheriting this base model class allows us to standardize operations of models across all model classes while still allowing customizations.
We advise users not to change the implementation of the base model class as this will affect all existing model subclasses.
GPT-style Video-grounded Dialogue Model ``lavis.models.gpt_models.gpt_dialogue``
********************************************************************************
In this step, we can define a new model class, e.g. under ``lavis.models.gpt_models.gpt_dialogue``, for GPT-based dialogue models designed specifically for video-grounded dialogues.
Note that we assume the model class inherits from the standard model super class ``GPT2LMHeadModel`` from the ``transformers`` `library <https://huggingface.co/docs/transformers/index>`_.
We also enforce integration into the LAVIS framework through the inheritance of ``BaseModel`` from the LAVIS library as the secondary superclass.
.. code-block:: python
import torch
from lavis.common.registry import registry
from lavis.models.base_model import BaseModel
from transformers import GPT2Model, GPT2LMHeadModel
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
import math
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
@registry.register_model("gpt_dialogue")
class GPTDialogue(GPT2LMHeadModel, BaseModel):
...
Next, we can modify the architecture of the model during model initialization to fit the tasks of interest, i.e. video-grounded dialogues.
In this case, we want to add additional model parameters for a linear network to transform the video feature representations to the model dimension.
.. code-block:: python
class GPTDialogue(GPT2LMHeadModel, BaseModel):
def __init__(self, config, len_video_ft=4224):
super().__init__(config)
self.video_ff = nn.Linear(len_video_ft, config.n_embd)
# Model parallel
self.model_parallel = False
self.device_map = None
# Initialize weights and apply final processing
self.post_init()
Note that for each new model class, we advise redefining the ``from_config`` method which is inherited from the ``BaseModel`` class.
As each model usually has its own unique configurations, redefining the method will ensure the model instances are created properly.
For instance, ``GPTDialogue`` requires an additional parameter of video feature length (``len_video_ft``) which should be part of the model initialization procedure.
Another additional parameter is the number of tokens/words (as we include additional special tokens in the vocabulary for dialogue tasks).
.. code-block:: python
class GPTDialogue(GPT2LMHeadModel, BaseModel):
...
@classmethod
def from_config(cls, cfg):
model = cls.from_pretrained('gpt2', len_video_ft=cfg['len_video_ft'])
model.resize_token_embeddings(cfg['len_tokenizer'])
return model
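With the class registered as ``gpt_dialogue``, a model instance can then be built from a configuration through the registry; a minimal sketch (the configuration values follow ``gpt_dialogue_base.yaml``, shown later in this tutorial):
.. code-block:: python

    from omegaconf import OmegaConf

    from lavis.common.registry import registry

    cfg = OmegaConf.create({"len_tokenizer": 50264, "len_video_ft": 4224})

    # look up the class registered under "gpt_dialogue" and build it from the config
    model_cls = registry.get_model_class("gpt_dialogue")
    model = model_cls.from_config(cfg)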
Other basic methods should also be defined explicitly in the new model class, including the ``forward`` function.
For instance, in GPT models for video-grounded dialogue tasks, we want the forward operation to also include the transformation and integration of video features before passing the representations to the Transformer layers.
.. code-block:: python
class GPTDialogue(GPT2LMHeadModel, BaseModel):
...
def forward(self, samples,
past_key_values=None,
position_ids=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None):
input_embs = self.transformer.wte(samples['input_ids'])
video_embs = self.video_ff(samples['video_fts'])
input_embs = torch.cat([video_embs, input_embs], dim=1)
transformer_outputs = self.transformer(
attention_mask=samples['attn_mask'],
token_type_ids=samples['token_type_ids'],
inputs_embeds=input_embs,
position_ids=position_ids,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
...
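The elided remainder of ``forward`` typically computes the language-modeling loss from ``lm_logits``; a sketch of one way to do this, assuming the ``labels`` produced by the processors above (where ``-1`` marks positions to ignore):
.. code-block:: python

            # sketch of the continuation of forward()
            loss = None
            if samples.get("labels") is not None:
                # shift so that tokens < n predict token n
                shift_logits = lm_logits[..., :-1, :].contiguous()
                shift_labels = samples["labels"][..., 1:].contiguous()
                loss_fct = CrossEntropyLoss(ignore_index=-1)
                loss = loss_fct(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1),
                )

            return CausalLMOutputWithCrossAttentions(
                loss=loss,
                logits=lm_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            )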
Registering New Model ``lavis.models.__init__``
********************************************************************************
Any new model must be officially registered as part of the ``lavis.models`` module.
For instance, to add a model class for GPT-based dialogue models, we can modify the ``__init__.py`` as follows:
.. code-block:: python
from lavis.models.gpt_models.gpt_dialogue import GPTDialogue
__all__ = [
...
"GPTDialogue"
]
Assigning Model
********************************************************************************
From the above example of a model class, note that we define a ``from_config`` method for the new model class.
This method will process a configuration file and pass specific parameters to initialize the model classes properly.
To do this, we assign/associate the correct registry name of the model class in a configuration file.
For instance, the following should be specified in a configuration file e.g. ``dialogue_avsd_ft.yaml``:
.. code-block:: yaml
model:
arch: gpt_dialogue # name of the model
model_type: base
Subsequently, any processes (e.g. training) should load this configuration file to assign the correct model.
.. code-block:: sh
python train.py --cfg-path dialogue_avsd_ft.yaml
Note that to simplify the model configuration, we only enable two main parameters here: ``arch`` and ``model_type``. ``arch`` refers to the model class registry, and ``model_type`` is the corresponding model type under this model family.
For instance, with ``gpt_dialogue``, we have a model ``base`` which has its own configuration in a separate configuration file e.g. ``gpt_dialogue_base.yaml``:
.. code-block:: yaml
model:
arch: gpt_dialogue
len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens
len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128
We can then load this configuration and pass its parameters to the above ``from_config`` method to initialize the model accordingly.
We advise the users to maintain, in the model class definition, a dictionary that contains default paths to model configurations.
By default, the LAVIS framework will search for configurations from each model class defined as ``model.PRETRAINED_MODEL_CONFIG_DICT``.
.. code-block:: python
class GPTDialogue(GPT2LMHeadModel, BaseModel):
PRETRAINED_MODEL_CONFIG_DICT = {
"base": "configs/models/gpt_dialogue_base.yaml"
}
...
================================================
FILE: docs/tutorial.processors.rst
================================================
Adding Processors
################################################
This is a tutorial on adding new processors using ``lavis.processors`` module.
The LAVIS library includes a standard processor module that preprocesses data e.g. image transformation and sequence concatenation.
The ``lavis.processors`` module is designed such that any processors can be added, tailored specifically to the requirements of the corresponding models of interest.
In this tutorial, we will replicate the steps to add visual and textual processors specifically for `video-grounded dialogue tasks <https://arxiv.org/pdf/1901.09107.pdf>`_.
In addition, we also want the processors to include processing steps that make the data samples compatible with GPT-style models.
Base Processor ``lavis.processors.base_processors``
*****************************************************
Note that any new processor definition should inherit the base processor class ``BaseProcessor``:
.. code-block:: python
from omegaconf import OmegaConf
class BaseProcessor:
def __init__(self):
self.transform = lambda x: x
return
def __call__(self, item):
return self.transform(item)
@classmethod
def from_config(cls, cfg=None):
return cls()
def build(self, **kwargs):
cfg = OmegaConf.create(kwargs)
return self.from_config(cfg)
This allows us to standardize operations of processors across all processor classes while still allowing customization of processors specifically to data and model types.
We encourage users not to modify the implementation of the base processor class as this will have an impact on all existing processor subclasses.
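As a minimal sketch, a hypothetical lowercasing text processor (the name and behavior are purely illustrative) only needs to override ``__call__`` and, if it takes configuration, ``from_config``:
.. code-block:: python

    from lavis.common.registry import registry
    from lavis.processors.base_processor import BaseProcessor


    @registry.register_processor("toy_lowercase")
    class ToyLowercaseProcessor(BaseProcessor):
        """A hypothetical processor that lowercases and strips text."""

        def __call__(self, item):
            return item.lower().strip()

        @classmethod
        def from_config(cls, cfg=None):
            # no configurable options in this toy example
            return cls()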
GPT-style Processors ``lavis.processors.gpt_processors``
**************************************************************
In this step, we can define new processor classes, e.g. under ``lavis.processors.gpt_processors``, for GPT models designed specifically for video-grounded dialogues.
First, we want to process video features by defining ``GPTVideoFeatureProcessor`` class.
In this tutorial, we assume video features are extracted beforehand and this processor simply loads the features from ``npy`` files.
Other methods that are specifically defined are ``padding`` (which is used by dataset instances to pad multiple video samples) and ``get_attention_mask`` (which creates an attention mask for Transformer attention in GPT models).
.. code-block:: python
SPECIAL_TOKENS_DICT = {'bos_token': "<bos>", 'eos_token': "<eos>", 'additional_special_tokens': ["<speaker1>", "<speaker2>", "<video>", "<cap>"], 'pad_token': "<pad>"}
...
@registry.register_processor("gpt_video_ft")
class GPTVideoFeatureProcessor(BaseProcessor):
def __init__(self, visual_ft, audio_ft):
self.visual_ft = visual_ft
self.audio_ft = audio_ft
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
self.tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT)
def padding(self, seq):
padded_seq = torch.nn.utils.rnn.pad_sequence(seq, batch_first=True, padding_value=1.0)
return padded_seq
def get_attention_mask(self, seq):
return torch.sum(seq != 1, dim=2) != 0
def __call__(self, ft_root, vname):
all_ft = []
for ft_name in self.visual_ft:
ft_path = os.path.join(ft_root, ft_name, vname)
all_ft.append(np.load(ft_path + '.npy'))
for ft_name in self.audio_ft:
ft_path = os.path.join(ft_root, ft_name, vname)
all_ft.append(np.load(ft_path + '.npy'))
min_len = min([len(ft) for ft in all_ft])
sampled_ft = [ft[:min_len] for ft in all_ft]
sampled_ft = np.concatenate(sampled_ft, axis=1)
item = {}
item['video_fts'] = torch.Tensor(sampled_ft)
video_type_token = self.tokenizer.convert_tokens_to_ids('<video>')
item['token_type_ids'] = torch.Tensor([video_type_token] * len(sampled_ft)).long()
return item
@classmethod
def from_config(cls, cfg=None):
if cfg is None:
cfg = OmegaConf.create()
visual_ft = cfg.get("visual_ft", ["i3d_rgb"])
audio_ft = cfg.get("audio_ft", ["vggish"])
return cls(
visual_ft=visual_ft,
audio_ft=audio_ft
)
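A sketch of how a dataset's ``collater`` might use the two helper methods above (random tensors stand in for real features; instantiating the processor fetches the GPT-2 tokenizer):
.. code-block:: python

    import torch

    # two clips with different numbers of feature frames (toy values)
    ft_a = torch.randn(5, 4224)
    ft_b = torch.randn(3, 4224)

    proc = GPTVideoFeatureProcessor(visual_ft=["i3d_rgb"], audio_ft=["vggish"])

    batch = proc.padding([ft_a, ft_b])     # -> (2, 5, 4224); short clip padded with 1.0
    mask = proc.get_attention_mask(batch)  # -> (2, 5) bool; False on padded frames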
Another useful processor class is one that processes dialogue data. Here we can define a ``GPTDialogueProcessor`` class.
This processor class receives raw annotations and constructs inputs as a concatenation of input sequences (questions, dialogue contexts, and responses) to facilitate application in GPT models.
Other methods that are specifically defined are ``padding`` (which is used by dataset instances to pad multiple sequence samples) and ``get_attention_mask`` (which creates an attention mask for Transformer attention in GPT models).
.. code-block:: python
SPECIAL_TOKENS_DICT = {'bos_token': "<bos>", 'eos_token': "<eos>", 'additional_special_tokens': ["<speaker1>", "<speaker2>", "<video>", "<cap>"], 'pad_token': "<pad>"}
...
@registry.register_processor("gpt_dialogue")
class GPTDialogueProcessor(BaseProcessor):
def __init__(self, max_turns=3, use_caption=True):
self.max_turns = max_turns
self.use_caption = use_caption
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
self.tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT)
def sample_sequence(self, caption, history, answer):
bos, eos, speaker1, speaker2, cap = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-2])
instance = {}
sequence = [caption] + history + [answer]
sequence = [s + [eos] for s in sequence]
instance["input_ids"] = list(chain(*sequence))
instance["token_type_ids"] = [cap] * len(sequence[0]) + [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence[1:]) for _ in s]
instance["labels"] = ([-1]*sum(len(s) for s in sequence[:-1])) + sequence[-1]
assert len(instance["input_ids"])==len(instance["token_type_ids"])
assert len(instance["token_type_ids"])==len(instance["labels"])
for k,v in instance.items():
instance[k] = torch.Tensor(v).long()
return instance
def padding(self, seq, pad_token=-1):
if pad_token==-1: pad_token = self.tokenizer.pad_token_id
padded_seq = torch.nn.utils.rnn.pad_sequence(seq, batch_first=True, padding_value=pad_token)
return padded_seq
def get_attention_mask(self, seq, pad_token=-1):
if pad_token==-1: pad_token = self.tokenizer.pad_token_id
return seq != pad_token
def __call__(self, ann):
if self.use_caption:
caption = ' '.join([ann['caption'], ann['summary']])
caption = self.tokenizer.encode(caption)
else:
caption = []
dial_history = []
for turn in ann['dialog'][-self.max_turns:]:
dial_history.append(turn['question'])
dial_history.append(turn['answer'])
dial_history.append(ann['question'])
dial_history = [self.tokenizer.encode(t) for t in dial_history]
answer = self.tokenizer.encode(ann['answer'])
item = self.sample_sequence(caption, dial_history, answer)
return item
@classmethod
def from_config(cls, cfg=None):
if cfg is None:
cfg = OmegaConf.create()
use_caption = cfg.get("use_caption", True)
max_turns = cfg.get("max_turns", 3)
return cls(max_turns=max_turns, use_caption=use_caption)
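A sketch of this processor applied to a toy AVSD-style annotation (the field names follow ``__call__`` above; the full module, including the elided ``SPECIAL_TOKENS`` list, is assumed to be available):
.. code-block:: python

    ann = {
        "caption": "a man is cooking in a kitchen",
        "summary": "a short cooking clip",
        "dialog": [
            {"question": "what is he doing?", "answer": "he is cooking"},
        ],
        "question": "where is he?",
        "answer": "in the kitchen",
    }

    processor = GPTDialogueProcessor(max_turns=3, use_caption=True)
    item = processor(ann)

    # input_ids, token_type_ids, and labels are aligned 1-D LongTensors
    print(item["input_ids"].shape, item["labels"].shape)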
Registering New Processors ``lavis.processors.__init__``
**************************************************************
Finally, any new processor must be officially registered as part of the ``lavis.processors`` module.
For instance, to add processor classes for GPT-based dialogue models, including one for dialogue data ``GPTDialogueProcessor`` and one for video features ``GPTVideoFeatureProcessor``, we can modify the ``__init__.py`` as follows:
.. code-block:: python
from lavis.processors.gpt_processors import (
GPTVideoFeatureProcessor,
GPTDialogueProcessor,
)
__all__ = [
...
# GPT
"GPTVideoFeatureProcessor",
"GPTDialogueProcessor"
]
Assigning Processors
**************************************************************
From the above example of processor classes, note that we define a ``from_config`` method for each class.
This method will process a configuration file and pass specific parameters, e.g. ``max_turns`` and ``visual_ft``, to initialize the processor classes properly.
To do this, we assign/associate the correct registry names of the processor classes in a configuration file.
For instance, the following should be specified in a configuration file e.g. ``dialogue_avsd_ft.yaml``:
.. code-block:: yaml
datasets:
avsd_dialogue: # name of the dataset builder
vis_processor:
train:
name: "gpt_video_ft" # name of the visual processor for training data
visual_ft: ["i3d_flow", "i3d_rgb"]
audio_ft: ["vggish"]
eval:
name: "gpt_video_ft" # name of the visual processor for evaluation data
visual_ft: ["i3d_flow", "i3d_rgb"]
audio_ft: ["vggish"]
text_processor:
train:
name: "gpt_dialogue" # name of the textual processor for training data
max_turns: 3
use_caption: True
eval:
name: "gpt_dialogue" # name of the textual processor for evaluation data
max_turns: 3
use_caption: True
Subsequently, any processes (e.g. training) should load this configuration file to assign the correct processors.
.. code-block:: sh
python train.py --cfg-path dialogue_avsd_ft.yaml
================================================
FILE: docs/tutorial.rst
================================================
Tutorials
==============================
.. toctree::
:maxdepth: 1
tutorial.evaluation
tutorial.training-example
tutorial.configs
tutorial.datasets
tutorial.processors
tutorial.models
tutorial.tasks
================================================
FILE: docs/tutorial.tasks.rst
================================================
Adding Tasks
####################################
This is a tutorial on adding new machine learning tasks using ``lavis.tasks`` module.
The LAVIS library includes a standard task module that centralizes the model training and evaluation procedure of machine learning tasks.
The ``lavis.tasks`` module is designed such that any new tasks can be added and integrated, catering to any customization in the training and testing procedures.
In this tutorial, we will replicate the steps to add a new task into LAVIS for the `video-grounded dialogue tasks <https://arxiv.org/pdf/1901.09107.pdf>`_.
Base Task ``lavis.tasks.base_task``
********************************************************************************
Note that any new task definition should inherit the base task class ``BaseTask``:
.. code-block:: python
import logging
import os
import torch.distributed as dist
from lavis.common.dist_utils import get_rank, get_world_size, is_main_process
from lavis.common.logger import MetricLogger, SmoothedValue
from lavis.common.registry import registry
from lavis.datasets.data_utils import prepare_sample
class BaseTask:
def __init__(self, **kwargs):
super().__init__()
self.inst_id_key = "instance_id"
@classmethod
def setup_task(cls, **kwargs):
return cls()
def build_model(self, cfg):
model_config = cfg.model_cfg
model_cls = registry.get_model_class(model_config.arch)
return model_cls.from_config(model_config)
def build_datasets(self, cfg):
"""
Build a dictionary of datasets, keyed by split 'train', 'valid', 'test'.
        Download datasets and annotations automatically if they do not exist.
Args:
            cfg (common.config.Config): the configuration object used to build the datasets.
Returns:
dict: Dictionary of torch.utils.data.Dataset objects by split.
"""
datasets = dict()
datasets_config = cfg.datasets_cfg
assert len(datasets_config) > 0, "At least one dataset has to be specified."
for name in datasets_config:
dataset_config = datasets_config[name]
builder = registry.get_builder_class(name)(dataset_config)
dataset = builder.build_datasets()
datasets[name] = dataset
return datasets
def train_step(self, model, samples):
loss = model(samples)["loss"]
return loss
...
In this base task, we already declare and standardize many common methods such as ``train_step``, ``build_model``, and ``build_datasets``.
Inheriting this base task class allows us to standardize operations of tasks across all task classes.
We recommend users not change the implementation of the base task class as this will have an impact on all existing task subclasses.
Dialogue Task ``lavis.tasks.dialogue``
********************************************************************************
In this step, we can define a new task class, e.g. under ``lavis.tasks.dialogue``, for video-grounded dialogues.
For instance, we define a new task class ``DialogueTask`` that inherits the super task class ``BaseTask``.
.. code-block:: python
import json
import os
from lavis.common.dist_utils import main_process
from lavis.common.logger import MetricLogger
from lavis.common.registry import registry
from lavis.tasks.base_task import BaseTask
from lavis.datasets.data_utils import prepare_sample
import numpy as np
@registry.register_task("dialogue")
class DialogueTask(BaseTask):
def __init__(self, num_beams, max_len, min_len, evaluate, report_metric=True):
super().__init__()
self.num_beams = num_beams
self.max_len = max_len
self.min_len = min_len
self.evaluate = evaluate
self.report_metric = report_metric
@classmethod
def setup_task(cls, cfg):
run_cfg = cfg.run_cfg
num_beams = run_cfg.num_beams
max_len = run_cfg.max_len
min_len = run_cfg.min_len
evaluate = run_cfg.evaluate
report_metric = run_cfg.get("report_metric", True)
return cls(
num_beams=num_beams,
max_len=max_len,
min_len=min_len,
evaluate=evaluate,
report_metric=report_metric,
)
def valid_step(self, model, samples):
results = []
loss = model(samples)["loss"].item()
return [loss]
...
Note that for any new task, we advise the users to carefully review the functions implemented within ``BaseTask`` and consider which methods should be modified.
For instance, the base task class already contains a standard implementation of model training steps that are common across machine learning tasks.
Two major methods we want to emphasize, which should be customized by each task, are ``valid_step`` and ``evaluation``.
These operations are not fully implemented in the base task class due to the differences in evaluation procedures among machine learning tasks.
Another method that should be considered is the ``setup_task`` method.
This method will receive configurations that set task-specific parameters to initialize any task instance.
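As a minimal sketch of such a customization (not the full implementation), an ``evaluation`` override for the dialogue task could iterate over the data loader and aggregate the per-batch results from ``valid_step``:
.. code-block:: python

    from lavis.datasets.data_utils import prepare_sample


    class DialogueTask(BaseTask):
        ...

        def evaluation(self, model, data_loader, cuda_enabled=True):
            # aggregate the per-batch losses returned by valid_step above
            results = []
            for samples in data_loader:
                samples = prepare_sample(samples, cuda_enabled=cuda_enabled)
                results.extend(self.valid_step(model=model, samples=samples))
            return results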
Registering New Task ``lavis.tasks.__init__``
********************************************************************************
Any new task must be officially registered as part of the ``lavis.tasks`` module. For instance, to add a new task for video-grounded dialogues, we can modify the ``__init__.py`` as follows:
.. code-block:: python
from lavis.tasks.dialogue import DialogueTask
...
__all__ = [
...
"DialogueTask"
]
Assigning Task
***************
From the above example of a task class, note that we define a ``setup_task`` method for each task class.
This method will process a configuration file and pass specific parameters, e.g. ``num_beams`` (for beam search in generative tasks during the inference stage), to initialize the task classes properly.
To assign and associate any task, we need to specify the correct registry of task classes in a configuration file.
For instance, the following should be specified in a configuration file e.g. ``dialogue_avsd_ft.yaml``:
.. code-block:: yaml
run:
task: dialogue # name of the task
# optimizer
...
max_len: 20
min_len: 5
num_beams: 3
...
Subsequently, any processes (e.g. training) should load this configuration file to assign the correct task.
.. code-block:: sh
python train.py --cfg-path dialogue_avsd_ft.yaml
================================================
FILE: docs/tutorial.training-example.rst
================================================
Example on Finetuning BLIP on COCO-Captioning
################################################
To finetune BLIP model on the coco caption dataset, first refer to :ref:`prep coco` to prepare the dataset if you have not done so.
To finetune the model, we have prepared a run script for you, which can run as follows:
.. code-block:: bash
bash run_scripts/lavis/blip/train/train_caption_coco_large.sh
This will finetune the pre-trained BLIP large model into a new model that can be used for captioning.
Deep Dive
**********
Now let's take a closer look at the script and see what it does.
.. code-block:: bash
python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/caption_coco_large_ft.yaml
As can be seen, the script simply calls the :code:`train.py` with PyTorch distributed training enabled.
The :code:`--cfg-path` argument specifies the **runtime config** file to use. The config file is a YAML file that specifies the training parameters, shown as follows:
.. literalinclude:: ../lavis/projects/blip/train/caption_coco_large_ft.yaml
:language: yaml
:linenos:
The runtime config file is divided into 3 sections:
- :code:`model`: specifies the model architecture and type to use.
- :code:`data`: specifies the dataset to use.
- :code:`run`: specifies the runner arguments, such as tasks, optimizer, learning rate scheduler, etc.
We describe each section in detail below.
Model configurations
=====================
.. literalinclude:: ../lavis/projects/blip/train/caption_coco_large_ft.yaml
:language: yaml
:linenos:
:lines: 6-10
The :code:`arch` argument specifies the model architecture to use. In this case, we use the :code:`blip_caption` architecture.
You can find available architectures by inspecting the :code:`model_zoo`.
Once the architecture is specified, the runner will look for the model class registered with the name and try to instantiate a model instance.
In this case :code:`BlipCaption` is the model registered with the name :code:`blip_caption`.
The registry maintains a mapping from the name string to the model class.
This allows the runner to find the model class dynamically based on the name string from the config file.
The following segment in :code:`lavis/models/blip_models/blip_caption.py` shows how :code:`BlipCaption` is registered with the name string :code:`blip_caption`:
.. literalinclude:: ../lavis/models/blip_models/blip_caption.py
:language: python
:linenos:
:lines: 20-38
The same model architecture may be pre-trained or finetuned on different datasets, or have different model configurations.
For example, :code:`BlipCaption` has:
- :code:`base_coco`: pre-trained base BLIP model adapted for COCO captioning finetuning.
- :code:`large_coco`: pre-trained large BLIP model adapted for COCO captioning finetuning.
Therefore, we also need to specify :code:`model_type`. Here we use :code:`large_coco`.
We set :code:`load_finetuned` to :code:`False` to indicate that we are finetuning the model from the pre-trained weights.
If :code:`load_finetuned` is set to :code:`True`, as it is by default, the model will instead load weights already finetuned on COCO captioning.
Given the model architecture and type, the library will then look for the default model config for :code:`large_coco` in :code:`lavis/models/blip_models/blip_caption.py`.
As can be seen in the above code snippet, the corresponding config path is stored in :code:`BlipCaption.PRETRAINED_MODEL_CONFIG_DICT`.
Then the library will load :code:`lavis/configs/models/blip_caption_large_coco.yaml` as the configuration to build the model.
*Priority of Configs*: Note that the priority of the run config is higher than the default model config, meaning that arguments in the run config will override the default model config.
For example, in the default model config, :code:`load_finetuned` is set to :code:`True` by default, while in the run config we set it to :code:`False`, so that finetuning starts from the pre-trained weights only.
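This priority comes from the merge order of the configs (later arguments to the merge win); a minimal sketch with :code:`OmegaConf`:
.. code-block:: python

    from omegaconf import OmegaConf

    default_model_cfg = OmegaConf.create({"model": {"load_finetuned": True}})
    run_cfg = OmegaConf.create({"model": {"load_finetuned": False}})

    # the run config is merged last, so its values override the default
    merged = OmegaConf.merge(default_model_cfg, run_cfg)
    print(merged.model.load_finetuned)  # False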
Dataset configurations
=========================
The second section of the config file specifies the dataset(s) to use.
.. literalinclude:: ../lavis/projects/blip/train/caption_coco_large_ft.yaml
:language: yaml
:linenos:
:lines: 12-24
We associate each dataset with a :code:`vis_processor` and a :code:`text_processor`, responsible for processing the visual and textual input respectively.
Here we again use the registry mechanism to dynamically load the processor class based on the name string.
For example, :code:`blip_image_train` is the name string for the :code:`BlipImageTrainProcessor` class, which is registered in :code:`lavis/processors/blip_processors.py`.
Similarly, the dataset name string is also registered in the registry, pointing to a dataset builder :code:`COCOCapBuilder` class.
By default, the builder will load the default dataset configuration as in :code:`DATASET_CONFIG_DICT`. You may also add new dataset types by adding new entries to the dictionary.
The dataset configuration used here is:
.. literalinclude:: ../lavis/configs/datasets/coco/defaults_cap.yaml
:language: yaml
:linenos:
:lines: 6-28
In this configuration file, we specify the dataset name and mainly its building information.
The build information is divided into two parts: :code:`annotation` and :code:`images`. The annotation files will be automatically downloaded upon loading the dataset for the first time.
The :code:`images` part specifies the image root directory. This is a relative path to the cache directory, which is :code:`cache` by default. If you have a local copy of the dataset, you can specify the path to the local copy by
overwriting the :code:`images` part in the runtime config file. For example, you may alter the run config as below to use your local dataset copy:
.. code:: yaml
datasets:
coco_caption: # name of the dataset builder
vis_processor:
train:
name: "blip_image_train"
eval:
name: "blip_image_eval"
text_processor:
train:
name: "blip_caption"
prompt: "a picture of "
eval:
name: "blip_caption"
images:
YOUR_LOCAL_IMAGE_ROOT_DIR
LAVIS supports using multiple datasets for training. See an example in :code:`lavis/projects/blip/train/pretrain_14m.yaml`.
Runner configurations
=========================
The last section of the config file specifies the arguments for the runner, shown below:
.. literalinclude:: ../lavis/projects/blip/train/caption_coco_large_ft.yaml
:language: yaml
:linenos:
:lines: 26-56
Here we specify runner-related arguments, including
- task-specific arguments, such as :code:`task`, :code:`max_len`, :code:`min_len`, etc.
- learning rate schedulers, optimizer;
- distributed training settings;
- logging and checkpointing settings.
Available Configurations
#########################
See :ref:`config` for the full list of available configurations and their descriptions.
================================================
FILE: evaluate.py
================================================
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import argparse
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import lavis.tasks as tasks
from lavis.common.config import Config
from lavis.common.dist_utils import get_rank, init_distributed_mode
from lavis.common.logger import setup_logger
from lavis.common.optims import (
LinearWarmupCosineLRScheduler,
LinearWarmupStepLRScheduler,
)
from lavis.common.utils import now
# imports modules for registration
from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.runners.runner_base import RunnerBase
from lavis.tasks import *
def parse_args():
    parser = argparse.ArgumentParser(description="Evaluation")
parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
parser.add_argument(
"--options",
nargs="+",
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file (deprecate), "
"change to --cfg-options instead.",
)
args = parser.parse_args()
# if 'LOCAL_RANK' not in os.environ:
# os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def setup_seeds(config):
seed = config.run_cfg.seed + get_rank()
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
cudnn.benchmark = False
cudnn.deterministic = True
def main():
    # allow automatic downloading to complete on the main process without timeout when using the NCCL backend.
# os.environ["NCCL_BLOCKING_WAIT"] = "1"
# set before init_distributed_mode() to ensure the same job_id shared across all ranks.
job_id = now()
cfg = Config(parse_args())
init_distributed_mode(cfg.run_cfg)
setup_seeds(cfg)
# set after init_distributed_mode() to only log on master.
setup_logger()
cfg.pretty_print()
task = tasks.setup_task(cfg)
datasets = task.build_datasets(cfg)
model = task.build_model(cfg)
runner = RunnerBase(
cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
)
runner.evaluate(skip_reload=True)
if __name__ == "__main__":
main()
================================================
FILE: lavis/__init__.py
================================================
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import sys
from omegaconf import OmegaConf
from lavis.common.registry import registry
from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.tasks import *
root_dir = os.path.dirname(os.path.abspath(__file__))
default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))
registry.register_path("library_root", root_dir)
repo_root = os.path.join(root_dir, "..")
registry.register_path("repo_root", repo_root)
cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
registry.register_path("cache_root", cache_root)
registry.register("MAX_INT", sys.maxsize)
registry.register("SPLIT_NAMES", ["train", "val", "test"])
================================================
FILE: lavis/common/config.py
================================================
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import logging
import json
from typing import Dict
from omegaconf import OmegaConf
from lavis.common.registry import registry
class Config:
def __init__(self, args):
self.config = {}
self.args = args
# Register the config and configuration for setup
registry.register("configuration", self)
user_config = self._build_opt_list(self.args.options)
config = OmegaConf.load(self.args.cfg_path)
runner_config = self.build_runner_config(config)
model_config = self.build_model_config(config, **user_config)
dataset_config = self.build_dataset_config(config)
# Validate the user-provided runner configuration
# model and dataset configuration are supposed to be validated by the respective classes
# [TODO] validate the model/dataset configuration
# self._validate_runner_config(runner_config)
# Override the default configuration with user options.
self.config = OmegaConf.merge(
runner_config, model_config, dataset_config, user_config
)
def _validate_runner_config(self, runner_config):
"""
This method validates the configuration, such that
1) all the user specified options are valid;
2) no type mismatches between the user specified options and the config.
"""
runner_config_validator = create_runner_config_validator()
runner_config_validator.validate(runner_config)
def _build_opt_list(self, opts):
opts_dot_list = self._convert_to_dot_list(opts)
return OmegaConf.from_dotlist(opts_dot_list)
@staticmethod
def build_model_config(config, **kwargs):
model = config.get("model", None)
assert model is not None, "Missing model configuration file."
model_cls = registry.get_model_class(model.arch)
assert model_cls is not None, f"Model '{model.arch}' has not been registered."
model_type = kwargs.get("model.model_type", None)
if not model_type:
model_type = model.get("model_type", None)
# else use the model type selected by user.
assert model_type is not None, "Missing model_type."
model_config_path = model_cls.default_config_path(model_type=model_type)
model_config = OmegaConf.create()
        # hierarchy override: customized config > default config
model_config = OmegaConf.merge(
model_config,
OmegaConf.load(model_config_path),
{"model": config["model"]},
)
return model_config
@staticmethod
def build_runner_config(config):
return {"run": config.run}
@staticmethod
def build_dataset_config(config):
datasets = config.get("datasets", None)
if datasets is None:
raise KeyError(
"Expecting 'datasets' as the root key for dataset configuration."
)
dataset_config = OmegaConf.create()
for dataset_name in datasets:
builder_cls = registry.get_builder_class(dataset_name)
dataset_config_type = datasets[dataset_name].get("type", "default")
dataset_config_path = builder_cls.default_config_path(
type=dataset_config_type
)
            # hierarchy override: customized config > default config
dataset_config = OmegaConf.merge(
dataset_config,
OmegaConf.load(dataset_config_path),
{"datasets": {dataset_name: config["datasets"][dataset_name]}},
)
return dataset_config
def _convert_to_dot_list(self, opts):
if opts is None:
opts = []
if len(opts) == 0:
return opts
has_equal = opts[0].find("=") != -1
if has_equal:
return opts
return [(opt + "=" + value) for opt, value in zip(opts[0::2], opts[1::2])]
def get_config(self):
return self.config
@property
def run_cfg(self):
return self.config.run
@property
def datasets_cfg(self):
return self.config.datasets
@property
def model_cfg(self):
return self.config.model
def pretty_print(self):
logging.info("\n===== Running Parameters =====")
logging.info(self._convert_node_to_json(self.config.run))
logging.info("\n====== Dataset Attributes ======")
datasets = self.config.datasets
for dataset in datasets:
if dataset in self.config.datasets:
logging.info(f"\n======== {dataset} =======")
dataset_config = self.config.datasets[dataset]
logging.info(self._convert_node_to_json(dataset_config))
else:
logging.warning(f"No dataset named '{dataset}' in config. Skipping")
logging.info(f"\n====== Model Attributes ======")
logging.info(self._convert_node_to_json(self.config.model))
def _convert_node_to_json(self, node):
container = OmegaConf.to_container(node, resolve=True)
return json.dumps(container, indent=4, sort_keys=True)
def to_dict(self):
return OmegaConf.to_container(self.config)
def node_to_dict(node):
return OmegaConf.to_container(node)
class ConfigValidator:
"""
This is a preliminary implementation to centralize and validate the configuration.
May be altered in the future.
A helper class to validate configurations from yaml file.
This serves the following purposes:
1. Ensure all the options in the yaml are defined, raise error if not.
2. when type mismatches are found, the validator will raise an error.
3. a central place to store and display helpful messages for supported configurations.
"""
class _Argument:
def __init__(self, name, choices=None, type=None, help=None):
self.name = name
self.val = None
self.choices = choices
self.type = type
self.help = help
def __str__(self):
s = f"{self.name}={self.val}"
if self.type is not None:
s += f", ({self.type})"
if self.choices is not None:
s += f", choices: {self.choices}"
if self.help is not None:
s += f", ({self.help})"
return s
def __init__(self, description):
self.description = description
self.arguments = dict()
self.parsed_args = None
def __getitem__(self, key):
assert self.parsed_args is not None, "No arguments parsed yet."
return self.parsed_args[key]
def __str__(self) -> str:
return self.format_help()
def add_argument(self, *args, **kwargs):
"""
Assume the first argument is the name of the argument.
"""
self.arguments[args[0]] = self._Argument(*args, **kwargs)
def validate(self, config=None):
"""
        Validate the given config (dict-like) against the registered arguments.
"""
for k, v in config.items():
assert (
k in self.arguments
), f"""{k} is not a valid argument. Support arguments are {self.format_arguments()}."""
if self.arguments[k].type is not None:
try:
self.arguments[k].val = self.arguments[k].type(v)
except ValueError:
raise ValueError(f"{k} is not a valid {self.arguments[k].type}.")
if self.arguments[k].choices is not None:
assert (
v in self.arguments[k].choices
), f"""{k} must be one of {self.arguments[k].choices}."""
return config
def format_arguments(self):
return str([f"{k}" for k in sorted(self.arguments.keys())])
def format_help(self):
# description + key-value pair string for each argument
help_msg = str(self.description)
return help_msg + ", available arguments: " + self.format_arguments()
def print_help(self):
# display help message
print(self.format_help())
def create_runner_config_validator():
validator = ConfigValidator(description="Runner configurations")
validator.add_argument(
"runner",
type=str,
choices=["runner_base", "runner_iter"],
help="""Runner to use. The "runner_base" uses epoch-based training while iter-based
runner runs based on iters. Default: runner_base""",
)
    # add arguments for training dataset ratios
validator.add_argument(
"train_dataset_ratios",
type=Dict[str, float],
help="""Ratios of training dataset. This is used in iteration-based runner.
Do not support for epoch-based runner because how to define an epoch becomes tricky.
Default: None""",
)
validator.add_argument(
"max_iters",
type=float,
help="Maximum number of iterations to run.",
)
validator.add_argument(
"max_epoch",
type=int,
help="Maximum number of epochs to run.",
)
# add arguments for iters_per_inner_epoch
validator.add_argument(
"iters_per_inner_epoch",
type=float,
help="Number of iterations per inner epoch. This is required when runner is runner_iter.",
)
lr_scheds_choices = registry.list_lr_schedulers()
validator.add_argument(
"lr_sched",
type=str,
choices=lr_scheds_choices,
help="Learning rate scheduler to use, from {}".format(lr_scheds_choices),
)
task_choices = registry.list_tasks()
validator.add_argument(
"task",
type=str,
choices=task_choices,
help="Task to use, from {}".format(task_choices),
)
# add arguments for init_lr
validator.add_argument(
"init_lr",
type=float,
help="Initial learning rate. This will be the learning rate after warmup and before decay.",
)
# add arguments for min_lr
validator.add_argument(
"min_lr",
type=float,
help="Minimum learning rate (after decay).",
)
# add arguments for warmup_lr
validator.add_argument(
"warmup_lr",
type=float,
help="Starting learning rate for warmup.",
)
# add arguments for learning rate decay rate
validator.add_argument(
"lr_decay_rate",
type=float,
help="Learning rate decay rate. Required if using a decaying learning rate scheduler.",
)
# add arguments for weight decay
validator.add_argument(
"weight_decay",
type=float,
help="Weight decay rate.",
)
# add arguments for training batch size
validator.add_argument(
"batch_size_train",
type=int,
help="Training batch size.",
)
# add arguments for evaluation batch size
validator.add_argument(
"batch_size_eval",
type=int,
help="Evaluation batch size, including validation and testing.",
)
# add arguments for number of workers for data loading
validator.add_argument(
"num_workers",
help="Number of workers for data loading.",
)
# add arguments for warm up steps
validator.add_argument(
"warmup_steps",
type=int,
help="Number of warmup steps. Required if a warmup schedule is used.",
)
# add arguments for random seed
validator.add_argument(
"seed",
type=int,
help="Random seed.",
)
# add arguments for output directory
validator.add_argument(
"output_dir",
type=str,
help="Output directory to save checkpoints and logs.",
)
# add arguments for whether only use evaluation
validator.add_argument(
"evaluate",
help="Whether to only evaluate the model. If true, training will not be performed.",
)
# add arguments for splits used for training, e.g. ["train", "val"]
validator.add_argument(
"train_splits",
type=list,
help="Splits to use for training.",
)
# add arguments for splits used for validation, e.g. ["val"]
validator.add_argument(
"valid_splits",
type=list,
help="Splits to use for validation. If not provided, will skip the validation.",
)
# add arguments for splits used for testing, e.g. ["test"]
validator.add_argument(
"test_splits",
type=list,
help="Splits to use for testing. If not provided, will skip the testing.",
)
# add arguments for accumulating gradient for iterations
validator.add_argument(
"accum_grad_iters",
type=int,
help="Number of iterations to accumulate gradient for.",
)
# ====== distributed training ======
validator.add_argument(
"device",
type=str,
choices=["cpu", "cuda"],
help="Device to use. Support 'cuda' or 'cpu' as for now.",
)
validator.add_argument(
"world_size",
type=int,
help="Number of processes participating in the job.",
)
validator.add_argument("dist_url", type=str)
validator.add_argument("distributed", type=bool)
# add arguments to opt using distributed sampler during evaluation or not
validator.add_argument(
"use_dist_eval_sampler",
type=bool,
help="Whether to use distributed sampler during evaluation or not.",
)
# ====== task specific ======
# generation task specific arguments
# add arguments for maximal length of text output
validator.add_argument(
"max_len",
type=int,
help="Maximal length of text output.",
)
# add arguments for minimal length of text output
validator.add_argument(
"min_len",
type=int,
help="Minimal length of text output.",
)
# add arguments number of beams
validator.add_argument(
"num_beams",
type=int,
help="Number of beams used for beam search.",
)
# vqa task specific arguments
# add arguments for number of answer candidates
validator.add_argument(
"num_ans_candidates",
type=int,
help="""For ALBEF and BLIP, these models first rank answers according to likelihood to select answer candidates.""",
)
# add arguments for inference method
validator.add_argument(
"inference_method",
type=str,
choices=["genearte", "rank"],
help="""Inference method to use for question answering. If rank, requires a answer list.""",
)
# ====== model specific ======
validator.add_argument(
"k_test",
type=int,
help="Number of top k most similar samples from ITC/VTC selection to be tested.",
)
return validator
================================================
FILE: lavis/common/dist_utils.py
================================================
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import datetime
import functools
import os
import torch
import torch.distributed as dist
import timm.models.hub as timm_hub
def setup_for_distributed(is_master):
"""
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop("force", False)
if is_master or force:
builtin_print(*args, **kwargs)
__builtin__.print = print
def is_dist_avail_and_initialized():
if not dist.is_available():
return False
if not dist.is_initialized():
return False
return True
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
return dist.get_world_size()
def get_rank():
if not is_dist_avail_and_initialized():
return 0
return dist.get_rank()
def is_main_process():
return get_rank() == 0
def init_distributed_mode(args):
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ["WORLD_SIZE"])
args.gpu = int(os.environ["LOCAL_RANK"])
elif "SLURM_PROCID" in os.environ:
args.rank = int(os.environ["SLURM_PROCID"])
args.gpu = args.rank % torch.cuda.device_count()
else:
print("Not using distributed mode")
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = "nccl"
print(
"| distributed init (rank {}, world {}): {}".format(
args.rank, args.world_size, args.dist_url
),
flush=True,
)
torch.distributed.init_process_group(
backend=args.dist_backend,
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank,
timeout=datetime.timedelta(
days=365
), # allow auto-downloading and de-compressing
)
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
def get_dist_info():
if torch.__version__ < "1.0":
initialized = dist._initialized
else:
initialized = dist.is_initialized()
if initialized:
rank = dist.get_rank()
world_size = dist.get_world_size()
else: # non-distributed training
rank = 0
world_size = 1
return rank, world_size
def main_process(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
rank, _ = get_dist_info()
if rank == 0:
return func(*args, **kwargs)
return wrapper
def download_cached_file(url, check_hash=True, progress=False):
"""
Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
"""
def get_cached_file_path():
# a hack to sync the file path across processes
parts = torch.hub.urlparse(url)
filename = os.path.basename(parts.path)
cached_file = os.path.join(timm_hub.get_cache_dir(), filename)
return cached_file
if is_main_process():
timm_hub.download_cached_file(url, check_hash, progress)
if is_dist_avail_and_initialized():
dist.barrier()
return get_cached_file_path()
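
A sketch (not from the repo) of how these helpers are typically driven. `torchrun` exports RANK, WORLD_SIZE, and LOCAL_RANK, which is the first branch `init_distributed_mode()` checks; without them it falls back to single-process mode:

# Launch sketch:  torchrun --nproc_per_node=2 demo.py
import argparse
from lavis.common.dist_utils import (
    get_rank,
    get_world_size,
    init_distributed_mode,
    main_process,
)

@main_process
def report(msg):
    print(msg)  # executes on rank 0 only; other ranks return None

if __name__ == "__main__":
    args = argparse.Namespace(dist_url="env://", distributed=True)
    init_distributed_mode(args)  # no-op fallback if the env vars are absent
    report("rank {} / world size {}".format(get_rank(), get_world_size()))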
================================================
FILE: lavis/common/gradcam.py
================================================
import numpy as np
from matplotlib import pyplot as plt
from scipy.ndimage import filters
from skimage import transform as skimage_transform
def getAttMap(img, attMap, blur=True, overlap=True):
attMap -= attMap.min()
if attMap.max() > 0:
attMap /= attMap.max()
attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
if blur:
attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
attMap -= attMap.min()
attMap /= attMap.max()
cmap = plt.get_cmap("jet")
attMapV = cmap(attMap)
attMapV = np.delete(attMapV, 3, 2)
if overlap:
attMap = (
1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
+ (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
)
return attMap
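
A small sketch (not in the repo) of feeding getAttMap a dummy image and attention grid; both arrays are floats in [0, 1], and the attention map is normalized in place and resized to the image's spatial size before blending:

# Sketch: overlay a synthetic 24x24 attention grid on a 224x224 image.
import numpy as np
from matplotlib import pyplot as plt
from lavis.common.gradcam import getAttMap

img = np.random.rand(224, 224, 3)  # stand-in for a normalized RGB image
att = np.random.rand(24, 24)       # e.g. patch-level attention scores

overlay = getAttMap(img, att, blur=True, overlap=True)  # H x W x 3 in [0, 1]
plt.imsave("attmap_overlay.png", overlay)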
================================================
FILE: lavis/common/logger.py
================================================
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import datetime
import logging
import time
from collections import defaultdict, deque
import torch
import torch.distributed as dist
from lavis.common import dist_utils
class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
def __init__(self, window_size=20, fmt=None):
if fmt is None:
fmt = "{median:.4f} ({global_avg:.4f})"
self.deque = deque(maxlen=window_size)
self.total = 0.0
self.count = 0
self.fmt = fmt
def update(self, value, n=1):
self.deque.append(value)
self.count += n
self.total += value * n
def synchronize_between_processes(self):
"""
Warning: does not synchronize the deque!
"""
if not dist_utils.is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
dist.barrier()
dist.all_reduce(t)
t = t.tolist()
self.count = int(t[0])
self.total = t[1]
@property
def median(self):
d = torch.tensor(list(self.deque))
return d.median().item()
@property
def avg(self):
d = torch.tensor(list(self.deque), dtype=torch.float32)
return d.mean().item()
@property
def global_avg(self):
return self.total / self.count
@property
def max(self):
return max(self.deque)
@property
def value(self):
return self.deque[-1]
def __str__(self):
return self.fmt.format(
median=self.median,
avg=self.avg,
global_avg=self.global_avg,
max=self.max,
value=self.value,
)
class MetricLogger(object):
def __init__(self, delimiter="\t"):
self.meters = defaultdict(SmoothedValue)
self.delimiter = delimiter
def update(self, **kwargs):
for k, v in kwargs.items():
if isinstance(v, torch.Tensor):
v = v.item()
assert isinstance(v, (float, int))
self.meters[k].update(v)
def __getattr__(self, attr):
if attr in self.meters:
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError(
"'{}' object has no attribute '{}'".format(type(self).__name__, attr)
)
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append("{}: {}".format(name, str(meter)))
return self.delimiter.join(loss_str)
def global_avg(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append("{}: {:.4f}".format(name, meter.global_avg))
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
for meter in self.meters.values():
meter.synchronize_between_processes()
def add_meter(self, name, meter):
self.meters[name] = meter
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
header = ""
start_time = time.time()
end = time.time()
iter_time = SmoothedValue(fmt="{avg:.4f}")
data_time = SmoothedValue(fmt="{avg:.4f}")
space_fmt = ":" + str(len(str(len(iterable)))) + "d"
log_msg = [
header,
"[{0" + space_fmt + "}/{1}]",
"eta: {eta}",
"{meters}",
"time: {time}",
"data: {data}",
]
if torch.cuda.is_available():
log_msg.append("max mem: {memory:.0f}")
log_msg = self.delimiter.join(log_msg)
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
yield obj
iter_time.update(time.time() - end)
if i % print_freq == 0 or i == len(iterable) - 1:
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(
log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self),
time=str(iter_time),
data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB,
)
)
else:
print(
log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self),
time=str(iter_time),
data=str(data_time),
)
)
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print(
"{} Total time: {} ({:.4f} s / it)".format(
header, total_time_str, total_time / len(iterable)
)
)
class AttrDict(dict):
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
def setup_logger():
logging.basicConfig(
level=logging.INFO if dist_utils.is_main_process() else logging.WARN,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[logging.StreamHandler()],
)
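
A usage sketch mirroring how training loops drive MetricLogger over a dataloader; the dummy loop and hyperparameters here are illustrative:

# Sketch: log smoothed loss/lr every 10 steps over a stand-in "dataloader".
import random
from lavis.common.logger import MetricLogger, SmoothedValue, setup_logger

setup_logger()
metric_logger = MetricLogger(delimiter="  ")
metric_logger.add_meter("lr", SmoothedValue(window_size=1, fmt="{value:.6f}"))

for step in metric_logger.log_every(range(100), print_freq=10, header="Train:"):
    loss = random.random()  # stand-in for a real training loss
    metric_logger.update(loss=loss, lr=1e-4)

print("Averaged stats:", metric_logger.global_avg())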
================================================
FILE: lavis/common/optims.py
================================================
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import math
from lavis.common.registry import registry
@registry.register_lr_scheduler("linear_warmup_step_lr")
class LinearWarmupStepLRScheduler:
def __init__(
self,
optimizer,
max_epoch,
min_lr,
init_lr,
decay_rate=1,
warmup_start_lr=-1,
warmup_steps=0,
**kwargs
):
self.optimizer = optimizer
self.max_epoch = max_epoch
self.min_lr = min_lr
self.decay_rate = decay_rate
self.init_lr = init_lr
self.warmup_steps = warmup_steps
self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
def step(self, cur_epoch, cur_step):
if cur_epoch == 0:
warmup_lr_schedule(
step=cur_step,
optimizer=self.optimizer,
max_step=self.warmup_steps,
init_lr=self.warmup_start_lr,
max_lr=self.init_lr,
)
else:
step_lr_schedule(
epoch=cur_epoch,
optimizer=self.optimizer,
init_lr=self.init_lr,
min_lr=self.min_lr,
decay_rate=self.decay_rate,
)
@registry.register_lr_scheduler("linear_warmup_cosine_lr")
class LinearWarmupCosineLRScheduler:
def __init__(
self,
optimizer,
max_epoch,
min_lr,
init_lr,
warmup_steps=0,
warmup_start_lr=-1,
**kwargs
):
self.optimizer = optimizer
self.max_epoch = max_epoch
self.min_lr = min_lr
self.init_lr = init_lr
self.warmup_steps = warmup_steps
self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
def step(self, cur_epoch, cur_step):
        # assume the warmup iterations take less than one epoch
if cur_epoch == 0:
warmup_lr_schedule(
step=cur_step,
optimizer=self.optimizer,
max_step=self.warmup_steps,
init_lr=self.warmup_start_lr,
max_lr=self.init_lr,
)
else:
cosine_lr_schedule(
epoch=cur_epoch,
optimizer=self.optimizer,
max_epoch=self.max_epoch,
init_lr=self.init_lr,
min_lr=self.min_lr,
)
def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
"""Decay the learning rate"""
lr = (init_lr - min_lr) * 0.5 * (
1.0 + math.cos(math.pi * epoch / max_epoch)
) + min_lr
for param_group in optimizer.param_groups:
param_group["lr"] = lr
def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
"""Warmup the learning rate"""
lr = min(max_lr, init_lr + (max_lr - init_lr) * step / max(max_step, 1))
for param_group in optimizer.param_groups:
param_group["lr"] = lr
def step_lr_schedule(optimizer, epoch, init_lr, min_lr, decay_rate):
"""Decay the learning rate"""
lr = max(min_lr, init_lr * (decay_rate**epoch))
for param_group in optimizer.param_groups:
param_group["lr"] = lr
================================================
FILE: lavis/common/registry.py
================================================
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
class Registry:
mapping = {
"builder_name_mapping": {},
"task_name_mapping": {},
"processor_name_mapping": {},
"model_name_mapping": {},
"lr_scheduler_name_mapping": {},
"runner_name_mapping": {},
"state": {},
"paths": {},
}
@classmethod
def register_builder(cls, name):
r"""Register a dataset builder to registry with key 'name'
Args:
name: Key with which the builder will be registered.
Usage:
from lavis.common.registry import registry
from lavis.datasets.base_dataset_builder import BaseDatasetBuilder
"""
def wrap(builder_cls):
from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
assert issubclass(
builder_cls, BaseDatasetBuilder
), "All builders must inherit BaseDatasetBuilder class, found {}".format(
builder_cls
)
if name in cls.mapping["builder_name_mapping"]:
raise KeyError(
"Name '{}' already registered for {}.".format(
name, cls.mapping["builder_name_mapping"][name]
)
)
cls.mapping["builder_name_mapping"][name] = builder_cls
return builder_cls
return wrap
@classmethod
def register_task(cls, name):
r"""Register a task to registry with key 'name'
Args:
name: Key with which the task will be registered.
Usage:
from lavis.common.registry import registry
"""
def wrap(task_cls):
from lavis.tasks.base_task import BaseTask
assert issubclass(
task_cls, BaseTask
), "All tasks must inherit BaseTask class"
if name in cls.mapping["task_name_mapping"]:
raise KeyError(
"Name '{}' already registered for {}.".format(
name, cls.mapping["task_name_mapping"][name]
)
)
cls.mapping["task_name_mapping"][name] = task_cls
return task_cls
return wrap
@classmethod
def register_model(cls, name):
r"""Register a task to registry with key 'name'
Args:
name: Key with which the task will be registered.
Usage:
from lavis.common.registry import registry
"""
def wrap(model_cls):
from lavis.models import BaseModel
assert issubclass(
model_cls, BaseModel
), "All models must inherit BaseModel class"
if name in cls.mapping["model_name_mapping"]:
raise KeyError(
"Name '{}' already registered for {}.".format(
name, cls.mapping["model_name_mapping"][name]
)
)
cls.mapping["model_name_mapping"][name] = model_cls
return model_cls
return wrap
@classmethod
def register_processor(cls, name):
r"""Register a processor to registry with key 'name'
Args:
            name: Key with which the processor will be registered.
Usage:
from lavis.common.registry import registry
"""
def wrap(processor_cls):
from lavis.processors import BaseProcessor
assert issubclass(
processor_cls, BaseProcessor
), "All processors must inherit BaseProcessor class"
if name in cls.mapping["processor_name_mapping"]:
raise KeyError(
"Name '{}' already registered for {}.".format(
name, cls.mapping["processor_name_mapping"][name]
)
)
cls.mapping["processor_name_mapping"][name] = processor_cls
return processor_cls
        return wrap
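
The registration/lookup round trip, sketched against the decorator usage visible in lavis/common/optims.py above; get_lr_scheduler_class appears in the symbol index below:

# Sketch: importing a module runs its @registry.register_* decorators, after
# which the class can be fetched back by name.
from lavis.common.registry import registry
import lavis.common.optims  # noqa: F401 -- registers the LR schedulers above

sched_cls = registry.get_lr_scheduler_class("linear_warmup_cosine_lr")
print(sched_cls)  # -> <class 'lavis.common.optims.LinearWarmupCosineLRScheduler'>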
================================================
SYMBOL INDEX (1548 symbols across 146 files)
================================================
FILE: app.py
function sevila_demo (line 60) | def sevila_demo(video,
FILE: app/__init__.py
function load_demo_image (line 16) | def load_demo_image():
FILE: app/calculate_coco_features.py
function load_demo_image (line 22) | def load_demo_image():
function read_img (line 31) | def read_img(filepath):
FILE: app/caption.py
function app (line 15) | def app():
function generate_caption (line 72) | def generate_caption(
FILE: app/classification.py
function load_demo_image (line 23) | def load_demo_image(img_url=None):
function load_model_cache (line 38) | def load_model_cache(model_type, device):
function app (line 63) | def app():
FILE: app/dataset_browser.py
function sample_dataset (line 26) | def sample_dataset(dataset, indices):
function get_concat_v (line 32) | def get_concat_v(im1, im2):
function resize_img_w (line 43) | def resize_img_w(raw_img, new_w=224):
function get_visual_key (line 58) | def get_visual_key(dataset):
function gather_items (line 69) | def gather_items(samples, exclude=[]):
function load_dataset_cache (line 84) | def load_dataset_cache(name):
function format_text (line 88) | def format_text(text):
function show_samples (line 94) | def show_samples(dataset, offset=0, is_next=False):
FILE: app/image_text_match.py
function app (line 19) | def app():
FILE: app/multimodal_search.py
function load_feat (line 34) | def load_feat():
function load_feature_extractor_model (line 61) | def load_feature_extractor_model(device):
function app (line 72) | def app():
function read_and_process_images (line 183) | def read_and_process_images(image_paths, vis_processor):
function compute_gradcam_batch (line 191) | def compute_gradcam_batch(model, visual_input, text_input, tokenized_tex...
FILE: app/multipage.py
class MultiPage (line 17) | class MultiPage:
method __init__ (line 20) | def __init__(self) -> None:
method add_page (line 24) | def add_page(self, title, func) -> None:
method run (line 34) | def run(self):
FILE: app/text_localization.py
function app (line 20) | def app():
FILE: app/utils.py
function resize_img (line 18) | def resize_img(raw_img):
function read_img (line 25) | def read_img(filepath):
function load_model_cache (line 39) | def load_model_cache(name, model_type, is_eval, device):
function init_bert_tokenizer (line 44) | def init_bert_tokenizer():
function getAttMap (line 49) | def getAttMap(img, attMap, blur=True, overlap=True):
function load_blip_itm_model (line 77) | def load_blip_itm_model(device, model_type="base"):
FILE: app/vqa.py
function app (line 15) | def app():
FILE: evaluate.py
function parse_args (line 33) | def parse_args():
function setup_seeds (line 52) | def setup_seeds(config):
function main (line 63) | def main():
FILE: lavis/common/config.py
class Config (line 16) | class Config:
method __init__ (line 17) | def __init__(self, args):
method _validate_runner_config (line 43) | def _validate_runner_config(self, runner_config):
method _build_opt_list (line 52) | def _build_opt_list(self, opts):
method build_model_config (line 57) | def build_model_config(config, **kwargs):
method build_runner_config (line 84) | def build_runner_config(config):
method build_dataset_config (line 88) | def build_dataset_config(config):
method _convert_to_dot_list (line 114) | def _convert_to_dot_list(self, opts):
method get_config (line 128) | def get_config(self):
method run_cfg (line 132) | def run_cfg(self):
method datasets_cfg (line 136) | def datasets_cfg(self):
method model_cfg (line 140) | def model_cfg(self):
method pretty_print (line 143) | def pretty_print(self):
method _convert_node_to_json (line 161) | def _convert_node_to_json(self, node):
method to_dict (line 165) | def to_dict(self):
function node_to_dict (line 169) | def node_to_dict(node):
class ConfigValidator (line 173) | class ConfigValidator:
class _Argument (line 187) | class _Argument:
method __init__ (line 188) | def __init__(self, name, choices=None, type=None, help=None):
method __str__ (line 195) | def __str__(self):
method __init__ (line 205) | def __init__(self, description):
method __getitem__ (line 212) | def __getitem__(self, key):
method __str__ (line 217) | def __str__(self) -> str:
method add_argument (line 220) | def add_argument(self, *args, **kwargs):
method validate (line 226) | def validate(self, config=None):
method format_arguments (line 248) | def format_arguments(self):
method format_help (line 251) | def format_help(self):
method print_help (line 256) | def print_help(self):
function create_runner_config_validator (line 261) | def create_runner_config_validator():
FILE: lavis/common/dist_utils.py
function setup_for_distributed (line 17) | def setup_for_distributed(is_master):
function is_dist_avail_and_initialized (line 33) | def is_dist_avail_and_initialized():
function get_world_size (line 41) | def get_world_size():
function get_rank (line 47) | def get_rank():
function is_main_process (line 53) | def is_main_process():
function init_distributed_mode (line 57) | def init_distributed_mode(args):
function get_dist_info (line 93) | def get_dist_info():
function main_process (line 107) | def main_process(func):
function download_cached_file (line 117) | def download_cached_file(url, check_hash=True, progress=False):
FILE: lavis/common/gradcam.py
function getAttMap (line 7) | def getAttMap(img, attMap, blur=True, overlap=True):
FILE: lavis/common/logger.py
class SmoothedValue (line 19) | class SmoothedValue(object):
method __init__ (line 24) | def __init__(self, window_size=20, fmt=None):
method update (line 32) | def update(self, value, n=1):
method synchronize_between_processes (line 37) | def synchronize_between_processes(self):
method median (line 51) | def median(self):
method avg (line 56) | def avg(self):
method global_avg (line 61) | def global_avg(self):
method max (line 65) | def max(self):
method value (line 69) | def value(self):
method __str__ (line 72) | def __str__(self):
class MetricLogger (line 82) | class MetricLogger(object):
method __init__ (line 83) | def __init__(self, delimiter="\t"):
method update (line 87) | def update(self, **kwargs):
method __getattr__ (line 94) | def __getattr__(self, attr):
method __str__ (line 103) | def __str__(self):
method global_avg (line 109) | def global_avg(self):
method synchronize_between_processes (line 115) | def synchronize_between_processes(self):
method add_meter (line 119) | def add_meter(self, name, meter):
method log_every (line 122) | def log_every(self, iterable, print_freq, header=None):
class AttrDict (line 184) | class AttrDict(dict):
method __init__ (line 185) | def __init__(self, *args, **kwargs):
function setup_logger (line 190) | def setup_logger():
FILE: lavis/common/optims.py
class LinearWarmupStepLRScheduler (line 14) | class LinearWarmupStepLRScheduler:
method __init__ (line 15) | def __init__(
method step (line 37) | def step(self, cur_epoch, cur_step):
class LinearWarmupCosineLRScheduler (line 57) | class LinearWarmupCosineLRScheduler:
method __init__ (line 58) | def __init__(
method step (line 77) | def step(self, cur_epoch, cur_step):
function cosine_lr_schedule (line 97) | def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
function warmup_lr_schedule (line 106) | def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
function step_lr_schedule (line 113) | def step_lr_schedule(optimizer, epoch, init_lr, min_lr, decay_rate):
FILE: lavis/common/registry.py
class Registry (line 9) | class Registry:
method register_builder (line 22) | def register_builder(cls, name):
method register_task (line 54) | def register_task(cls, name):
method register_model (line 83) | def register_model(cls, name):
method register_processor (line 112) | def register_processor(cls, name):
method register_lr_scheduler (line 141) | def register_lr_scheduler(cls, name):
method register_runner (line 165) | def register_runner(cls, name):
method register_path (line 189) | def register_path(cls, name, path):
method register (line 205) | def register(cls, name, obj):
method get_builder_class (line 232) | def get_builder_class(cls, name):
method get_model_class (line 236) | def get_model_class(cls, name):
method get_task_class (line 240) | def get_task_class(cls, name):
method get_processor_class (line 244) | def get_processor_class(cls, name):
method get_lr_scheduler_class (line 248) | def get_lr_scheduler_class(cls, name):
method get_runner_class (line 252) | def get_runner_class(cls, name):
method list_runners (line 256) | def list_runners(cls):
method list_models (line 260) | def list_models(cls):
method list_tasks (line 264) | def list_tasks(cls):
method list_processors (line 268) | def list_processors(cls):
method list_lr_schedulers (line 272) | def list_lr_schedulers(cls):
method list_datasets (line 276) | def list_datasets(cls):
method get_path (line 280) | def get_path(cls, name):
method get (line 284) | def get(cls, name, default=None, no_warning=False):
method unregister (line 315) | def unregister(cls, name):
FILE: lavis/common/utils.py
function now (line 35) | def now():
function is_url (line 41) | def is_url(url_or_filename):
function get_cache_path (line 46) | def get_cache_path(rel_path):
function get_abs_path (line 50) | def get_abs_path(rel_path):
function load_json (line 54) | def load_json(filename):
function makedir (line 64) | def makedir(dir_path):
function get_redirected_url (line 78) | def get_redirected_url(url: str):
function to_google_drive_download_url (line 93) | def to_google_drive_download_url(view_url: str) -> str:
function download_google_drive_url (line 108) | def download_google_drive_url(url: str, output_path: str, output_file_na...
function _get_google_drive_file_id (line 141) | def _get_google_drive_file_id(url: str) -> Optional[str]:
function _urlretrieve (line 154) | def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None:
function download_url (line 167) | def download_url(
function download_and_extract_archive (line 221) | def download_and_extract_archive(
function cache_url (line 242) | def cache_url(url: str, cache_dir: str) -> str:
function create_file_symlink (line 261) | def create_file_symlink(file1, file2):
function save_file (line 275) | def save_file(data, filename, append_to_json=True, verbose=True):
function load_file (line 313) | def load_file(filename, mmap_mode=None, verbose=True, allow_pickle=False):
function abspath (line 374) | def abspath(resource_path: str):
function makedir (line 386) | def makedir(dir_path):
function is_url (line 400) | def is_url(input_url):
function cleanup_dir (line 408) | def cleanup_dir(dir):
function get_file_size (line 419) | def get_file_size(filename):
FILE: lavis/common/vqa_tools/vqa.py
class VQA (line 31) | class VQA:
method __init__ (line 32) | def __init__(self, annotation_file=None, question_file=None):
method createIndex (line 53) | def createIndex(self):
method info (line 71) | def info(self):
method getQuesIds (line 79) | def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
method getImgIds (line 114) | def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
method loadQA (line 148) | def loadQA(self, ids=[]):
method showQA (line 159) | def showQA(self, anns):
method loadRes (line 173) | def loadRes(self, resFile, quesFile):
FILE: lavis/common/vqa_tools/vqa_eval.py
class VQAEval (line 18) | class VQAEval:
method __init__ (line 19) | def __init__(self, vqa=None, vqaRes=None, n=2):
method evaluate (line 193) | def evaluate(self, quesIds=None):
method processPunctuation (line 249) | def processPunctuation(self, inText):
method processDigitArticle (line 261) | def processDigitArticle(self, inText):
method setAccuracy (line 276) | def setAccuracy(self, accQA, accQuesType, accAnsType):
method setEvalQA (line 292) | def setEvalQA(self, quesId, acc):
method setEvalQuesType (line 295) | def setEvalQuesType(self, quesId, quesType, acc):
method setEvalAnsType (line 300) | def setEvalAnsType(self, quesId, ansType, acc):
method updateProgress (line 305) | def updateProgress(self, progress):
FILE: lavis/datasets/builders/__init__.py
function load_dataset (line 79) | def load_dataset(name, cfg_path=None, vis_path=None, data_type=None):
class DatasetZoo (line 117) | class DatasetZoo:
method __init__ (line 118) | def __init__(self) -> None:
method get_names (line 124) | def get_names(self):
FILE: lavis/datasets/builders/base_dataset_builder.py
class BaseDatasetBuilder (line 23) | class BaseDatasetBuilder:
method __init__ (line 26) | def __init__(self, cfg=None):
method build_datasets (line 42) | def build_datasets(self):
method build_processors (line 58) | def build_processors(self):
method _build_proc_from_cfg (line 77) | def _build_proc_from_cfg(cfg):
method default_config_path (line 85) | def default_config_path(cls, type="default"):
method _download_data (line 88) | def _download_data(self):
method _download_ann (line 92) | def _download_ann(self):
method _download_vis (line 149) | def _download_vis(self):
method build (line 163) | def build(self):
function load_dataset_config (line 229) | def load_dataset_config(cfg_path):
FILE: lavis/datasets/builders/caption_builder.py
class COCOCapBuilder (line 22) | class COCOCapBuilder(BaseDatasetBuilder):
class COCOCapBuilder (line 32) | class COCOCapBuilder(BaseDatasetBuilder):
class MSRVTTCapBuilder (line 41) | class MSRVTTCapBuilder(BaseDatasetBuilder):
class MSVDCapBuilder (line 51) | class MSVDCapBuilder(BaseDatasetBuilder):
class VATEXCapBuilder (line 61) | class VATEXCapBuilder(BaseDatasetBuilder):
FILE: lavis/datasets/builders/classification_builder.py
class NLVRBuilder (line 15) | class NLVRBuilder(BaseDatasetBuilder):
class SNLIVisualEntailmentBuilder (line 23) | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder):
FILE: lavis/datasets/builders/dialogue_builder.py
class AVSDDialBuilder (line 17) | class AVSDDialBuilder(BaseDatasetBuilder):
FILE: lavis/datasets/builders/image_text_pair_builder.py
class ConceptualCaption3MBuilder (line 17) | class ConceptualCaption3MBuilder(BaseDatasetBuilder):
class ConceptualCaption12MBuilder (line 26) | class ConceptualCaption12MBuilder(BaseDatasetBuilder):
class SBUCaptionBuilder (line 35) | class SBUCaptionBuilder(BaseDatasetBuilder):
class VGCaptionBuilder (line 42) | class VGCaptionBuilder(BaseDatasetBuilder):
class Laion2BMultiBuilder (line 49) | class Laion2BMultiBuilder(BaseDatasetBuilder):
method _download_ann (line 54) | def _download_ann(self):
method _download_vis (line 57) | def _download_vis(self):
method build (line 60) | def build(self):
FILE: lavis/datasets/builders/imagefolder_builder.py
class ImageNetBuilder (line 16) | class ImageNetBuilder(BaseDatasetBuilder):
method _download_ann (line 22) | def _download_ann(self):
method build (line 25) | def build(self):
FILE: lavis/datasets/builders/retrieval_builder.py
class MSRVTTRetrievalBuilder (line 20) | class MSRVTTRetrievalBuilder(BaseDatasetBuilder):
class DiDeMoRetrievalBuilder (line 28) | class DiDeMoRetrievalBuilder(BaseDatasetBuilder):
class COCORetrievalBuilder (line 36) | class COCORetrievalBuilder(BaseDatasetBuilder):
class Flickr30kBuilder (line 44) | class Flickr30kBuilder(BaseDatasetBuilder):
FILE: lavis/datasets/builders/video_qa_builder.py
class VideoQABuilder (line 14) | class VideoQABuilder(BaseDatasetBuilder):
method build (line 18) | def build(self):
class MCVideoQABuilder (line 32) | class MCVideoQABuilder(BaseDatasetBuilder):
method build (line 36) | def build(self):
class MSRVTTQABuilder (line 45) | class MSRVTTQABuilder(VideoQABuilder):
class MSVDQABuilder (line 52) | class MSVDQABuilder(VideoQABuilder):
class NextQABuilder (line 59) | class NextQABuilder(MCVideoQABuilder):
class STARBuilder (line 64) | class STARBuilder(MCVideoQABuilder):
class TVQABuilder (line 70) | class TVQABuilder(MCVideoQABuilder):
class How2QABuilder (line 76) | class How2QABuilder(MCVideoQABuilder):
class VLEPBuilder (line 82) | class VLEPBuilder(MCVideoQABuilder):
class QVHBuilder (line 88) | class QVHBuilder(MCVideoQABuilder):
FILE: lavis/datasets/builders/vqa_builder.py
class COCOVQABuilder (line 18) | class COCOVQABuilder(BaseDatasetBuilder):
class VGVQABuilder (line 29) | class VGVQABuilder(BaseDatasetBuilder):
class OKVQABuilder (line 35) | class OKVQABuilder(COCOVQABuilder):
class AOKVQABuilder (line 42) | class AOKVQABuilder(BaseDatasetBuilder):
class GQABuilder (line 50) | class GQABuilder(BaseDatasetBuilder):
FILE: lavis/datasets/data_utils.py
function load_video (line 29) | def load_video(video_path, n_frms=MAX_INT, height=-1, width=-1, sampling...
function load_video_demo (line 74) | def load_video_demo(video_path, n_frms=MAX_INT, height=-1, width=-1, sam...
function apply_to_sample (line 123) | def apply_to_sample(f, sample):
function move_to_cuda (line 140) | def move_to_cuda(sample):
function prepare_sample (line 147) | def prepare_sample(samples, cuda_enabled=True):
function reorg_datasets_by_split (line 156) | def reorg_datasets_by_split(datasets):
function concat_datasets (line 182) | def concat_datasets(datasets):
function extract_archive (line 249) | def extract_archive(from_path, to_path=None, overwrite=False):
function save_frames_grid (line 331) | def save_frames_grid(img_array, out_path):
FILE: lavis/datasets/datasets/aok_vqa_datasets.py
class __DisplMixin (line 18) | class __DisplMixin:
method displ_item (line 19) | def displ_item(self, index):
class AOKVQADataset (line 34) | class AOKVQADataset(VQADataset, __DisplMixin):
method __init__ (line 35) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 38) | def __getitem__(self, index):
class AOKVQAEvalDataset (line 67) | class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
method __init__ (line 68) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method collater (line 96) | def collater(self, samples):
method __getitem__ (line 126) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/avsd_dialogue_datasets.py
class AVSDDialDataset (line 15) | class AVSDDialDataset(DialogueDataset):
method __init__ (line 16) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 24) | def __getitem__(self, index):
method collater (line 45) | def collater(self, samples):
class AVSDDialEvalDataset (line 92) | class AVSDDialEvalDataset(DialogueEvalDataset):
method __init__ (line 93) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 101) | def __getitem__(self, index):
method collater (line 122) | def collater(self, samples):
FILE: lavis/datasets/datasets/base_dataset.py
class BaseDataset (line 16) | class BaseDataset(Dataset):
method __init__ (line 17) | def __init__(
method __len__ (line 44) | def __len__(self):
method collater (line 47) | def collater(self, samples):
method set_processors (line 50) | def set_processors(self, vis_processor, text_processor):
method _add_instance_ids (line 54) | def _add_instance_ids(self, key="instance_id"):
class ConcatDataset (line 62) | class ConcatDataset(ConcatDataset):
method __init__ (line 63) | def __init__(self, datasets: Iterable[Dataset]) -> None:
method collater (line 66) | def collater(self, samples):
FILE: lavis/datasets/datasets/caption_datasets.py
class __DisplMixin (line 15) | class __DisplMixin:
method displ_item (line 16) | def displ_item(self, index):
class CaptionDataset (line 28) | class CaptionDataset(BaseDataset, __DisplMixin):
method __init__ (line 29) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 44) | def __getitem__(self, index):
class CaptionEvalDataset (line 62) | class CaptionEvalDataset(BaseDataset, __DisplMixin):
method __init__ (line 63) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 71) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/coco_caption_datasets.py
class COCOCapEvalDataset (line 21) | class COCOCapEvalDataset(CaptionEvalDataset):
method __init__ (line 22) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 30) | def __getitem__(self, index):
class NoCapsEvalDataset (line 47) | class NoCapsEvalDataset(CaptionEvalDataset):
method __init__ (line 48) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 56) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/coco_vqa_datasets.py
class __DisplMixin (line 18) | class __DisplMixin:
method displ_item (line 19) | def displ_item(self, index):
class COCOVQADataset (line 33) | class COCOVQADataset(VQADataset, __DisplMixin):
method __init__ (line 34) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 37) | def __getitem__(self, index):
class COCOVQAEvalDataset (line 64) | class COCOVQAEvalDataset(VQAEvalDataset, __DisplMixin):
method __init__ (line 65) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 93) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/dataloader_utils.py
class MultiIterLoader (line 15) | class MultiIterLoader:
method __init__ (line 24) | def __init__(self, loaders, ratios=None):
method __next__ (line 40) | def __next__(self):
class PrefetchLoader (line 46) | class PrefetchLoader(object):
method __init__ (line 54) | def __init__(self, loader):
method __iter__ (line 58) | def __iter__(self):
method __len__ (line 73) | def __len__(self):
method preload (line 76) | def preload(self, it):
method next (line 101) | def next(self, it):
method __getattr__ (line 109) | def __getattr__(self, name):
function record_cuda_stream (line 114) | def record_cuda_stream(batch):
class IterLoader (line 127) | class IterLoader:
method __init__ (line 135) | def __init__(self, dataloader: DataLoader, use_distributed: bool = Fal...
method epoch (line 142) | def epoch(self) -> int:
method __next__ (line 145) | def __next__(self):
method __iter__ (line 158) | def __iter__(self):
method __len__ (line 161) | def __len__(self):
FILE: lavis/datasets/datasets/dialogue_datasets.py
class __DisplMixin (line 19) | class __DisplMixin:
method displ_item (line 20) | def displ_item(self, index):
class DialogueDataset (line 32) | class DialogueDataset(BaseDataset, __DisplMixin):
method __init__ (line 33) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 71) | def __getitem__(self, index):
class DialogueEvalDataset (line 88) | class DialogueEvalDataset(BaseDataset, __DisplMixin):
method __init__ (line 89) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 128) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/gqa_datasets.py
class __DisplMixin (line 18) | class __DisplMixin:
method displ_item (line 19) | def displ_item(self, index):
class GQADataset (line 33) | class GQADataset(VQADataset, __DisplMixin):
method __init__ (line 34) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 37) | def __getitem__(self, index):
class GQAEvalDataset (line 57) | class GQAEvalDataset(VQAEvalDataset, __DisplMixin):
method __init__ (line 58) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 80) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/image_text_pair_datasets.py
class __DisplMixin (line 15) | class __DisplMixin:
method displ_item (line 16) | def displ_item(self, index):
class ImageTextPairDataset (line 28) | class ImageTextPairDataset(BaseDataset, __DisplMixin):
method __init__ (line 29) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 36) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/imagefolder_dataset.py
class ImageFolderDataset (line 16) | class ImageFolderDataset(BaseDataset):
method __init__ (line 17) | def __init__(self, vis_processor, vis_root, classnames=[], **kwargs):
method __len__ (line 31) | def __len__(self):
method __getitem__ (line 34) | def __getitem__(self, index):
method displ_item (line 50) | def displ_item(self, index):
FILE: lavis/datasets/datasets/laion_dataset.py
class LaionDataset (line 12) | class LaionDataset(BaseDataset):
method __init__ (line 13) | def __init__(self, vis_processor, text_processor, location):
method to_dict (line 26) | def to_dict(self, sample):
function to_image_text_pair (line 36) | def to_image_text_pair(sample):
FILE: lavis/datasets/datasets/mc_video_vqa_datasets.py
class __DisplMixin (line 18) | class __DisplMixin:
method displ_item (line 19) | def displ_item(self, index):
class MCVideoQADataset (line 30) | class MCVideoQADataset(MultimodalClassificationDataset, __DisplMixin):
method __init__ (line 31) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method _load_auxiliary_mappings (line 34) | def _load_auxiliary_mappings(self):
method _get_answer_label (line 37) | def _get_answer_label(self, answer):
method __getitem__ (line 43) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/multimodal_classification_datasets.py
class MultimodalClassificationDataset (line 12) | class MultimodalClassificationDataset(BaseDataset):
method __init__ (line 13) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method _build_class_labels (line 19) | def _build_class_labels(self):
method _load_auxiliary_mappings (line 23) | def _load_auxiliary_mappings(self):
FILE: lavis/datasets/datasets/nlvr_datasets.py
class __DisplMixin (line 19) | class __DisplMixin:
method displ_item (line 20) | def displ_item(self, index):
class NLVRDataset (line 34) | class NLVRDataset(MultimodalClassificationDataset, __DisplMixin):
method __init__ (line 35) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method _build_class_labels (line 40) | def _build_class_labels(self):
method _flip (line 44) | def _flip(samples):
method __getitem__ (line 65) | def __getitem__(self, index):
class NLVREvalDataset (line 91) | class NLVREvalDataset(NLVRDataset):
method _flip (line 93) | def _flip(samples):
FILE: lavis/datasets/datasets/retrieval_datasets.py
class __DisplMixin (line 15) | class __DisplMixin:
method displ_item (line 16) | def displ_item(self, index):
class RetrievalDataset (line 29) | class RetrievalDataset(BaseDataset, __DisplMixin):
method __init__ (line 30) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 45) | def __getitem__(self, index):
class RetrievalEvalDataset (line 63) | class RetrievalEvalDataset(BaseDataset, __DisplMixin):
method __init__ (line 64) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 88) | def __getitem__(self, index):
class VideoRetrievalDataset (line 98) | class VideoRetrievalDataset(BaseDataset, __DisplMixin):
method __init__ (line 99) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 114) | def __getitem__(self, index):
class VideoRetrievalEvalDataset (line 131) | class VideoRetrievalEvalDataset(BaseDataset, __DisplMixin):
method __init__ (line 132) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 156) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/snli_ve_datasets.py
class __DisplMixin (line 17) | class __DisplMixin:
method displ_item (line 18) | def displ_item(self, index):
class SNLIVisualEntialmentDataset (line 31) | class SNLIVisualEntialmentDataset(MultimodalClassificationDataset, __Dis...
method __init__ (line 32) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method _build_class_labels (line 37) | def _build_class_labels(self):
method __getitem__ (line 40) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/vg_vqa_datasets.py
class VGVQADataset (line 15) | class VGVQADataset(VQADataset):
method __init__ (line 16) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 19) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/video_caption_datasets.py
class VideoCaptionDataset (line 14) | class VideoCaptionDataset(CaptionDataset):
method __init__ (line 15) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 23) | def __getitem__(self, index):
class VideoCaptionEvalDataset (line 41) | class VideoCaptionEvalDataset(BaseDataset):
method __init__ (line 42) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method __getitem__ (line 50) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/video_vqa_datasets.py
class __DisplMixin (line 17) | class __DisplMixin:
method displ_item (line 18) | def displ_item(self, index):
class VideoQADataset (line 28) | class VideoQADataset(MultimodalClassificationDataset, __DisplMixin):
method __init__ (line 29) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method _build_class_labels (line 32) | def _build_class_labels(self, ans_path):
method _get_answer_label (line 37) | def _get_answer_label(self, answer):
method __getitem__ (line 43) | def __getitem__(self, index):
FILE: lavis/datasets/datasets/vqa_datasets.py
class VQADataset (line 13) | class VQADataset(BaseDataset):
method __init__ (line 14) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
method collater (line 17) | def collater(self, samples):
class VQAEvalDataset (line 42) | class VQAEvalDataset(BaseDataset):
method __init__ (line 43) | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
FILE: lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py
function _df_split_apply (line 34) | def _df_split_apply(tup_arg):
function df_multiprocess (line 40) | def df_multiprocess(df, processes, chunk_size, func, dataset_name):
function _file_name (line 82) | def _file_name(row):
function check_mimetype (line 97) | def check_mimetype(row):
function check_download (line 106) | def check_download(row):
function resize_img (line 124) | def resize_img(req):
function download_image (line 134) | def download_image(row):
function open_tsv (line 174) | def open_tsv(fname, folder):
function df_from_shelve (line 184) | def df_from_shelve(chunk_size, func, dataset_name):
FILE: lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py
function _df_split_apply (line 34) | def _df_split_apply(tup_arg):
function df_multiprocess (line 40) | def df_multiprocess(df, processes, chunk_size, func, dataset_name):
function _file_name (line 82) | def _file_name(row):
function check_mimetype (line 97) | def check_mimetype(row):
function check_download (line 106) | def check_download(row):
function resize_img (line 124) | def resize_img(req):
function download_image (line 134) | def download_image(row):
function open_tsv (line 174) | def open_tsv(fname, folder):
function df_from_shelve (line 184) | def df_from_shelve(chunk_size, func, dataset_name):
FILE: lavis/datasets/download_scripts/download_coco.py
function download_datasets (line 29) | def download_datasets(root, url):
FILE: lavis/datasets/download_scripts/download_didemo.py
function download_datasets (line 23) | def download_datasets(root, url):
function move_files (line 31) | def move_files(download_path, storage_path):
FILE: lavis/datasets/download_scripts/download_flickr.py
function move_directory (line 32) | def move_directory(src_dir, dst_dir):
FILE: lavis/datasets/download_scripts/download_gqa.py
function download_datasets (line 24) | def download_datasets(root, url):
FILE: lavis/datasets/download_scripts/download_msrvtt.py
function download_datasets (line 38) | def download_datasets(root, url):
function merge_datasets (line 46) | def merge_datasets(download_path, storage_path):
FILE: lavis/datasets/download_scripts/download_msvd.py
function download_datasets (line 24) | def download_datasets(root, url):
function move_files (line 28) | def move_files(download_path, storage_path):
FILE: lavis/datasets/download_scripts/download_nocaps.py
function download_file (line 39) | def download_file(url, filename):
function download_image_from_url_val (line 63) | def download_image_from_url_val(url):
function download_image_from_url_test (line 70) | def download_image_from_url_test(url):
FILE: lavis/datasets/download_scripts/download_sbu.py
function fetch_single_image (line 28) | def fetch_single_image(image_url, timeout=None, retries=0):
function download_and_save_image (line 44) | def download_and_save_image(ann, save_dir, timeout=None, retries=0):
FILE: lavis/datasets/download_scripts/download_vg.py
function download_datasets (line 27) | def download_datasets(root, url):
FILE: lavis/models/__init__.py
function load_model (line 91) | def load_model(name, model_type, is_eval=False, device="cpu", checkpoint...
function load_preprocess (line 125) | def load_preprocess(config):
function load_model_and_preprocess (line 177) | def load_model_and_preprocess(name, model_type, is_eval=False, device="c...
class ModelZoo (line 226) | class ModelZoo:
method __init__ (line 237) | def __init__(self) -> None:
method __str__ (line 243) | def __str__(self) -> str:
method __iter__ (line 258) | def __iter__(self):
method __len__ (line 261) | def __len__(self):
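
These loaders are the package's main entry points. A usage sketch (the name/model_type strings below are illustrative; print model_zoo for the registered pairs):

    import torch
    from lavis.models import load_model_and_preprocess, model_zoo

    print(model_zoo)  # lists registered architectures and their model types

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, vis_processors, txt_processors = load_model_and_preprocess(
        name="blip2_t5", model_type="pretrain_flant5xl",
        is_eval=True, device=device,
    )
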
FILE: lavis/models/albef_models/__init__.py
class AlbefBase (line 25) | class AlbefBase(BaseModel):
method init_tokenizer (line 27) | def init_tokenizer(cls):
method load_from_pretrained (line 30) | def load_from_pretrained(self, url_or_filename, rename_text_keys=True):
function compute_sim_matrix (line 76) | def compute_sim_matrix(model, data_loader, **kwargs):
FILE: lavis/models/albef_models/albef_classification.py
class AlbefClassification (line 26) | class AlbefClassification(AlbefBase, MomentumDistilationMixin):
method __init__ (line 31) | def __init__(
method _rampup_factor (line 80) | def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
method forward (line 83) | def forward(self, samples, is_train=True):
method predict (line 149) | def predict(self, samples):
method from_config (line 154) | def from_config(cls, cfg=None):
FILE: lavis/models/albef_models/albef_feature_extractor.py
class AlbefFeatureExtractor (line 23) | class AlbefFeatureExtractor(AlbefBase):
method __init__ (line 28) | def __init__(self, image_encoder, text_encoder, embed_dim=256, max_txt...
method extract_features (line 49) | def extract_features(self, samples, mode="multimodal"):
method from_config (line 175) | def from_config(cls, cfg=None):
FILE: lavis/models/albef_models/albef_nlvr.py
class AlbefNLVR (line 24) | class AlbefNLVR(AlbefBase, MomentumDistilationMixin):
method __init__ (line 29) | def __init__(
method _rampup_factor (line 76) | def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
method forward (line 79) | def forward(self, samples, is_train=True):
method share_cross_attention (line 198) | def share_cross_attention(self, model):
method predict (line 213) | def predict(self, samples):
method load_from_pretrained (line 217) | def load_from_pretrained(self, url_or_filename, use_distill=True):
method from_config (line 227) | def from_config(cls, cfg=None):
FILE: lavis/models/albef_models/albef_outputs.py
class AlbefSimilarity (line 20) | class AlbefSimilarity(ModelOutput):
class AlbefIntermediateOutput (line 32) | class AlbefIntermediateOutput(ModelOutput):
class AlbefOutput (line 54) | class AlbefOutput(ModelOutput):
class AlbefOutputWithLogits (line 70) | class AlbefOutputWithLogits(AlbefOutput):
class AlbefOutputFeatures (line 76) | class AlbefOutputFeatures(ModelOutput):
FILE: lavis/models/albef_models/albef_pretrain.py
class AlbefPretrain (line 29) | class AlbefPretrain(AlbefBase, MomentumDistilationMixin, SharedQueueMixin):
method __init__ (line 41) | def __init__(
method _rampup_factor (line 102) | def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
method forward (line 105) | def forward(self, samples):
method mask (line 341) | def mask(
method from_config (line 386) | def from_config(cls, cfg=None):
FILE: lavis/models/albef_models/albef_retrieval.py
class AlbefRetrieval (line 26) | class AlbefRetrieval(AlbefBase, MomentumDistilationMixin, SharedQueueMix...
method __init__ (line 45) | def __init__(
method _rampup_factor (line 104) | def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
method forward (line 107) | def forward(self, samples):
method from_config (line 310) | def from_config(cls, cfg=None):
method compute_sim_matrix (line 338) | def compute_sim_matrix(self, data_loader, task_cfg):
FILE: lavis/models/albef_models/albef_vqa.py
class AlbefVQA (line 25) | class AlbefVQA(AlbefBase, MomentumDistilationMixin):
method __init__ (line 42) | def __init__(
method _rampup_factor (line 80) | def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
method forward (line 83) | def forward(self, samples):
method forward_encoder (line 137) | def forward_encoder(self, samples):
method forward_decoder (line 167) | def forward_decoder(self, samples, encoder_out, **kwargs):
method predict_answers (line 228) | def predict_answers(self, samples, answer_list, num_ans_candidates=128...
method rank_answers (line 269) | def rank_answers(self, samples, answer_list, num_ans_candidates):
method from_config (line 349) | def from_config(cls, cfg=None):
method load_from_pretrained (line 381) | def load_from_pretrained(self, url_or_filename):
FILE: lavis/models/alpro_models/__init__.py
class AlproBase (line 19) | class AlproBase(BaseModel):
method init_tokenizer (line 21) | def init_tokenizer(cls):
method load_from_pretrained (line 24) | def load_from_pretrained(self, url_or_filename, num_frames, num_patches):
function resize_spatial_embedding (line 78) | def resize_spatial_embedding(state_dict, key, num_patches):
function resize_temporal_embedding (line 95) | def resize_temporal_embedding(state_dict, key, num_frames):
FILE: lavis/models/alpro_models/alpro_outputs.py
class AlproSimilarity (line 19) | class AlproSimilarity(ModelOutput):
class AlproIntermediateOutput (line 28) | class AlproIntermediateOutput(ModelOutput):
class AlproOutput (line 42) | class AlproOutput(ModelOutput):
class AlproOutputWithLogits (line 58) | class AlproOutputWithLogits(AlproOutput):
FILE: lavis/models/alpro_models/alpro_qa.py
class AlproQA (line 25) | class AlproQA(AlproBase):
method __init__ (line 31) | def __init__(
method forward (line 53) | def forward(self, samples, is_train=True):
method predict (line 109) | def predict(self, samples):
method from_config (line 114) | def from_config(cls, cfg):
FILE: lavis/models/alpro_models/alpro_retrieval.py
class AlproRetrieval (line 30) | class AlproRetrieval(AlproBase):
method __init__ (line 36) | def __init__(
method forward (line 65) | def forward(self, samples):
method compute_vtm (line 150) | def compute_vtm(
method compute_sim_matrix (line 242) | def compute_sim_matrix(self, data_loader, task_cfg):
method from_config (line 397) | def from_config(cls, cfg):
FILE: lavis/models/base_model.py
class BaseModel (line 19) | class BaseModel(nn.Module):
method __init__ (line 22) | def __init__(self):
method device (line 26) | def device(self):
method load_checkpoint (line 29) | def load_checkpoint(self, url_or_filename):
method from_pretrained (line 59) | def from_pretrained(cls, model_type):
method default_config_path (line 75) | def default_config_path(cls, model_type):
method load_checkpoint_from_config (line 81) | def load_checkpoint_from_config(self, cfg, **kwargs):
method before_evaluation (line 102) | def before_evaluation(self, **kwargs):
method show_n_params (line 105) | def show_n_params(self, return_str=True):
class BaseEncoder (line 121) | class BaseEncoder(nn.Module):
method __init__ (line 126) | def __init__(self):
method forward_features (line 129) | def forward_features(self, samples, **kwargs):
method device (line 133) | def device(self):
class SharedQueueMixin (line 137) | class SharedQueueMixin:
method _dequeue_and_enqueue (line 139) | def _dequeue_and_enqueue(self, image_feat, text_feat, idxs=None):
class MomentumDistilationMixin (line 161) | class MomentumDistilationMixin:
method copy_params (line 163) | def copy_params(self):
method _momentum_update (line 172) | def _momentum_update(self):
class GatherLayer (line 182) | class GatherLayer(torch.autograd.Function):
method forward (line 189) | def forward(ctx, x):
method backward (line 197) | def backward(ctx, *grads):
function all_gather_with_grad (line 203) | def all_gather_with_grad(tensors):
function concat_all_gather (line 221) | def concat_all_gather(tensor):
function tile (line 239) | def tile(x, dim, n_tile):
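
GatherLayer and all_gather_with_grad implement the standard trick for contrastive training across GPUs: torch.distributed.all_gather drops gradients, so a custom autograd Function re-routes them in the backward pass. A sketch of that pattern, following the conventional implementation:

    import torch
    import torch.distributed as dist

    class GatherLayerSketch(torch.autograd.Function):
        """All-gather whose backward returns this rank's slice of the
        all-reduced gradients, so gradients reach every rank's inputs."""

        @staticmethod
        def forward(ctx, x):
            out = [torch.zeros_like(x) for _ in range(dist.get_world_size())]
            dist.all_gather(out, x)
            return tuple(out)

        @staticmethod
        def backward(ctx, *grads):
            g = torch.stack(grads)
            dist.all_reduce(g)
            return g[dist.get_rank()]

    def all_gather_with_grad_sketch(t):
        if not dist.is_initialized() or dist.get_world_size() == 1:
            return t
        return torch.cat(GatherLayerSketch.apply(t), dim=0)
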
FILE: lavis/models/blip2_models/Qformer.py
class BertEmbeddings (line 51) | class BertEmbeddings(nn.Module):
method __init__ (line 54) | def __init__(self, config):
method forward (line 78) | def forward(
class BertSelfAttention (line 111) | class BertSelfAttention(nn.Module):
method __init__ (line 112) | def __init__(self, config, is_cross_attention):
method save_attn_gradients (line 149) | def save_attn_gradients(self, attn_gradients):
method get_attn_gradients (line 152) | def get_attn_gradients(self):
method save_attention_map (line 155) | def save_attention_map(self, attention_map):
method get_attention_map (line 158) | def get_attention_map(self):
method transpose_for_scores (line 161) | def transpose_for_scores(self, x):
method forward (line 169) | def forward(
class BertSelfOutput (line 278) | class BertSelfOutput(nn.Module):
method __init__ (line 279) | def __init__(self, config):
method forward (line 285) | def forward(self, hidden_states, input_tensor):
class BertAttention (line 292) | class BertAttention(nn.Module):
method __init__ (line 293) | def __init__(self, config, is_cross_attention=False):
method prune_heads (line 299) | def prune_heads(self, heads):
method forward (line 322) | def forward(
class BertIntermediate (line 349) | class BertIntermediate(nn.Module):
method __init__ (line 350) | def __init__(self, config):
method forward (line 358) | def forward(self, hidden_states):
class BertOutput (line 364) | class BertOutput(nn.Module):
method __init__ (line 365) | def __init__(self, config):
method forward (line 371) | def forward(self, hidden_states, input_tensor):
class BertLayer (line 378) | class BertLayer(nn.Module):
method __init__ (line 379) | def __init__(self, config, layer_num):
method forward (line 402) | def forward(
method feed_forward_chunk (line 476) | def feed_forward_chunk(self, attention_output):
method feed_forward_chunk_query (line 481) | def feed_forward_chunk_query(self, attention_output):
class BertEncoder (line 487) | class BertEncoder(nn.Module):
method __init__ (line 488) | def __init__(self, config):
method forward (line 495) | def forward(
class BertPooler (line 592) | class BertPooler(nn.Module):
method __init__ (line 593) | def __init__(self, config):
method forward (line 598) | def forward(self, hidden_states):
class BertPredictionHeadTransform (line 607) | class BertPredictionHeadTransform(nn.Module):
method __init__ (line 608) | def __init__(self, config):
method forward (line 617) | def forward(self, hidden_states):
class BertLMPredictionHead (line 624) | class BertLMPredictionHead(nn.Module):
method __init__ (line 625) | def __init__(self, config):
method forward (line 638) | def forward(self, hidden_states):
class BertOnlyMLMHead (line 644) | class BertOnlyMLMHead(nn.Module):
method __init__ (line 645) | def __init__(self, config):
method forward (line 649) | def forward(self, sequence_output):
class BertPreTrainedModel (line 654) | class BertPreTrainedModel(PreTrainedModel):
method _init_weights (line 664) | def _init_weights(self, module):
class BertModel (line 677) | class BertModel(BertPreTrainedModel):
method __init__ (line 687) | def __init__(self, config, add_pooling_layer=False):
method get_input_embeddings (line 699) | def get_input_embeddings(self):
method set_input_embeddings (line 702) | def set_input_embeddings(self, value):
method _prune_heads (line 705) | def _prune_heads(self, heads_to_prune):
method get_extended_attention_mask (line 713) | def get_extended_attention_mask(
method forward (line 804) | def forward(
class BertLMHeadModel (line 968) | class BertLMHeadModel(BertPreTrainedModel):
method __init__ (line 973) | def __init__(self, config):
method get_output_embeddings (line 981) | def get_output_embeddings(self):
method set_output_embeddings (line 984) | def set_output_embeddings(self, new_embeddings):
method forward (line 987) | def forward(
method prepare_inputs_for_generation (line 1097) | def prepare_inputs_for_generation(
method _reorder_cache (line 1120) | def _reorder_cache(self, past, beam_idx):
class BertForMaskedLM (line 1131) | class BertForMaskedLM(BertPreTrainedModel):
method __init__ (line 1136) | def __init__(self, config):
method get_output_embeddings (line 1144) | def get_output_embeddings(self):
method set_output_embeddings (line 1147) | def set_output_embeddings(self, new_embeddings):
method forward (line 1150) | def forward(
FILE: lavis/models/blip2_models/blip2.py
class Blip2Base (line 27) | class Blip2Base(BaseModel):
method init_tokenizer (line 29) | def init_tokenizer(cls):
method init_Qformer (line 35) | def init_Qformer(cls, num_query_token, vision_width):
method init_TemporalQFormer (line 52) | def init_TemporalQFormer(cls, num_of_frame):
method init_vision_encoder (line 65) | def init_vision_encoder(
method init_vision_encoder_sevila (line 75) | def init_vision_encoder_sevila(
method load_from_pretrained (line 85) | def load_from_pretrained(self, url_or_filename):
method load_qformer_loc (line 105) | def load_qformer_loc(self):
function disabled_train (line 112) | def disabled_train(self, mode=True):
class LayerNorm (line 118) | class LayerNorm(nn.LayerNorm):
method forward (line 121) | def forward(self, x: torch.Tensor):
function compute_sim_matrix (line 127) | def compute_sim_matrix(model, data_loader, **kwargs):
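
disabled_train is a small but load-bearing helper: frozen submodules (e.g. the vision encoder) are put in eval mode and their .train() method is replaced so a later global model.train() cannot flip them back. The idiom, roughly:

    def disabled_train(self, mode=True):
        """No-op replacement for nn.Module.train on frozen submodules."""
        return self

    # typical freezing pattern:
    #   visual_encoder.eval()
    #   visual_encoder.train = disabled_train
    #   for p in visual_encoder.parameters():
    #       p.requires_grad = False
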
FILE: lavis/models/blip2_models/blip2_fmr.py
class Blip2FMR (line 20) | class Blip2FMR(Blip2Base):
method __init__ (line 38) | def __init__( self, img_size=224, drop_path_rate=0,
method forward (line 99) | def forward(self, samples):
method generate (line 162) | def generate(self,
method predict_answers (line 256) | def predict_answers(
method _lemmatize (line 323) | def _lemmatize(self, answers):
method lemmatizer (line 340) | def lemmatizer(self):
method from_config (line 361) | def from_config(cls, cfg):
FILE: lavis/models/blip2_models/blip2_image_text_matching.py
class Blip2ITM (line 15) | class Blip2ITM(Blip2Qformer):
method __init__ (line 27) | def __init__(
method forward (line 49) | def forward(self, samples, match_head="itm"):
FILE: lavis/models/blip2_models/blip2_opt.py
class Blip2OPT (line 20) | class Blip2OPT(Blip2Base):
method __init__ (line 40) | def __init__(
method forward (line 95) | def forward(self, samples):
method generate (line 151) | def generate(
method from_config (line 260) | def from_config(cls, cfg):
FILE: lavis/models/blip2_models/blip2_qformer.py
class Blip2Qformer (line 27) | class Blip2Qformer(Blip2Base):
method __init__ (line 43) | def __init__(
method forward (line 86) | def forward(self, samples):
method generate (line 257) | def generate(
method forward_image (line 319) | def forward_image(self, image):
method forward_text (line 335) | def forward_text(self, text_tokens):
method compute_itm (line 343) | def compute_itm(self, image_inputs, text_ids, text_atts):
method extract_features (line 366) | def extract_features(self, samples, mode="multimodal"):
method from_config (line 476) | def from_config(cls, cfg):
method compute_sim_matrix (line 500) | def compute_sim_matrix(self, data_loader, task_cfg):
FILE: lavis/models/blip2_models/blip2_t5.py
class Blip2T5 (line 20) | class Blip2T5(Blip2Base):
method __init__ (line 38) | def __init__(
method forward (line 99) | def forward(self, samples):
method generate (line 154) | def generate(
method predict_answers (line 262) | def predict_answers(
method _lemmatize (line 329) | def _lemmatize(self, answers):
method lemmatizer (line 346) | def lemmatizer(self):
method from_config (line 367) | def from_config(cls, cfg):
FILE: lavis/models/blip2_models/modeling_opt.py
function _make_causal_mask (line 72) | def _make_causal_mask(
function _expand_mask (line 93) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
class OPTLearnedPositionalEmbedding (line 109) | class OPTLearnedPositionalEmbedding(nn.Embedding):
method __init__ (line 114) | def __init__(self, num_embeddings: int, embedding_dim: int):
method forward (line 120) | def forward(
class OPTAttention (line 137) | class OPTAttention(nn.Module):
method __init__ (line 140) | def __init__(
method _shape (line 167) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
method forward (line 174) | def forward(
class OPTDecoderLayer (line 308) | class OPTDecoderLayer(nn.Module):
method __init__ (line 309) | def __init__(self, config: OPTConfig):
method forward (line 327) | def forward(
class OPTPreTrainedModel (line 432) | class OPTPreTrainedModel(PreTrainedModel):
method _init_weights (line 440) | def _init_weights(self, module):
method _set_gradient_checkpointing (line 451) | def _set_gradient_checkpointing(self, module, value=False):
class OPTDecoder (line 518) | class OPTDecoder(OPTPreTrainedModel):
method __init__ (line 526) | def __init__(self, config: OPTConfig):
method get_input_embeddings (line 571) | def get_input_embeddings(self):
method set_input_embeddings (line 574) | def set_input_embeddings(self, value):
method _prepare_decoder_attention_mask (line 578) | def _prepare_decoder_attention_mask(
method forward (line 604) | def forward(
class OPTModel (line 819) | class OPTModel(OPTPreTrainedModel):
method __init__ (line 820) | def __init__(self, config: OPTConfig):
method get_input_embeddings (line 826) | def get_input_embeddings(self):
method set_input_embeddings (line 829) | def set_input_embeddings(self, value):
method get_decoder (line 832) | def get_decoder(self):
method forward (line 843) | def forward(
class OPTForCausalLM (line 897) | class OPTForCausalLM(OPTPreTrainedModel):
method __init__ (line 900) | def __init__(self, config):
method get_input_embeddings (line 912) | def get_input_embeddings(self):
method set_input_embeddings (line 915) | def set_input_embeddings(self, value):
method get_output_embeddings (line 918) | def get_output_embeddings(self):
method set_output_embeddings (line 921) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 924) | def set_decoder(self, decoder):
method get_decoder (line 927) | def get_decoder(self):
method forward (line 933) | def forward(
method prepare_inputs_for_generation (line 1079) | def prepare_inputs_for_generation(
method _reorder_cache (line 1105) | def _reorder_cache(past, beam_idx):
FILE: lavis/models/blip2_models/modeling_t5.py
function load_tf_weights_in_t5 (line 79) | def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
class T5LayerNorm (line 254) | class T5LayerNorm(nn.Module):
method __init__ (line 255) | def __init__(self, hidden_size, eps=1e-6):
method forward (line 263) | def forward(self, hidden_states):
class T5DenseActDense (line 298) | class T5DenseActDense(nn.Module):
method __init__ (line 299) | def __init__(self, config: T5Config):
method forward (line 306) | def forward(self, hidden_states):
class T5DenseGatedActDense (line 314) | class T5DenseGatedActDense(nn.Module):
method __init__ (line 315) | def __init__(self, config: T5Config):
method forward (line 323) | def forward(self, hidden_states):
class T5LayerFF (line 332) | class T5LayerFF(nn.Module):
method __init__ (line 333) | def __init__(self, config: T5Config):
method forward (line 343) | def forward(self, hidden_states):
class T5Attention (line 350) | class T5Attention(nn.Module):
method __init__ (line 351) | def __init__(self, config: T5Config, has_relative_attention_bias=False):
method prune_heads (line 376) | def prune_heads(self, heads):
method _relative_position_bucket (line 393) | def _relative_position_bucket(
method compute_bias (line 447) | def compute_bias(self, query_length, key_length, device=None):
method forward (line 474) | def forward(
class T5LayerSelfAttention (line 623) | class T5LayerSelfAttention(nn.Module):
method __init__ (line 624) | def __init__(self, config, has_relative_attention_bias=False):
method forward (line 632) | def forward(
class T5LayerCrossAttention (line 659) | class T5LayerCrossAttention(nn.Module):
method __init__ (line 660) | def __init__(self, config):
method forward (line 666) | def forward(
class T5Block (line 697) | class T5Block(nn.Module):
method __init__ (line 698) | def __init__(self, config, has_relative_attention_bias=False):
method forward (line 712) | def forward(
class T5PreTrainedModel (line 829) | class T5PreTrainedModel(PreTrainedModel):
method dummy_inputs (line 843) | def dummy_inputs(self):
method _init_weights (line 853) | def _init_weights(self, module):
method _set_gradient_checkpointing (line 915) | def _set_gradient_checkpointing(self, module, value=False):
method _shift_right (line 919) | def _shift_right(self, input_ids):
class T5Stack (line 951) | class T5Stack(T5PreTrainedModel):
method __init__ (line 952) | def __init__(self, config, embed_tokens=None):
method parallelize (line 977) | def parallelize(self, device_map=None):
method deparallelize (line 1004) | def deparallelize(self):
method get_input_embeddings (line 1015) | def get_input_embeddings(self):
method set_input_embeddings (line 1018) | def set_input_embeddings(self, new_embeddings):
method forward (line 1021) | def forward(
class T5Model (line 1449) | class T5Model(T5PreTrainedModel):
method __init__ (line 1458) | def __init__(self, config: T5Config):
method parallelize (line 1482) | def parallelize(self, device_map=None):
method deparallelize (line 1494) | def deparallelize(self):
method get_input_embeddings (line 1503) | def get_input_embeddings(self):
method set_input_embeddings (line 1506) | def set_input_embeddings(self, new_embeddings):
method get_encoder (line 1511) | def get_encoder(self):
method get_decoder (line 1514) | def get_decoder(self):
method _prune_heads (line 1517) | def _prune_heads(self, heads_to_prune):
method forward (line 1529) | def forward(
class T5ForConditionalGeneration (line 1649) | class T5ForConditionalGeneration(T5PreTrainedModel):
method __init__ (line 1659) | def __init__(self, config: T5Config):
method parallelize (line 1687) | def parallelize(self, device_map=None):
method deparallelize (line 1700) | def deparallelize(self):
method get_input_embeddings (line 1710) | def get_input_embeddings(self):
method set_input_embeddings (line 1713) | def set_input_embeddings(self, new_embeddings):
method set_output_embeddings (line 1718) | def set_output_embeddings(self, new_embeddings):
method get_output_embeddings (line 1721) | def get_output_embeddings(self):
method get_encoder (line 1724) | def get_encoder(self):
method get_decoder (line 1727) | def get_decoder(self):
method forward (line 1734) | def forward(
method prepare_inputs_for_generation (line 1895) | def prepare_inputs_for_generation(
method prepare_decoder_input_ids_from_labels (line 1923) | def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
method _reorder_cache (line 1926) | def _reorder_cache(self, past, beam_idx):
class T5EncoderModel (line 1961) | class T5EncoderModel(T5PreTrainedModel):
method __init__ (line 1966) | def __init__(self, config: T5Config):
method parallelize (line 1983) | def parallelize(self, device_map=None):
method deparallelize (line 1994) | def deparallelize(self):
method get_input_embeddings (line 2001) | def get_input_embeddings(self):
method set_input_embeddings (line 2004) | def set_input_embeddings(self, new_embeddings):
method get_encoder (line 2008) | def get_encoder(self):
method _prune_heads (line 2011) | def _prune_heads(self, heads_to_prune):
method forward (line 2023) | def forward(
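
One detail worth knowing when reading this vendored T5: decoder inputs are produced from the labels by _shift_right, the usual teacher-forcing shift. Equivalent standalone logic, as a sketch following the stock Hugging Face behavior:

    import torch

    def shift_right(labels, decoder_start_token_id, pad_token_id):
        # prepend the decoder start token and drop the final label
        shifted = labels.new_zeros(labels.shape)
        shifted[..., 1:] = labels[..., :-1].clone()
        shifted[..., 0] = decoder_start_token_id
        # loss-ignored positions (-100) become real pad tokens
        shifted.masked_fill_(shifted == -100, pad_token_id)
        return shifted
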
FILE: lavis/models/blip_models/__init__.py
function tie_encoder_decoder_weights (line 14) | def tie_encoder_decoder_weights(
FILE: lavis/models/blip_models/blip.py
class BlipBase (line 19) | class BlipBase(BaseModel):
method init_tokenizer (line 21) | def init_tokenizer(cls):
method load_from_pretrained (line 28) | def load_from_pretrained(self, url_or_filename):
FILE: lavis/models/blip_models/blip_caption.py
class BlipCaption (line 21) | class BlipCaption(BlipBase):
method __init__ (line 40) | def __init__(self, image_encoder, text_decoder, prompt=None, max_txt_l...
method forward_encoder (line 53) | def forward_encoder(self, samples):
method forward_decoder (line 57) | def forward_decoder(self, samples, image_embeds):
method forward (line 90) | def forward(self, samples):
method generate (line 136) | def generate(
method from_config (line 207) | def from_config(cls, cfg):
FILE: lavis/models/blip_models/blip_classification.py
class BlipClassification (line 25) | class BlipClassification(BlipBase, MomentumDistilationMixin):
method __init__ (line 30) | def __init__(
method _rampup_factor (line 74) | def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
method forward (line 77) | def forward(self, samples, is_train=True):
method predict (line 142) | def predict(self, samples):
method from_config (line 147) | def from_config(cls, cfg=None):
FILE: lavis/models/blip_models/blip_feature_extractor.py
class BlipFeatureExtractor (line 21) | class BlipFeatureExtractor(BlipBase):
method __init__ (line 38) | def __init__(self, image_encoder, text_encoder, embed_dim, max_txt_len...
method extract_features (line 58) | def extract_features(self, samples, mode="multimodal"):
method from_config (line 190) | def from_config(cls, cfg=None):
FILE: lavis/models/blip_models/blip_image_text_matching.py
class BlipITM (line 19) | class BlipITM(BlipBase):
method __init__ (line 38) | def __init__(self, image_encoder, text_encoder, embed_dim=256, max_txt...
method forward (line 58) | def forward(self, samples, match_head="itm"):
method itm_rank (line 101) | def itm_rank(self, image_embeds, image_atts, encoder_input_ids, match_...
method from_config (line 132) | def from_config(cls, cfg=None):
function compute_gradcam (line 151) | def compute_gradcam(model, visual_input, text_input, tokenized_text, blo...
FILE: lavis/models/blip_models/blip_nlvr.py
class BlipNLVR (line 25) | class BlipNLVR(BlipBase, MomentumDistilationMixin):
method __init__ (line 42) | def __init__(self, image_encoder, text_encoder, num_classes):
method forward (line 56) | def forward(self, samples, is_train=True):
method predict (line 128) | def predict(self, samples):
method from_config (line 133) | def from_config(cls, cfg=None):
method load_from_pretrained (line 156) | def load_from_pretrained(self, url_or_filename):
FILE: lavis/models/blip_models/blip_outputs.py
class BlipSimilarity (line 20) | class BlipSimilarity(ModelOutput):
class BlipIntermediateOutput (line 32) | class BlipIntermediateOutput(ModelOutput):
class BlipOutput (line 73) | class BlipOutput(ModelOutput):
class BlipOutputWithLogits (line 89) | class BlipOutputWithLogits(BlipOutput):
class BlipOutputFeatures (line 95) | class BlipOutputFeatures(ModelOutput):
FILE: lavis/models/blip_models/blip_pretrain.py
class BlipPretrain (line 27) | class BlipPretrain(BlipBase, SharedQueueMixin, MomentumDistilationMixin):
method __init__ (line 40) | def __init__(
method _rampup_factor (line 111) | def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
method forward (line 114) | def forward(self, samples):
method reset_queue_ptr (line 362) | def reset_queue_ptr(self):
method from_config (line 366) | def from_config(cls, cfg=None):
FILE: lavis/models/blip_models/blip_retrieval.py
class BlipRetrieval (line 32) | class BlipRetrieval(BlipBase, MomentumDistilationMixin, SharedQueueMixin):
method __init__ (line 51) | def __init__(
method _rampup_factor (line 113) | def _rampup_factor(self, epoch, iters, num_iters_per_epoch):
method forward (line 116) | def forward(self, samples):
method reset_queue_ptr (line 357) | def reset_queue_ptr(self):
method from_config (line 361) | def from_config(cls, cfg=None):
method compute_sim_matrix (line 390) | def compute_sim_matrix(self, data_loader, task_cfg):
FILE: lavis/models/blip_models/blip_vqa.py
class BlipVQA (line 22) | class BlipVQA(BlipBase):
method __init__ (line 43) | def __init__(self, image_encoder, text_encoder, text_decoder, max_txt_...
method forward (line 54) | def forward(self, samples):
method forward_encoder (line 104) | def forward_encoder(self, samples):
method forward_decoder (line 123) | def forward_decoder(self, samples, encoder_out, **kwargs):
method predict_answers (line 162) | def predict_answers(
method _generate_answers (line 237) | def _generate_answers(self, samples, num_beams=3, max_length=10, min_l...
method _rank_answers (line 277) | def _rank_answers(self, samples, answer_list, num_ans_candidates):
method from_config (line 357) | def from_config(cls, cfg=None):
FILE: lavis/models/blip_models/nlvr_encoder.py
class BertEmbeddings (line 31) | class BertEmbeddings(nn.Module):
method __init__ (line 34) | def __init__(self, config):
method forward (line 58) | def forward(
class BertSelfAttention (line 90) | class BertSelfAttention(nn.Module):
method __init__ (line 91) | def __init__(self, config, is_cross_attention):
method save_attn_gradients (line 128) | def save_attn_gradients(self, attn_gradients):
method get_attn_gradients (line 131) | def get_attn_gradients(self):
method save_attention_map (line 134) | def save_attention_map(self, attention_map):
method get_attention_map (line 137) | def get_attention_map(self):
method transpose_for_scores (line 140) | def transpose_for_scores(self, x):
method forward (line 148) | def forward(
class BertSelfOutput (line 256) | class BertSelfOutput(nn.Module):
method __init__ (line 257) | def __init__(self, config, twin=False, merge=False):
method forward (line 273) | def forward(self, hidden_states, input_tensor):
class BertAttention (line 291) | class BertAttention(nn.Module):
method __init__ (line 292) | def __init__(self, config, is_cross_attention=False, layer_num=-1):
method prune_heads (line 306) | def prune_heads(self, heads):
method forward (line 329) | def forward(
class BertIntermediate (line 382) | class BertIntermediate(nn.Module):
method __init__ (line 383) | def __init__(self, config):
method forward (line 391) | def forward(self, hidden_states):
class BertOutput (line 397) | class BertOutput(nn.Module):
method __init__ (line 398) | def __init__(self, config):
method forward (line 404) | def forward(self, hidden_states, input_tensor):
class BertLayer (line 411) | class BertLayer(nn.Module):
method __init__ (line 412) | def __init__(self, config, layer_num):
method forward (line 428) | def forward(
method feed_forward_chunk (line 483) | def feed_forward_chunk(self, attention_output):
class BertEncoder (line 489) | class BertEncoder(nn.Module):
method __init__ (line 490) | def __init__(self, config):
method forward (line 498) | def forward(
class BertPooler (line 593) | class BertPooler(nn.Module):
method __init__ (line 594) | def __init__(self, config):
method forward (line 599) | def forward(self, hidden_states):
class BertPredictionHeadTransform (line 608) | class BertPredictionHeadTransform(nn.Module):
method __init__ (line 609) | def __init__(self, config):
method forward (line 618) | def forward(self, hidden_states):
class BertLMPredictionHead (line 625) | class BertLMPredictionHead(nn.Module):
method __init__ (line 626) | def __init__(self, config):
method forward (line 639) | def forward(self, hidden_states):
class BertOnlyMLMHead (line 645) | class BertOnlyMLMHead(nn.Module):
method __init__ (line 646) | def __init__(self, config):
method forward (line 650) | def forward(self, sequence_output):
class BertPreTrainedModel (line 655) | class BertPreTrainedModel(PreTrainedModel):
method _init_weights (line 665) | def _init_weights(self, module):
class BertModel (line 678) | class BertModel(BertPreTrainedModel):
method __init__ (line 688) | def __init__(self, config, add_pooling_layer=True):
method get_input_embeddings (line 700) | def get_input_embeddings(self):
method set_input_embeddings (line 703) | def set_input_embeddings(self, value):
method _prune_heads (line 706) | def _prune_heads(self, heads_to_prune):
method get_extended_attention_mask (line 714) | def get_extended_attention_mask(
method forward (line 792) | def forward(
FILE: lavis/models/clip_models/clip_outputs.py
class ClipOutputFeatures (line 19) | class ClipOutputFeatures(ModelOutput):
class ClipOutput (line 38) | class ClipOutput(ModelOutput):
FILE: lavis/models/clip_models/loss.py
function gather_features (line 20) | def gather_features(
class ClipLoss (line 78) | class ClipLoss(nn.Module):
method __init__ (line 79) | def __init__(
method forward (line 100) | def forward(self, image_features, text_features, logit_scale):
FILE: lavis/models/clip_models/model.py
class Bottleneck (line 50) | class Bottleneck(nn.Module):
method __init__ (line 53) | def __init__(self, inplanes, planes, stride=1):
method forward (line 93) | def forward(self, x: torch.Tensor):
class AttentionPool2d (line 109) | class AttentionPool2d(nn.Module):
method __init__ (line 110) | def __init__(
method forward (line 123) | def forward(self, x):
class ModifiedResNet (line 156) | class ModifiedResNet(nn.Module):
method __init__ (line 164) | def __init__(self, layers, output_dim, heads, image_size=224, width=64):
method _make_layer (line 195) | def _make_layer(self, planes, blocks, stride=1):
method init_parameters (line 204) | def init_parameters(self):
method lock (line 217) | def lock(self, unlocked_groups=0, freeze_bn_stats=False):
method stem (line 226) | def stem(self, x):
method forward (line 236) | def forward(self, x):
class LayerNorm (line 247) | class LayerNorm(nn.LayerNorm):
method forward (line 250) | def forward(self, x: torch.Tensor):
class QuickGELU (line 256) | class QuickGELU(nn.Module):
method forward (line 258) | def forward(self, x: torch.Tensor):
class ResidualAttentionBlock (line 262) | class ResidualAttentionBlock(nn.Module):
method __init__ (line 263) | def __init__(self, d_model: int, n_head: int, act_layer: Callable = nn...
method attention (line 279) | def attention(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor]...
method forward (line 282) | def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] =...
class Transformer (line 288) | class Transformer(nn.Module):
method __init__ (line 289) | def __init__(
method forward (line 302) | def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] =...
class VisualTransformer (line 308) | class VisualTransformer(nn.Module):
method __init__ (line 309) | def __init__(
method lock (line 342) | def lock(self, unlocked_groups=0, freeze_bn_stats=False):
method forward (line 349) | def forward(self, x: torch.Tensor):
class CLIPVisionCfg (line 379) | class CLIPVisionCfg:
class CLIPTextCfg (line 399) | class CLIPTextCfg:
class CLIP (line 409) | class CLIP(BaseModel):
method __init__ (line 418) | def __init__(
method loss (line 501) | def loss(self):
method init_parameters (line 516) | def init_parameters(self):
method build_attention_mask (line 538) | def build_attention_mask(self):
method lock_image_tower (line 546) | def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
method encode_image (line 552) | def encode_image(self, image):
method encode_text (line 555) | def encode_text(self, text):
method forward (line 571) | def forward(self, samples):
method extract_features (line 603) | def extract_features(self, samples):
method predict (line 640) | def predict(self, samples):
method before_evaluation (line 651) | def before_evaluation(self, dataset, task_type, **kwargs):
method zero_shot_classifier (line 658) | def zero_shot_classifier(self, classnames, templates):
method default_config_path (line 675) | def default_config_path(cls, model_type="base"):
method from_config (line 686) | def from_config(cls, cfg=None):
method zero_shot_predict (line 696) | def zero_shot_predict(self, image_path, categories):
method compute_sim_matrix (line 720) | def compute_sim_matrix(self, data_loader, **kwargs):
function convert_weights_to_fp16 (line 763) | def convert_weights_to_fp16(model: nn.Module):
function build_model_from_openai_state_dict (line 792) | def build_model_from_openai_state_dict(state_dict: dict):
function trace_model (line 873) | def trace_model(model, batch_size=256, device=torch.device("cpu")):
function _natural_key (line 892) | def _natural_key(string_):
function _rescan_model_configs (line 896) | def _rescan_model_configs():
function load_state_dict (line 923) | def load_state_dict(checkpoint_path: str, map_location="cpu"):
function create_model (line 934) | def create_model(
function create_model_and_transforms (line 1009) | def create_model_and_transforms(
function list_models (line 1032) | def list_models():
function add_model_config (line 1037) | def add_model_config(path):
function list_openai_models (line 1045) | def list_openai_models() -> List[str]:
function load_openai_model (line 1050) | def load_openai_model(
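
build_attention_mask (line 538) is CLIP's causal text mask: an additive mask with -inf above the diagonal so each token attends only to earlier positions. A sketch of that mask:

    import torch

    def build_causal_mask(context_length: int) -> torch.Tensor:
        mask = torch.empty(context_length, context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero on and below the diagonal, -inf above
        return mask
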
FILE: lavis/models/clip_models/pretrained.py
function list_pretrained (line 92) | def list_pretrained(as_str: bool = False):
function list_pretrained_tag_models (line 103) | def list_pretrained_tag_models(tag: str):
function list_pretrained_model_tags (line 112) | def list_pretrained_model_tags(model: str):
function get_pretrained_url (line 120) | def get_pretrained_url(model: str, tag: str):
function download_pretrained (line 130) | def download_pretrained(url: str, root: str = os.path.expanduser("~/.cac...
FILE: lavis/models/clip_models/timm_model.py
class TimmModel (line 37) | class TimmModel(nn.Module):
method __init__ (line 42) | def __init__(
method lock (line 91) | def lock(self, unlocked_groups=0, freeze_bn_stats=False):
method forward (line 124) | def forward(self, x):
class RotAttentionPool2d (line 130) | class RotAttentionPool2d(nn.Module):
method __init__ (line 139) | def __init__(
method forward (line 161) | def forward(self, x):
class AttentionPool2d (line 192) | class AttentionPool2d(nn.Module):
method __init__ (line 200) | def __init__(
method forward (line 227) | def forward(self, x):
function pixel_freq_bands (line 250) | def pixel_freq_bands(
function inv_freq_bands (line 266) | def inv_freq_bands(
function build_sincos2d_pos_embed (line 280) | def build_sincos2d_pos_embed(
function build_fourier_pos_embed (line 329) | def build_fourier_pos_embed(
class FourierEmbed (line 386) | class FourierEmbed(nn.Module):
method __init__ (line 387) | def __init__(
method forward (line 403) | def forward(self, x):
function rot (line 430) | def rot(x):
function apply_rot_embed (line 434) | def apply_rot_embed(x: torch.Tensor, sin_emb, cos_emb):
function apply_rot_embed_list (line 438) | def apply_rot_embed_list(x: List[torch.Tensor], sin_emb, cos_emb):
function apply_rot_embed_split (line 444) | def apply_rot_embed_split(x: torch.Tensor, emb):
function build_rotary_pos_embed (line 449) | def build_rotary_pos_embed(
class RotaryEmbedding (line 479) | class RotaryEmbedding(nn.Module):
method __init__ (line 488) | def __init__(self, dim, max_res=224, linear_bands: bool = False):
method get_embed (line 497) | def get_embed(self, shape: List[int]):
method forward (line 500) | def forward(self, x):
function _no_grad_trunc_normal_ (line 506) | def _no_grad_trunc_normal_(tensor, mean, std, a, b):
function trunc_normal_ (line 544) | def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
FILE: lavis/models/clip_models/tokenizer.py
function default_bpe (line 25) | def default_bpe():
function bytes_to_unicode (line 32) | def bytes_to_unicode():
function get_pairs (line 58) | def get_pairs(word):
function basic_clean (line 70) | def basic_clean(text):
function whitespace_clean (line 76) | def whitespace_clean(text):
class SimpleTokenizer (line 82) | class SimpleTokenizer(object):
method __init__ (line 83) | def __init__(self, bpe_path: str = default_bpe(), special_tokens=None):
method bpe (line 111) | def bpe(self, token):
method encode (line 152) | def encode(self, text):
method decode (line 162) | def decode(self, tokens):
function tokenize (line 175) | def tokenize(
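
The tokenizer follows the GPT-2/CLIP byte-level BPE recipe; get_pairs is the core primitive, enumerating the adjacent symbol pairs that merge ranks are looked up against. Its standard form:

    def get_pairs(word):
        # word is a tuple of symbols; return the set of adjacent pairs
        pairs = set()
        prev = word[0]
        for ch in word[1:]:
            pairs.add((prev, ch))
            prev = ch
        return pairs
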
FILE: lavis/models/clip_models/transform.py
class ResizeMaxSize (line 28) | class ResizeMaxSize(nn.Module):
method __init__ (line 29) | def __init__(
method forward (line 40) | def forward(self, img):
function _convert_to_rgb (line 64) | def _convert_to_rgb(image):
function image_transform (line 68) | def image_transform(
FILE: lavis/models/clip_models/utils.py
function freeze_batch_norm_2d (line 14) | def freeze_batch_norm_2d(module, module_match={}, name=""):
FILE: lavis/models/eva_vit.py
function _cfg (line 20) | def _cfg(url='', **kwargs):
class DropPath (line 30) | class DropPath(nn.Module):
method __init__ (line 33) | def __init__(self, drop_prob=None):
method forward (line 37) | def forward(self, x):
method extra_repr (line 40) | def extra_repr(self) -> str:
class Mlp (line 44) | class Mlp(nn.Module):
method __init__ (line 45) | def __init__(self, in_features, hidden_features=None, out_features=Non...
method forward (line 54) | def forward(self, x):
class Attention (line 64) | class Attention(nn.Module):
method __init__ (line 65) | def __init__(
method forward (line 118) | def forward(self, x, rel_pos_bias=None):
class Block (line 151) | class Block(nn.Module):
method __init__ (line 153) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
method forward (line 173) | def forward(self, x, rel_pos_bias=None):
class PatchEmbed (line 183) | class PatchEmbed(nn.Module):
method __init__ (line 186) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
method forward (line 198) | def forward(self, x, **kwargs):
class RelativePositionBias (line 207) | class RelativePositionBias(nn.Module):
method __init__ (line 209) | def __init__(self, window_size, num_heads):
method forward (line 238) | def forward(self):
class VisionTransformer (line 246) | class VisionTransformer(nn.Module):
method __init__ (line 249) | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classe...
method fix_init_weight (line 300) | def fix_init_weight(self):
method _init_weights (line 308) | def _init_weights(self, m):
method get_classifier (line 317) | def get_classifier(self):
method reset_classifier (line 320) | def reset_classifier(self, num_classes, global_pool=''):
method forward_features (line 324) | def forward_features(self, x):
method forward (line 349) | def forward(self, x):
method get_intermediate_layers (line 354) | def get_intermediate_layers(self, x):
function interpolate_pos_embed (line 373) | def interpolate_pos_embed(model, checkpoint_model):
function convert_weights_to_fp16 (line 397) | def convert_weights_to_fp16(model: nn.Module):
function create_eva_vit_g (line 415) | def create_eva_vit_g(img_size=224,drop_path_rate=0.4,use_checkpoint=Fals...
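
interpolate_pos_embed adapts a pretrained ViT checkpoint to a new resolution by bicubically resizing the patch position grid while leaving extra tokens (e.g. CLS) untouched. A self-contained sketch of the operation (the repo's function takes model/checkpoint arguments instead):

    import torch
    import torch.nn.functional as F

    def resize_pos_embed(pos_embed, new_num_patches, num_extra_tokens=1):
        # pos_embed: (1, extra + old_h*old_w, dim)
        extra = pos_embed[:, :num_extra_tokens]
        grid = pos_embed[:, num_extra_tokens:]
        old = int(grid.shape[1] ** 0.5)
        new = int(new_num_patches ** 0.5)
        grid = grid.reshape(1, old, old, -1).permute(0, 3, 1, 2)
        grid = F.interpolate(grid, size=(new, new),
                             mode="bicubic", align_corners=False)
        grid = grid.permute(0, 2, 3, 1).reshape(1, new * new, -1)
        return torch.cat([extra, grid], dim=1)
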
FILE: lavis/models/gpt_models/gpt_dialogue.py
class GPTDialogue (line 18) | class GPTDialogue(BaseModel, GPT2LMHeadModel):
method __init__ (line 22) | def __init__(self, config, len_video_ft=4224):
method forward (line 36) | def forward(
method from_config (line 107) | def from_config(cls, cfg):
FILE: lavis/models/img2prompt_models/img2prompt_vqa.py
class Img2PromptVQA (line 25) | class Img2PromptVQA(BaseModel):
method __init__ (line 46) | def __init__(
method forward_itm (line 63) | def forward_itm(self, samples, block_num=7):
method itm_rank (line 98) | def itm_rank(self, image_embeds, image_atts, encoder_input_ids, match_...
method forward_cap (line 133) | def forward_cap(
method answer_extraction (line 247) | def answer_extraction(self, caption, num_question_generation=30):
method forward_qa_generation (line 307) | def forward_qa_generation(self, samples):
method create_context_prompt (line 344) | def create_context_prompt(self, samples, num_caps_per_img=30):
method create_task_prompt (line 363) | def create_task_prompt(
method prompts_construction (line 432) | def prompts_construction(
method prepare_LLM_input (line 459) | def prepare_LLM_input(
method from_config (line 550) | def from_config(cls, model_config):
FILE: lavis/models/med.py
class BertEmbeddings (line 56) | class BertEmbeddings(nn.Module):
method __init__ (line 59) | def __init__(self, config):
method forward (line 88) | def forward(
class BertSelfAttention (line 126) | class BertSelfAttention(nn.Module):
method __init__ (line 127) | def __init__(self, config, is_cross_attention):
method save_attn_gradients (line 164) | def save_attn_gradients(self, attn_gradients):
method get_attn_gradients (line 167) | def get_attn_gradients(self):
method save_attention_map (line 170) | def save_attention_map(self, attention_map):
method get_attention_map (line 173) | def get_attention_map(self):
method transpose_for_scores (line 176) | def transpose_for_scores(self, x):
method forward (line 184) | def forward(
class BertSelfOutput (line 292) | class BertSelfOutput(nn.Module):
method __init__ (line 293) | def __init__(self, config):
method forward (line 299) | def forward(self, hidden_states, input_tensor):
class BertAttention (line 306) | class BertAttention(nn.Module):
method __init__ (line 307) | def __init__(self, config, is_cross_attention=False):
method prune_heads (line 313) | def prune_heads(self, heads):
method forward (line 336) | def forward(
class BertIntermediate (line 362) | class BertIntermediate(nn.Module):
method __init__ (line 363) | def __init__(self, config):
method forward (line 371) | def forward(self, hidden_states):
class BertOutput (line 377) | class BertOutput(nn.Module):
method __init__ (line 378) | def __init__(self, config):
method forward (line 384) | def forward(self, hidden_states, input_tensor):
class BertLayer (line 391) | class BertLayer(nn.Module):
method __init__ (line 392) | def __init__(self, config, layer_num):
method forward (line 422) | def forward(
method feed_forward_chunk (line 499) | def feed_forward_chunk(self, attention_output):
class BertEncoder (line 505) | class BertEncoder(nn.Module):
method __init__ (line 506) | def __init__(self, config):
method forward (line 514) | def forward(
class BertPooler (line 633) | class BertPooler(nn.Module):
method __init__ (line 634) | def __init__(self, config):
method forward (line 639) | def forward(self, hidden_states):
class BertPredictionHeadTransform (line 648) | class BertPredictionHeadTransform(nn.Module):
method __init__ (line 649) | def __init__(self, config):
method forward (line 658) | def forward(self, hidden_states):
class BertLMPredictionHead (line 665) | class BertLMPredictionHead(nn.Module):
method __init__ (line 666) | def __init__(self, config):
method forward (line 679) | def forward(self, hidden_states):
class BertOnlyMLMHead (line 685) | class BertOnlyMLMHead(nn.Module):
method __init__ (line 686) | def __init__(self, config):
method forward (line 690) | def forward(self, sequence_output):
class BertPreTrainedModel (line 695) | class BertPreTrainedModel(PreTrainedModel):
method _init_weights (line 705) | def _init_weights(self, module):
class BertModel (line 718) | class BertModel(BertPreTrainedModel):
method __init__ (line 728) | def __init__(self, config, add_pooling_layer=True):
method get_input_embeddings (line 740) | def get_input_embeddings(self):
method set_input_embeddings (line 743) | def set_input_embeddings(self, value):
method _prune_heads (line 746) | def _prune_heads(self, heads_to_prune):
method get_extended_attention_mask (line 754) | def get_extended_attention_mask(
method forward (line 832) | def forward(
class BertForMaskedLM (line 1005) | class BertForMaskedLM(BertPreTrainedModel):
method __init__ (line 1010) | def __init__(self, config):
method get_output_embeddings (line 1018) | def get_output_embeddings(self):
method set_output_embeddings (line 1021) | def set_output_embeddings(self, new_embeddings):
method forward (line 1024) | def forward(
method prepare_inputs_for_generation (line 1106) | def prepare_inputs_for_generation(
class BertLMHeadModel (line 1131) | class BertLMHeadModel(BertPreTrainedModel):
method __init__ (line 1136) | def __init__(self, config):
method get_output_embeddings (line 1144) | def get_output_embeddings(self):
method set_output_embeddings (line 1147) | def set_output_embeddings(self, new_embeddings):
method forward (line 1150) | def forward(
method prepare_inputs_for_generation (line 1266) | def prepare_inputs_for_generation(
method _reorder_cache (line 1287) | def _reorder_cache(self, past, beam_idx):
class XBertLMHeadDecoder (line 1298) | class XBertLMHeadDecoder(BertLMHeadModel):
method from_config (line 1306) | def from_config(cls, cfg, from_pretrained=False):
method generate_from_encoder (line 1316) | def generate_from_encoder(
class XBertEncoder (line 1374) | class XBertEncoder(BertModel, BaseEncoder):
method from_config (line 1376) | def from_config(cls, cfg, from_pretrained=False):
method forward_automask (line 1388) | def forward_automask(self, tokenized_text, visual_embeds, **kwargs):
method forward_text (line 1404) | def forward_text(self, tokenized_text, **kwargs):
FILE: lavis/models/pnp_vqa_models/__init__.py
function prepare_qa_input (line 11) | def prepare_qa_input(sample, num_captions, num_captions_fid):
FILE: lavis/models/pnp_vqa_models/pnp_unifiedqav2_fid.py
class PNPUnifiedQAv2FiD (line 20) | class PNPUnifiedQAv2FiD(T5ForConditionalGeneration, BaseModel):
method __init__ (line 24) | def __init__(self, config, model_path):
method forward (line 29) | def forward(self, input_ids=None, attention_mask=None, **kwargs):
method generate (line 43) | def generate(self, input_ids, attention_mask, num_beams=1, min_length=...
method load_unifiedqa (line 54) | def load_unifiedqa(self, state_dict):
method from_config (line 59) | def from_config(cls, cfg):
class T5EncoderWrapper (line 69) | class T5EncoderWrapper(torch.nn.Module):
method __init__ (line 71) | def __init__(self, encoder):
method forward (line 79) | def forward(self, input_ids=None, attention_mask=None, **kwargs):
FILE: lavis/models/pnp_vqa_models/pnp_vqa.py
class PNPVQA (line 21) | class PNPVQA(BaseModel):
method __init__ (line 45) | def __init__(self, image_question_matching_model, image_captioning_model,
method forward_itm (line 54) | def forward_itm(self, samples, block_num=7):
method forward_cap (line 84) | def forward_cap(
method forward_qa (line 174) | def forward_qa(
method predict_answers (line 232) | def predict_answers(
method from_config (line 321) | def from_config(cls, model_config):
FILE: lavis/models/sevila_models/sevila.py
class SeViLA (line 20) | class SeViLA(Blip2Base):
method __init__ (line 37) | def __init__( self, img_size=224, drop_path_rate=0,
method forward (line 126) | def forward(self, samples,
method generate (line 438) | def generate(self,
method generate_demo (line 691) | def generate_demo(self,
method predict_answers (line 872) | def predict_answers(
method _lemmatize (line 939) | def _lemmatize(self, answers):
method lemmatizer (line 956) | def lemmatizer(self):
method from_config (line 977) | def from_config(cls, cfg):
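
SeViLA is the repository's headline model: a BLIP-2-based localizer scores each frame for relevance to the question, the top frames are kept, and a BLIP-2 answerer reads only those keyframes. The control flow, reduced to pseudocode (score_frame and answer_from are hypothetical stand-ins for the two stages, not methods of this class):

    import torch

    def sevila_style_answer(frames, question, score_frame, answer_from, k=4):
        # frames: (T, D) per-frame features; keep the k best in temporal order
        scores = torch.stack([score_frame(f, question) for f in frames])
        keep = scores.topk(min(k, frames.size(0))).indices.sort().values
        return answer_from(frames[keep], question)
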
FILE: lavis/models/timesformer/conv2d_same.py
function pad_same (line 24) | def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value...
function get_same_padding (line 39) | def get_same_padding(x: int, k: int, s: int, d: int):
function get_padding_value (line 43) | def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bo...
function conv2d_same (line 66) | def conv2d_same(
class Conv2dSame (line 79) | class Conv2dSame(nn.Conv2d):
method __init__ (line 82) | def __init__(
method forward (line 97) | def forward(self, x):
function create_conv2d_pad (line 109) | def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
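
This file exists to emulate TensorFlow "SAME" padding in PyTorch. The key arithmetic is get_same_padding: the total padding that makes a strided convolution cover the input exactly:

    import math

    def get_same_padding(x: int, k: int, s: int, d: int) -> int:
        # input length x, kernel k, stride s, dilation d
        return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0)
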
FILE: lavis/models/timesformer/features.py
class FeatureInfo (line 21) | class FeatureInfo:
method __init__ (line 22) | def __init__(self, feature_info: List[Dict], out_indices: Tuple[int]):
method from_other (line 33) | def from_other(self, out_indices: Tuple[int]):
method get (line 36) | def get(self, key, idx=None):
method get_dicts (line 49) | def get_dicts(self, keys=None, idx=None):
method channels (line 66) | def channels(self, idx=None):
method reduction (line 70) | def reduction(self, idx=None):
method module_name (line 74) | def module_name(self, idx=None):
method __getitem__ (line 78) | def __getitem__(self, item):
method __len__ (line 81) | def __len__(self):
class FeatureHooks (line 85) | class FeatureHooks:
method __init__ (line 92) | def __init__(self, hooks, named_modules, out_map=None, default_hook_ty...
method _collect_output_hook (line 109) | def _collect_output_hook(self, hook_id, *args):
method get_output (line 117) | def get_output(self, device) -> Dict[str, torch.tensor]:
function _module_list (line 123) | def _module_list(module, flatten_sequential=False):
function _get_feature_info (line 137) | def _get_feature_info(net, out_indices):
function _get_return_layers (line 147) | def _get_return_layers(feature_info, out_map):
class FeatureDictNet (line 157) | class FeatureDictNet(nn.ModuleDict):
method __init__ (line 178) | def __init__(
method _collect (line 207) | def _collect(self, x) -> (Dict[str, torch.Tensor]):
method forward (line 221) | def forward(self, x) -> Dict[str, torch.Tensor]:
class FeatureListNet (line 225) | class FeatureListNet(FeatureDictNet):
method __init__ (line 231) | def __init__(
method forward (line 247) | def forward(self, x) -> (List[torch.Tensor]):
class FeatureHookNet (line 251) | class FeatureHookNet(nn.ModuleDict):
method __init__ (line 261) | def __init__(
method forward (line 304) | def forward(self, x):
FILE: lavis/models/timesformer/helpers.py
function load_state_dict (line 24) | def load_state_dict(checkpoint_path, use_ema=False):
function load_checkpoint (line 57) | def load_checkpoint(model, checkpoint_path, use_ema=False, strict=True):
function load_pretrained (line 102) | def load_pretrained(
function load_pretrained_imagenet (line 235) | def load_pretrained_imagenet(
function load_pretrained_kinetics (line 299) | def load_pretrained_kinetics(
function resize_spatial_embedding (line 353) | def resize_spatial_embedding(state_dict, key, num_patches):
function resize_temporal_embedding (line 370) | def resize_temporal_embedding(state_dict, key, num_frames):
function detach_variable (line 381) | def detach_variable(inputs):
function check_backward_validity (line 396) | def check_backward_validity(inputs):
FILE: lavis/models/timesformer/linear.py
class Linear (line 15) | class Linear(nn.Linear):
method forward (line 16) | def forward(self, input: torch.Tensor) -> torch.Tensor:
FILE: lavis/models/timesformer/vit.py
function _cfg (line 35) | def _cfg(url="", **kwargs):
class Mlp (line 60) | class Mlp(nn.Module):
method __init__ (line 61) | def __init__(
method forward (line 77) | def forward(self, x):
class Attention (line 86) | class Attention(nn.Module):
method __init__ (line 87) | def __init__(
method forward (line 108) | def forward(self, x):
class Block (line 134) | class Block(nn.Module):
method __init__ (line 135) | def __init__(
method forward (line 202) | def forward(self, x, B, T, W):
class PatchEmbed (line 263) | class PatchEmbed(nn.Module):
method __init__ (line 266) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
method forward (line 279) | def forward(self, x):
class VisionTransformer (line 288) | class VisionTransformer(nn.Module):
method __init__ (line 291) | def __init__(
method _init_weights (line 385) | def _init_weights(self, m):
method no_weight_decay (line 395) | def no_weight_decay(self):
method get_classifier (line 398) | def get_classifier(self):
method reset_classifier (line 401) | def reset_classifier(self, num_classes, global_pool=""):
method remove_classifier (line 407) | def remove_classifier(self):
method forward_features (line 411) | def forward_features(self, x):
method forward (line 464) | def forward(self, x):
function _conv_filter (line 470) | def _conv_filter(state_dict, patch_size=16):
class vit_base_patch16_224 (line 482) | class vit_base_patch16_224(nn.Module):
method __init__ (line 483) | def __init__(self, cfg, **kwargs):
method forward (line 523) | def forward(self, x):
class TimeSformer (line 528) | class TimeSformer(nn.Module):
method __init__ (line 529) | def __init__(
method forward (line 592) | def forward(self, x):
method forward_features (line 596) | def forward_features(self, x):
method load_state_dict (line 614) | def load_state_dict(self, pretrained_ckpt_path):
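
PatchEmbed above is the standard ViT tokenizer: a patch_size-strided convolution that turns an image into a sequence of patch tokens (TimeSformer applies it per frame). A generic sketch, independent of the repo's exact defaults:

import torch
import torch.nn as nn

class PatchEmbedSketch(nn.Module):
    """Minimal sketch of ViT patch embedding: a patch_size-strided conv
    turns (B, 3, H, W) into (B, num_patches, embed_dim)."""

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        self.num_patches = (img_size // patch_size) ** 2
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)                     # (B, D, H/ps, W/ps)
        return x.flatten(2).transpose(1, 2)  # (B, N, D)

tokens = PatchEmbedSketch()(torch.randn(2, 3, 224, 224))
assert tokens.shape == (2, 196, 768)
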
FILE: lavis/models/timesformer/vit_utils.py
function _no_grad_trunc_normal_ (line 31) | def _no_grad_trunc_normal_(tensor, mean, std, a, b):
function trunc_normal_ (line 67) | def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
function _ntuple (line 88) | def _ntuple(n):
function get_padding (line 100) | def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **...
function get_padding_value (line 105) | def get_padding_value(padding, kernel_size, **kwargs):
function get_same_padding (line 129) | def get_same_padding(x: int, k: int, s: int, d: int):
function is_static_pad (line 134) | def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, ...
function pad_same (line 140) | def pad_same(x, k, s, d=(1, 1), value=0):
function adaptive_pool_feat_mult (line 154) | def adaptive_pool_feat_mult(pool_type="avg"):
function drop_path (line 161) | def drop_path(x, drop_prob: float = 0.0, training: bool = False):
class DropPath (line 181) | class DropPath(nn.Module):
method __init__ (line 184) | def __init__(self, drop_prob=None):
method forward (line 188) | def forward(self, x):
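
drop_path/DropPath implement stochastic depth: during training, a residual branch is zeroed per sample with probability drop_prob and the survivors are rescaled. A minimal sketch of the technique, matching the common timm-style formulation (treat it as illustrative):

import torch

def drop_path_sketch(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """Stochastic depth sketch: drop whole residual branches per sample."""
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    # one Bernoulli draw per sample, broadcast over the remaining dims
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = torch.empty(shape, dtype=x.dtype, device=x.device).bernoulli_(keep_prob)
    return x / keep_prob * mask

out = drop_path_sketch(torch.ones(4, 3), drop_prob=0.5, training=True)
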
FILE: lavis/models/topk.py
class PerturbedTopK (line 18) | class PerturbedTopK(nn.Module):
method __init__ (line 19) | def __init__(self, k: int, num_samples: int = 1000):
method __call__ (line 24) | def __call__(self, x, sigma):
class PerturbedTopKFunction (line 28) | class PerturbedTopKFunction(torch.autograd.Function):
method forward (line 30) | def forward(ctx, x, k: int, num_samples: int = 1000, sigma: float = 0....
method backward (line 59) | def backward(ctx, grad_output):
function HardTopK (line 78) | def HardTopK(k, x):
function batched_index_select (line 85) | def batched_index_select(input, dim, index):
function extract_frames_from_indices (line 95) | def extract_frames_from_indices(x, indices):
function extract_frames_from_indicators (line 104) | def extract_frames_from_indicators(x, indicators):
class ModalityEmbeddingsID (line 111) | class ModalityEmbeddingsID(IntEnum):
class ModalityEmbeddings (line 118) | class ModalityEmbeddings(nn.Module):
method __init__ (line 122) | def __init__(self,
method forward (line 142) | def forward(self, x, num_frame):
class ATPConfig (line 168) | class ATPConfig:
method default_args (line 188) | def default_args(cls):
method from_args (line 204) | def from_args(cls, args):
class ATPEncoder (line 217) | class ATPEncoder(nn.Module):
method __init__ (line 223) | def __init__(self, config: ATPConfig):
method forward (line 249) | def forward(self, x_inputs: torch.tensor, vis_L):
class TopK_Selector (line 262) | class TopK_Selector(nn.Module):
method __init__ (line 269) | def __init__(self, config=ATPConfig, num_select=4):
method forward (line 285) | def forward(self,
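
lavis/models/topk.py is the differentiable frame selector: PerturbedTopKFunction averages hard top-k indicator matrices over Gaussian perturbations of the scores (Berthet et al.-style perturbed optimizers), while HardTopK and the extract_frames_* helpers handle the discrete path. A forward-only sketch; the custom backward of PerturbedTopKFunction is omitted, and the einsum for extract_frames_from_indicators is an assumption:

import torch

def perturbed_topk_indicators(x: torch.Tensor, k: int, num_samples: int = 100,
                              sigma: float = 0.05) -> torch.Tensor:
    """Average one-hot indicators of hard top-k over Gaussian perturbations
    of the scores. x: (B, N) scores -> (B, k, N) soft indicators."""
    b, n = x.shape
    noise = torch.randn(num_samples, b, n, device=x.device, dtype=x.dtype)
    perturbed = x.unsqueeze(0) + sigma * noise                       # (S, B, N)
    topk = perturbed.topk(k, dim=-1, sorted=True).indices            # (S, B, k)
    indicators = torch.nn.functional.one_hot(topk, n).to(x.dtype)    # (S, B, k, N)
    return indicators.mean(dim=0)                                    # (B, k, N)

ind = perturbed_topk_indicators(torch.randn(2, 10), k=4)
# plausible soft frame selection, as extract_frames_from_indicators would do:
frames = torch.einsum("bkn,bnd->bkd", ind, torch.randn(2, 10, 512))  # (2, 4, 512)
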
FILE: lavis/models/vit.py
class Mlp (line 26) | class Mlp(nn.Module):
method __init__ (line 29) | def __init__(
method forward (line 45) | def forward(self, x):
class Attention (line 54) | class Attention(nn.Module):
method __init__ (line 55) | def __init__(
method save_attn_gradients (line 76) | def save_attn_gradients(self, attn_gradients):
method get_attn_gradients (line 79) | def get_attn_gradients(self):
method save_attention_map (line 82) | def save_attention_map(self, attention_map):
method get_attention_map (line 85) | def get_attention_map(self):
method forward (line 88) | def forward(self, x, register_hook=False):
class Block (line 115) | class Block(nn.Module):
method __init__ (line 116) | def __init__(
method forward (line 155) | def forward(self, x, register_hook=False):
class VisionTransformer (line 161) | class VisionTransformer(nn.Module):
method __init__ (line 167) | def __init__(
method _init_weights (line 252) | def _init_weights(self, m):
method no_weight_decay (line 262) | def no_weight_decay(self):
method forward (line 265) | def forward(self, x, register_blk=-1):
method load_pretrained (line 284) | def load_pretrained(self, checkpoint_path, prefix=""):
function _load_weights (line 289) | def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix...
function resize_pos_embed (line 402) | def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()):
function interpolate_pos_embed (line 426) | def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
class VisionTransformerEncoder (line 458) | class VisionTransformerEncoder(VisionTransformer, BaseEncoder):
method from_config (line 460) | def from_config(cls, cfg, from_pretrained=False):
method forward_features (line 526) | def forward_features(self, x, register_blk=-1):
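
resize_pos_embed/interpolate_pos_embed let a ViT trained at one resolution load checkpoints at another by resizing the patch-grid portion of the position embedding while keeping the class token. A minimal sketch, assuming a square grid and bicubic interpolation (illustrative, not the repo's exact code):

import torch
import torch.nn.functional as F

def interpolate_pos_embed_sketch(pos_embed: torch.Tensor, new_grid: int,
                                 num_extra_tokens: int = 1) -> torch.Tensor:
    """Resize the patch-grid part of a (1, extra+N, D) position embedding
    to a new_grid x new_grid layout, leaving the extra tokens unchanged."""
    extra = pos_embed[:, :num_extra_tokens]
    grid = pos_embed[:, num_extra_tokens:]
    old_grid = int(grid.shape[1] ** 0.5)
    d = grid.shape[-1]
    grid = grid.reshape(1, old_grid, old_grid, d).permute(0, 3, 1, 2)
    grid = F.interpolate(grid, size=(new_grid, new_grid), mode="bicubic", align_corners=False)
    grid = grid.permute(0, 2, 3, 1).reshape(1, new_grid * new_grid, d)
    return torch.cat([extra, grid], dim=1)

resized = interpolate_pos_embed_sketch(torch.randn(1, 1 + 14 * 14, 768), new_grid=24)
assert resized.shape == (1, 1 + 24 * 24, 768)
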
FILE: lavis/processors/__init__.py
function load_processor (line 45) | def load_processor(name, cfg=None):
FILE: lavis/processors/alpro_processors.py
class AlproVideoBaseProcessor (line 21) | class AlproVideoBaseProcessor(BaseProcessor):
method __init__ (line 22) | def __init__(self, mean=None, std=None, n_frms=MAX_INT):
class ToUint8 (line 33) | class ToUint8(object):
method __init__ (line 34) | def __init__(self):
method __call__ (line 37) | def __call__(self, tensor):
method __repr__ (line 40) | def __repr__(self):
class ToTHWC (line 44) | class ToTHWC(object):
method __init__ (line 52) | def __init__(self):
method __call__ (line 55) | def __call__(self, tensor):
method __repr__ (line 58) | def __repr__(self):
class ResizeVideo (line 62) | class ResizeVideo(object):
method __init__ (line 63) | def __init__(self, target_size, interpolation_mode="bilinear"):
method __call__ (line 67) | def __call__(self, clip):
method __repr__ (line 77) | def __repr__(self):
class AlproVideoTrainProcessor (line 82) | class AlproVideoTrainProcessor(AlproVideoBaseProcessor):
method __init__ (line 83) | def __init__(
method __call__ (line 128) | def __call__(self, vpath):
method from_config (line 146) | def from_config(cls, cfg=None):
class AlproVideoEvalProcessor (line 171) | class AlproVideoEvalProcessor(AlproVideoBaseProcessor):
method __init__ (line 172) | def __init__(self, image_size=256, mean=None, std=None, n_frms=MAX_INT):
method __call__ (line 188) | def __call__(self, vpath):
method from_config (line 205) | def from_config(cls, cfg=None):
FILE: lavis/processors/base_processor.py
class BaseProcessor (line 11) | class BaseProcessor:
method __init__ (line 12) | def __init__(self):
method __call__ (line 16) | def __call__(self, item):
method from_config (line 20) | def from_config(cls, cfg=None):
method build (line 23) | def build(self, **kwargs):
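
BaseProcessor defines the shared contract: a processor is a callable constructed via from_config and applied per item. A hypothetical subclass showing the pattern (LowercaseProcessor and its max_words option are invented for illustration, not registered LAVIS processors):

class LowercaseProcessor:
    """Minimal sketch of the processor pattern: callable, built from config."""

    def __init__(self, max_words=50):
        self.max_words = max_words

    def __call__(self, item):
        return " ".join(str(item).lower().split()[: self.max_words])

    @classmethod
    def from_config(cls, cfg=None):
        cfg = cfg or {}
        return cls(max_words=cfg.get("max_words", 50))

proc = LowercaseProcessor.from_config({"max_words": 3})
assert proc("One TWO Three four") == "one two three"
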
FILE: lavis/processors/blip_processors.py
class ToUint8 (line 21) | class ToUint8(object):
method __init__ (line 22) | def __init__(self):
method __call__ (line 25) | def __call__(self, tensor):
method __repr__ (line 28) | def __repr__(self):
class ToTHWC (line 32) | class ToTHWC(object):
method __init__ (line 40) | def __init__(self):
method __call__ (line 43) | def __call__(self, tensor):
method __repr__ (line 46) | def __repr__(self):
class BlipImageBaseProcessor (line 49) | class BlipImageBaseProcessor(BaseProcessor):
method __init__ (line 50) | def __init__(self, mean=None, std=None):
class BlipVideoBaseProcessor (line 58) | class BlipVideoBaseProcessor(BaseProcessor):
method __init__ (line 59) | def __init__(self, mean=None, std=None, n_frms=MAX_INT):
class BlipCaptionProcessor (line 70) | class BlipCaptionProcessor(BaseProcessor):
method __init__ (line 71) | def __init__(self, prompt="", max_words=50):
method __call__ (line 75) | def __call__(self, caption):
method from_config (line 81) | def from_config(cls, cfg=None):
method pre_caption (line 90) | def pre_caption(self, caption):
class BlipQuestionProcessor (line 112) | class BlipQuestionProcessor(BaseProcessor):
method __init__ (line 113) | def __init__(self, max_words=50):
method __call__ (line 116) | def __call__(self, question):
method from_config (line 120) | def from_config(cls, cfg=None):
method pre_question (line 128) | def pre_question(self, question):
class BlipImageTrainProcessor (line 146) | class BlipImageTrainProcessor(BlipImageBaseProcessor):
method __init__ (line 147) | def __init__(
method __call__ (line 182) | def __call__(self, item):
method from_config (line 186) | def from_config(cls, cfg=None):
class BlipImageEvalProcessor (line 208) | class BlipImageEvalProcessor(BlipImageBaseProcessor):
method __init__ (line 209) | def __init__(self, image_size=384, mean=None, std=None):
method __call__ (line 222) | def __call__(self, item):
method from_config (line 226) | def from_config(cls, cfg=None):
class Blip2ImageTrainProcessor (line 239) | class Blip2ImageTrainProcessor(BlipImageBaseProcessor):
method __init__ (line 240) | def __init__(
method __call__ (line 258) | def __call__(self, item):
method from_config (line 262) | def from_config(cls, cfg=None):
class Blip2VideoTrainProcessor (line 283) | class Blip2VideoTrainProcessor(BlipVideoBaseProcessor):
method __init__ (line 284) | def __init__(
method __call__ (line 312) | def __call__(self, vpath, clip_proposal=None):
method from_config (line 326) | def from_config(cls, cfg=None):
class BlipVideoEvalProcessor (line 350) | class BlipVideoEvalProcessor(BlipVideoBaseProcessor):
method __init__ (line 351) | def __init__(self, image_size=384, mean=None, std=None, n_frms=MAX_INT):
method __call__ (line 365) | def __call__(self, vpath, clip_proposal=None):
method from_config (line 378) | def from_config(cls, cfg=None):
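
pre_caption/pre_question normalize raw text before tokenization: lowercase, strip punctuation, collapse whitespace, and truncate to max_words. A minimal sketch of that cleanup, with the regexes assumed rather than copied from the repo:

import re

def pre_caption_sketch(caption: str, max_words: int = 50) -> str:
    """Lowercase, drop punctuation, collapse whitespace, truncate. Sketch only."""
    caption = re.sub(r'[.!"()*#:;~]', " ", caption.lower())
    caption = re.sub(r"\s{2,}", " ", caption).strip()
    words = caption.split(" ")
    if len(words) > max_words:
        caption = " ".join(words[:max_words])
    return caption

assert pre_caption_sketch("A photo: of a DOG!! running.") == "a photo of a dog running"
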
FILE: lavis/processors/clip_processors.py
function _convert_to_rgb (line 15) | def _convert_to_rgb(image):
class ClipImageTrainProcessor (line 20) | class ClipImageTrainProcessor(BlipImageBaseProcessor):
method __init__ (line 21) | def __init__(
method from_config (line 41) | def from_config(cls, cfg=None):
class ClipImageEvalProcessor (line 63) | class ClipImageEvalProcessor(BlipImageBaseProcessor):
method __init__ (line 64) | def __init__(self, image_size=224, mean=None, std=None):
method from_config (line 79) | def from_config(cls, cfg=None):
FILE: lavis/processors/functional_video.py
function _is_tensor_video_clip (line 13) | def _is_tensor_video_clip(clip):
function crop (line 23) | def crop(clip, i, j, h, w):
function resize (line 33) | def resize(clip, target_size, interpolation_mode):
function resized_crop (line 43) | def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
function center_crop (line 63) | def center_crop(clip, crop_size):
function to_tensor (line 76) | def to_tensor(clip):
function normalize (line 93) | def normalize(clip, mean, std, inplace=False):
function hflip (line 112) | def hflip(clip):
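
The functional_video helpers operate on tensor clips rather than single images. A minimal sketch of per-channel normalization, assuming the (C, T, H, W) layout produced by ToTensorVideo-style transforms (values and layout illustrative):

import torch

def normalize_clip_sketch(clip: torch.Tensor, mean, std) -> torch.Tensor:
    """Per-channel normalization of a float (C, T, H, W) clip. Sketch only."""
    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
    return (clip - mean[:, None, None, None]) / std[:, None, None, None]

clip = torch.rand(3, 8, 224, 224)
out = normalize_clip_sketch(clip, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
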
FILE: lavis/processors/gpt_processors.py
class GPTVideoFeatureBaseProcessor (line 39) | class GPTVideoFeatureBaseProcessor(BaseProcessor):
method __init__ (line 40) | def __init__(self, visual_ft=["i3d_rgb"], audio_ft=["vggish"]):
class GPTDialogueProcessor (line 46) | class GPTDialogueProcessor(BaseProcessor):
method __init__ (line 47) | def __init__(self, max_turns=3, use_caption=True):
method sample_sequence (line 53) | def sample_sequence(self, caption, history, answer):
method padding (line 77) | def padding(self, seq, pad_token=-1):
method get_attention_mask (line 85) | def get_attention_mask(self, seq, pad_token=-1):
method __call__ (line 90) | def __call__(self, ann):
method from_config (line 111) | def from_config(cls, cfg=None):
class GPTVideoFeatureProcessor (line 122) | class GPTVideoFeatureProcessor(GPTVideoFeatureBaseProcessor):
method __init__ (line 123) | def __init__(self, visual_ft, audio_ft):
method padding (line 128) | def padding(self, seq):
method get_attention_mask (line 134) | def get_attention_mask(self, seq):
method __call__ (line 137) | def __call__(self, ft_root, vname):
method from_config (line 164) | def from_config(cls, cfg=None):
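
The padding/get_attention_mask pair above implements the usual batching recipe for variable-length token sequences. A minimal sketch of that pattern (the pad_token value and the combined helper are illustrative, not the processors' exact API):

import torch

def pad_batch_sketch(seqs, pad_token=0):
    """Right-pad sequences to the longest length and mark real tokens with 1."""
    max_len = max(len(s) for s in seqs)
    padded = torch.full((len(seqs), max_len), pad_token, dtype=torch.long)
    mask = torch.zeros(len(seqs), max_len, dtype=torch.long)
    for i, s in enumerate(seqs):
        padded[i, : len(s)] = torch.tensor(s, dtype=torch.long)
        mask[i, : len(s)] = 1
    return padded, mask

batch, attn = pad_batch_sketch([[5, 6, 7], [8, 9]], pad_token=0)
# batch: [[5, 6, 7], [8, 9, 0]]   attn: [[1, 1, 1], [1, 1, 0]]
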
FILE: lavis/processors/randaugment.py
function identity_func (line 15) | def identity_func(img):
function autocontrast_func (line 19) | def autocontrast_func(img, cutoff=0):
function equalize_func (line 52) | def equalize_func(img):
function rotate_func (line 76) | def rotate_func(img, degree, fill=(0, 0, 0)):
function solarize_func (line 87) | def solarize_func(img, thresh=128):
function color_func (line 97) | def color_func(img, factor):
function contrast_func (line 115) | def contrast_func(img, factor):
function brightness_func (line 129) | def brightness_func(img, factor):
function sharpness_func (line 138) | def sharpness_func(img, factor):
function shear_x_func (line 159) | def shear_x_func(img, factor, fill=(0, 0, 0)):
function translate_x_func (line 168) | def translate_x_func(img, offset, fill=(0, 0, 0)):
function translate_y_func (line 180) | def translate_y_func(img, offset, fill=(0, 0, 0)):
function posterize_func (line 192) | def posterize_func(img, bits):
function shear_y_func (line 200) | def shear_y_func(img, factor, fill=(0, 0, 0)):
function cutout_func (line 209) | def cutout_func(img, pad_size, replace=(0, 0, 0)):
function enhance_level_to_args (line 223) | def enhance_level_to_args(MAX_LEVEL):
function shear_level_to_args (line 230) | def shear_level_to_args(MAX_LEVEL, replace_value):
function translate_level_to_args (line 240) | def translate_level_to_args(translate_const, MAX_LEVEL, replace_value):
function cutout_level_to_args (line 250) | def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value):
function solarize_level_to_args (line 258) | def solarize_level_to_args(MAX_LEVEL):
function none_level_to_args (line 266) | def none_level_to_args(level):
function posterize_level_to_args (line 270) | def posterize_level_to_args(MAX_LEVEL):
function rotate_level_to_args (line 278) | def rotate_level_to_args(MAX_LEVEL, replace_value):
class RandomAugment (line 326) | class RandomAugment(object):
method __init__ (line 327) | def __init__(self, N=2, M=10, isPIL=False, augs=[]):
method get_random_ops (line 336) | def get_random_ops(self):
method __call__ (line 340) | def __call__(self, img):
class VideoRandomAugment (line 352) | class VideoRandomAugment(object):
method __init__ (line 353) | def __init__(self, N=2, M=10, p=0.0, tensor_in_tensor_out=True, augs=[]):
method get_random_ops (line 363) | def get_random_ops(self):
method __call__ (line 367) | def __call__(self, frames):
method _aug (line 386) | def _aug(self, img, ops, apply_or_not):
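
RandomAugment samples N ops at magnitude M from the *_func pool above. A usage sketch against the constructor signature shown in the outline; the op-name strings are inferred from the helper names and should be treated as assumptions:

import numpy as np
from lavis.processors.randaugment import RandomAugment

# op names assumed from identity_func, autocontrast_func, brightness_func, ...
aug = RandomAugment(N=2, M=5, isPIL=False,
                    augs=["Identity", "AutoContrast", "Brightness", "Sharpness", "Equalize"])
img = (np.random.rand(224, 224, 3) * 255).astype(np.uint8)  # HWC uint8 array
out = aug(img)  # applies the sampled ops (possibly stochastically) at magnitude M
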
FILE: lavis/processors/transforms_video.py
class RandomCropVideo (line 31) | class RandomCropVideo(RandomCrop):
method __init__ (line 32) | def __init__(self, size):
method __call__ (line 38) | def __call__(self, clip):
method __repr__ (line 49) | def __repr__(self) -> str:
class RandomResizedCropVideo (line 53) | class RandomResizedCropVideo(RandomResizedCrop):
method __init__ (line 54) | def __init__(
method __call__ (line 74) | def __call__(self, clip):
method __repr__ (line 85) | def __repr__(self) -> str:
class CenterCropVideo (line 89) | class CenterCropVideo:
method __init__ (line 90) | def __init__(self, crop_size):
method __call__ (line 96) | def __call__(self, clip):
method __repr__ (line 106) | def __repr__(self) -> str:
class NormalizeVideo (line 110) | class NormalizeVideo:
method __init__ (line 119) | def __init__(self, mean, std, inplace=False):
method __call__ (line 124) | def __call__(self, clip):
method __repr__ (line 131) | def __repr__(self) -> str:
class ToTensorVideo (line 135) | class ToTensorVideo:
method __init__ (line 141) | def __init__(self):
method __call__ (line 144) | def __call__(self, clip):
method __repr__ (line 153) | def __repr__(self) -> str:
class RandomHorizontalFlipVideo (line 157) | class RandomHorizontalFlipVideo:
method __init__ (line 164) | def __init__(self, p=0.5):
method __call__ (line 167) | def __call__(self, clip):
method __repr__ (line 178) | def __repr__(self) -> str:
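
The *Video transforms compose like torchvision image transforms but act on whole clips. A sketch of a typical training pipeline over a uint8 (T, H, W, C) clip; the class names come from the outline above, while the parameter values are illustrative and non-size constructor arguments are left at their defaults:

import torch
from torchvision.transforms import Compose
from lavis.processors.transforms_video import (
    ToTensorVideo, RandomResizedCropVideo, RandomHorizontalFlipVideo, NormalizeVideo)

transform = Compose([
    ToTensorVideo(),                     # uint8 (T, H, W, C) -> float (C, T, H, W) in [0, 1]
    RandomResizedCropVideo(224),
    RandomHorizontalFlipVideo(p=0.5),
    NormalizeVideo(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])
clip = torch.randint(0, 256, (8, 256, 320, 3), dtype=torch.uint8)
out = transform(clip)                    # (3, 8, 224, 224)
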
FILE: lavis/runners/runner_base.py
class RunnerBase (line 39) | class RunnerBase:
method __init__ (line 47) | def __init__(self, cfg, task, model, datasets, job_id):
method device (line 69) | def device(self):
method use_distributed (line 76) | def use_distributed(self):
method model (line 80) | def model(self):
method optimizer (line 102) | def optimizer(self):
method scaler (line 134) | def scaler(self):
method lr_scheduler (line 144) | def lr_scheduler(self):
method dataloaders (line 176) | def dataloaders(self) -> dict:
method cuda_enabled (line 275) | def cuda_enabled(self):
method max_epoch (line 279) | def max_epoch(self):
method log_freq (line 283) | def log_freq(self):
method init_lr (line 288) | def init_lr(self):
method min_lr (line 292) | def min_lr(self):
method accum_grad_iters (line 296) | def accum_grad_iters(self):
method valid_splits (line 300) | def valid_splits(self):
method test_splits (line 309) | def test_splits(self):
method train_splits (line 315) | def train_splits(self):
method evaluate_only (line 324) | def evaluate_only(self):
method use_dist_eval_sampler (line 331) | def use_dist_eval_sampler(self):
method resume_ckpt_path (line 335) | def resume_ckpt_path(self):
method train_loader (line 339) | def train_loader(self):
method setup_output_dir (line 344) | def setup_output_dir(self):
method train (line 359) | def train(self):
method evaluate (line 418) | def evaluate(self, cur_epoch="best", skip_reload=False):
method train_epoch (line 429) | def train_epoch(self, epoch):
method eval_epoch (line 446) | def eval_epoch(self, split_name, cur_epoch, skip_reload=False):
method unwrap_dist_model (line 480) | def unwrap_dist_model(self, model):
method create_loaders (line 486) | def create_loaders(
method _save_checkpoint (line 568) | def _save_checkpoint(self, cur_epoch, is_best=False):
method _reload_best_model (line 597) | def _reload_best_model(self, model):
method _load_checkpoint (line 617) | def _load_checkpoint(self, url_or_filename):
method log_stats (line 642) | def log_stats(self, stats, split_name):
method log_config (line 651) | def log_config(self):
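
RunnerBase wires config, task, model, and dataloaders into an epoch-driven loop. A hypothetical skeleton of the control flow suggested by the method names above (the "agg_metrics" key and the best-model bookkeeping are assumptions, not the repo's exact logic):

def train_skeleton(runner, max_epoch):
    """Schematic only: train, validate, keep the best checkpoint."""
    best_metric = float("-inf")
    for epoch in range(max_epoch):
        train_stats = runner.train_epoch(epoch)
        runner.log_stats(train_stats, split_name="train")
        for split in runner.valid_splits:
            val_log = runner.eval_epoch(split_name=split, cur_epoch=epoch)
            if val_log is not None and val_log.get("agg_metrics", 0) > best_metric:
                best_metric = val_log["agg_metrics"]
                runner._save_checkpoint(epoch, is_best=True)
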
FILE: lavis/runners/runner_iter.py
class RunnerIter (line 25) | class RunnerIter(RunnerBase):
method __init__ (line 41) | def __init__(self, cfg, task, model, datasets, job_id):
method max_epoch (line 57) | def max_epoch(self):
method cur_epoch (line 61) | def cur_epoch(self):
method _progress (line 68) | def _progress(self, cur_iters):
method train (line 71) | def train(self):
method train_iters (line 137) | def train_iters(self, epoch, start_iters):
method _save_checkpoint (line 156) | def _save_checkpoint(self, cur_iters, is_best=False):
method _load_checkpoint (line 171) | def _load_checkpoint(self, url_or_filename):
method dataloaders (line 196) | def dataloaders(self) -> dict:
FILE: lavis/tasks/__init__.py
function setup_task (line 21) | def setup_task(cfg):
FILE: lavis/tasks/base_task.py
class BaseTask (line 19) | class BaseTask:
method __init__ (line 20) | def __init__(self, **kwargs):
method setup_task (line 26) | def setup_task(cls, **kwargs):
method build_model (line 29) | def build_model(self, cfg):
method build_datasets (line 35) | def build_datasets(self, cfg):
method train_step (line 61) | def train_step(self, model, samples):
method valid_step (line 65) | def valid_step(self, model, samples):
method before_evaluation (line 68) | def before_evaluation(self, model, dataset, **kwargs):
method after_evaluation (line 71) | def after_evaluation(self, **kwargs):
method inference_step (line 74) | def inference_step(self):
method evaluation (line 77) | def evaluation(self, model, data_loader, cuda_enabled=True):
method train_epoch (line 97) | def train_epoch(
method train_iters (line 122) | def train_iters(
method _train_inner_loop (line 150) | def _train_inner_loop(
method save_result (line 244) | def save_result(result, result_dir, filename, remove_duplicate=""):
FILE: lavis/tasks/captioning.py
class CaptionTask (line 17) | class CaptionTask(BaseTask):
method __init__ (line 18) | def __init__(self, num_beams, max_len, min_len, evaluate, report_metri...
method setup_task (line 29) | def setup_task(cls, cfg):
method valid_step (line 47) | def valid_step(self, model, samples):
method after_evaluation (line 65) | def after_evaluation(self, val_result, split_name, epoch, **kwargs):
method _report_metrics (line 83) | def _report_metrics(self, eval_result_file, split_name):
function coco_caption_eval (line 109) | def coco_caption_eval(coco_gt_root, results_file, split):
FILE: lavis/tasks/dialogue.py
class DialogueTask (line 21) | class DialogueTask(BaseTask):
method __init__ (line 22) | def __init__(self, num_beams, max_len, min_len, evaluate, report_metri...
method setup_task (line 33) | def setup_task(cls, cfg):
method valid_step (line 51) | def valid_step(self, model, samples):
method after_evaluation (line 57) | def after_evaluation(self, val_result, split_name, epoch, **kwargs):
method _report_metrics (line 68) | def _report_metrics(self, eval_result_file, split_name):
function coco_dialogue_eval (line 93) | def coco_dialogue_eval(coco_gt_root, results_file, split):
FILE: lavis/tasks/image_text_pretrain.py
class ImageTextPretrainTask (line 13) | class ImageTextPretrainTask(BaseTask):
method __init__ (line 14) | def __init__(self):
method evaluation (line 17) | def evaluation(self, model, data_loader, cuda_enabled=True):
FILE: lavis/tasks/multimodal_classification.py
class MultimodalClassificationTask (line 20) | class MultimodalClassificationTask(BaseTask):
method __init__ (line 21) | def __init__(self):
method valid_step (line 24) | def valid_step(self, model, samples):
method after_evaluation (line 51) | def after_evaluation(self, val_result, split_name, epoch, **kwargs):
method _report_metrics (line 66) | def _report_metrics(self, eval_result_file, split_name):
FILE: lavis/tasks/retrieval.py
class RetrievalTask (line 20) | class RetrievalTask(BaseTask):
method __init__ (line 21) | def __init__(self, cfg):
method setup_task (line 27) | def setup_task(cls, cfg):
method evaluation (line 32) | def evaluation(self, model, data_loader, **kwargs):
method after_evaluation (line 49) | def after_evaluation(self, val_result, **kwargs):
method _report_metrics (line 54) | def _report_metrics(scores_i2t, scores_t2i, txt2img, img2txt):
FILE: lavis/tasks/vqa.py
class VQATask (line 23) | class VQATask(BaseTask):
method __init__ (line 24) | def __init__(
method setup_task (line 51) | def setup_task(cls, cfg):
method build_datasets (line 74) | def build_datasets(self, cfg):
method valid_step (line 100) | def valid_step(self, model, samples):
method after_evaluation (line 120) | def after_evaluation(self, val_result, split_name, **kwargs):
method _report_metrics (line 133) | def _report_metrics(self, result_file, split):
class GQATask (line 173) | class GQATask(VQATask):
method valid_step (line 174) | def valid_step(self, model, samples):
method _report_metrics (line 197) | def _report_metrics(self, result_file, split):
class AOKVQATask (line 237) | class AOKVQATask(VQATask):
method valid_step (line 238) | def valid_step(self, model, samples):
method _report_metrics (line 262) | def _report_metrics(self, result_file, split):
method _save_result_leaderboard (line 299) | def _save_result_leaderboard(self, results):
class FrameQA (line 320) | class FrameQA(BaseTask):
method __init__ (line 321) | def __init__(self):
method valid_step (line 325) | def valid_step(self, model, samples):
method after_evaluation (line 351) | def after_evaluation(self, val_result, split_name, epoch, **kwargs):
method _report_metrics (line 365) | def _report_metrics(self, eval_result_file, split_name):
class VideoQA (line 419) | class VideoQA(BaseTask):
method __init__ (line 420) | def __init__(self):
method valid_step (line 424) | def valid_step(self, model, samples):
method after_evaluation (line 454) | def after_evaluation(self, val_result, split_name, epoch, **kwargs):
method _report_metrics (line 468) | def _report_metrics(self, eval_result_file, split_name):
class MR (line 509) | class MR(BaseTask):
method __init__ (line 510) | def __init__(self):
method valid_step (line 514) | def valid_step(self, model, samples):
method after_evaluation (line 541) | def after_evaluation(self, val_result, split_name, epoch, **kwargs):
method _report_metrics (line 555) | def _report_metrics(self, eval_result_file, split_name):
FILE: lavis/tasks/vqa_reading_comprehension.py
class VQARCTask (line 23) | class VQARCTask(VQATask):
method __init__ (line 24) | def __init__(
method setup_task (line 39) | def setup_task(cls, cfg):
method valid_step (line 61) | def valid_step(self, model, samples):
method after_evaluation (line 93) | def after_evaluation(self, val_result, split_name, **kwargs):
method save_gradcam (line 122) | def save_gradcam(self, result, result_dir, filename, remove_duplicate=...
class GQARCTask (line 157) | class GQARCTask(VQARCTask):
method valid_step (line 158) | def valid_step(self, model, samples):
method _report_metrics (line 193) | def _report_metrics(self, result_file, split):
method _save_result_leaderboard (line 232) | def _save_result_leaderboard(self, results):
FILE: setup.py
function fetch_requirements (line 16) | def fetch_requirements(filename):
FILE: train.py
function parse_args (line 35) | def parse_args():
function setup_seeds (line 54) | def setup_seeds(config):
function get_runner_class (line 65) | def get_runner_class(cfg):
function main (line 74) | def main():
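
Putting the pieces together, train.py's main() follows the standard LAVIS entrypoint flow. A hypothetical sketch built from the setup_task and RunnerBase signatures in the outline; the Config class is assumed from lavis/common/config.py, and get_runner_class (defined in train.py) would normally pick between RunnerBase and RunnerIter:

from lavis.common.config import Config
from lavis.runners.runner_base import RunnerBase
from lavis.tasks import setup_task

def main_sketch(args, job_id="debug"):
    cfg = Config(args)            # parsed YAML config plus CLI overrides (assumed)
    task = setup_task(cfg)        # registry lookup (lavis/tasks/__init__.py)
    datasets = task.build_datasets(cfg)
    model = task.build_model(cfg)
    runner = RunnerBase(cfg=cfg, task=task, model=model,
                        datasets=datasets, job_id=job_id)  # signature from the outline
    runner.train()
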
Condensed preview — 386 files, each showing path, character count, and a content snippet (1,706K chars in full).
[
{
"path": "LICENSE.txt",
"chars": 1502,
"preview": "BSD 3-Clause License\n\nCopyright (c) 2022 Salesforce, Inc.\nAll rights reserved.\n\nRedistribution and use in source and bin"
},
{
"path": "MANIFEST.in",
"chars": 205,
"preview": "recursive-include lavis/configs *.yaml *.json\nrecursive-include lavis/projects *.yaml *.json\n\nrecursive-exclude lavis/da"
},
{
"path": "README.md",
"chars": 3649,
"preview": "# [NeurIPS 2023] Self-Chained Image-Language Model for Video Localization and Question Answering\n\n* Authors: [Shoubin Yu"
},
{
"path": "app/__init__.py",
"chars": 666,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/calculate_coco_features.py",
"chars": 2380,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/caption.py",
"chars": 2771,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/classification.py",
"chars": 8076,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/dataset_browser.py",
"chars": 7375,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/image_text_match.py",
"chars": 2825,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/main.py",
"chars": 819,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/multimodal_search.py",
"chars": 7818,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/multipage.py",
"chars": 1318,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/text_localization.py",
"chars": 3457,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/utils.py",
"chars": 2226,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app/vqa.py",
"chars": 1967,
"preview": "\"\"\"\n # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For "
},
{
"path": "app.py",
"chars": 8723,
"preview": "import gradio as gr\nimport os\nimport torch\nfrom torchvision import transforms\nfrom lavis.processors import transforms_vi"
},
{
"path": "docs/Makefile",
"chars": 638,
"preview": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the "
},
{
"path": "docs/benchmark.rst",
"chars": 14221,
"preview": "Benchmark\n############\n\nWe provide scripts for evaluating and training models on task datasets. The following benchmark "
},
{
"path": "docs/build_docs.sh",
"chars": 3319,
"preview": "#!/bin/bash\nset -euo pipefail\n\n# Change to root directory of repo\nDIRNAME=$(cd \"$( dirname \"${BASH_SOURCE[0]}\" )\" &> /de"
},
{
"path": "docs/conf.py",
"chars": 1974,
"preview": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common op"
},
{
"path": "docs/getting_started.rst",
"chars": 10300,
"preview": "Dataset Zoo\n##################\nLAVIS inherently supports a wide variety of common language-vision datasets by providing "
},
{
"path": "docs/index.rst",
"chars": 731,
"preview": ".. LAVIS documentation master file, created by\n sphinx-quickstart on Sun Jul 31 10:32:27 2022.\n You can adapt this f"
},
{
"path": "docs/intro.rst",
"chars": 7294,
"preview": "What is LAVIS?\n####################################\n\nLAVIS is a Python deep learning library for LAnguage-and-VISion res"
},
{
"path": "docs/make.bat",
"chars": 799,
"preview": "@ECHO OFF\r\n\r\npushd %~dp0\r\n\r\nREM Command file for Sphinx documentation\r\n\r\nif \"%SPHINXBUILD%\" == \"\" (\r\n\tset SPHINXBUILD=sp"
},
{
"path": "docs/requirements.txt",
"chars": 91,
"preview": "GitPython\nipykernel\nnbsphinx==0.8.7\npandoc\nsphinx\nsphinx_autodoc_typehints\nsphinx_rtd_theme"
},
{
"path": "docs/tutorial.configs.rst",
"chars": 5724,
"preview": ".. _config:\n\nTraining Models on Task Datasets (Commands and Configurations) \n###########################################"
},
{
"path": "docs/tutorial.datasets.rst",
"chars": 20543,
"preview": "Adding Datasets\n################################################\n\nThis is a tutorial on adding a new dataset using ``lav"
},
{
"path": "docs/tutorial.evaluation.rst",
"chars": 1298,
"preview": "Evaluating Pre-trained Models on Task Datasets\n###############################################\nLAVIS provides pre-traine"
},
{
"path": "docs/tutorial.models.rst",
"chars": 10799,
"preview": "Adding Models\n####################################\n\nThis is a tutorial on adding new models using ``lavis.models`` modul"
},
{
"path": "docs/tutorial.processors.rst",
"chars": 10575,
"preview": "Adding Processors\n################################################\n\nThis is a tutorial on adding new processors using ``"
},
{
"path": "docs/tutorial.rst",
"chars": 225,
"preview": "Tutorials\n==============================\n\n.. toctree::\n :maxdepth: 1\n\n tutorial.evaluation\n tutorial.training-exam"
},
{
"path": "docs/tutorial.tasks.rst",
"chars": 6989,
"preview": "Adding Tasks\n####################################\n\nThis is a tutorial on adding new machine learning tasks using ``lavis"
},
{
"path": "docs/tutorial.training-example.rst",
"chars": 7139,
"preview": "Example on Finetuning BLIP on COCO-Captioning\n################################################\n\nTo finetune BLIP model o"
},
{
"path": "evaluate.py",
"chars": 2393,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/__init__.py",
"chars": 930,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/common/config.py",
"chars": 15070,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/common/dist_utils.py",
"chars": 3614,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/common/gradcam.py",
"chars": 815,
"preview": "import numpy as np\nfrom matplotlib import pyplot as plt\nfrom scipy.ndimage import filters\nfrom skimage import transform "
},
{
"path": "lavis/common/logger.py",
"chars": 5992,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/common/optims.py",
"chars": 3374,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/common/registry.py",
"chars": 9870,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/common/utils.py",
"chars": 13798,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/common/vqa_tools/__init__.py",
"chars": 246,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/common/vqa_tools/vqa.py",
"chars": 8634,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/common/vqa_tools/vqa_eval.py",
"chars": 11016,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/configs/datasets/aokvqa/defaults.yaml",
"chars": 1630,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/avsd/defaults_dial.yaml",
"chars": 1086,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/coco/defaults_cap.yaml",
"chars": 1266,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/coco/defaults_ret.yaml",
"chars": 1193,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/coco/defaults_vqa.yaml",
"chars": 2073,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/coco/eval_vqa.yaml",
"chars": 1338,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/conceptual_caption/defaults_12m.yaml",
"chars": 693,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/conceptual_caption/defaults_3m.yaml",
"chars": 686,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/didemo/defaults_ret.yaml",
"chars": 1171,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/flickr30k/defaults.yaml",
"chars": 962,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/gqa/balanced_testdev.yaml",
"chars": 1193,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/gqa/balanced_val.yaml",
"chars": 1189,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/gqa/defaults.yaml",
"chars": 1678,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/how2qa/defaults_qa.yaml",
"chars": 716,
"preview": "datasets:\n how2qa: # name of the dataset builder\n # data_dir: ${env.data_dir}/datasets\n data_type: videos # [imag"
},
{
"path": "lavis/configs/datasets/imagenet/defaults.yaml",
"chars": 525,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/laion/defaults_2B_multi.yaml",
"chars": 442,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/msrvtt/defaults_cap.yaml",
"chars": 1058,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/msrvtt/defaults_qa.yaml",
"chars": 1244,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/msrvtt/defaults_ret.yaml",
"chars": 1100,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/msrvttmc/defaults_qa.yaml",
"chars": 787,
"preview": "datasets:\n msrvttmc: # name of the dataset builder\n # data_dir: ${env.data_dir}/datasets\n data_type: videos # [im"
},
{
"path": "lavis/configs/datasets/msvd/defaults_cap.yaml",
"chars": 1042,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/msvd/defaults_qa.yaml",
"chars": 1260,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/nextqa/defaults_qa.yaml",
"chars": 946,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/nlvr/defaults.yaml",
"chars": 1025,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/nocaps/defaults.yaml",
"chars": 918,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/okvqa/defaults.yaml",
"chars": 2028,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/qvh/defaults.yaml",
"chars": 695,
"preview": "datasets:\n qvh: # name of the dataset builder\n # data_dir: ${env.data_dir}/datasets\n data_type: videos # [images|"
},
{
"path": "lavis/configs/datasets/sbu_caption/defaults.yaml",
"chars": 856,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/snli_ve/defaults.yaml",
"chars": 1017,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/star/defaults_qa.yaml",
"chars": 721,
"preview": "datasets:\n star: # name of the dataset builder\n # data_dir: ${env.data_dir}/datasets\n data_type: videos # [images"
},
{
"path": "lavis/configs/datasets/tvqa/defaults_qa.yaml",
"chars": 724,
"preview": "datasets:\n tvqa: # name of the dataset builder\n # data_dir: ${env.data_dir}/datasets\n data_type: videos # [images"
},
{
"path": "lavis/configs/datasets/vatex/defaults_cap.yaml",
"chars": 1078,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/vg/defaults_caption.yaml",
"chars": 679,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/vg/defaults_vqa.yaml",
"chars": 665,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/datasets/vlep/defaults_qa.yaml",
"chars": 721,
"preview": "datasets:\n vlep: # name of the dataset builder\n # data_dir: ${env.data_dir}/datasets\n data_type: videos # [images"
},
{
"path": "lavis/configs/default.yaml",
"chars": 369,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/albef_classification_ve.yaml",
"chars": 979,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/albef_feature_extractor.yaml",
"chars": 739,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/albef_nlvr.yaml",
"chars": 1028,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/albef_pretrain_base.yaml",
"chars": 866,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/albef_retrieval_coco.yaml",
"chars": 1104,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/albef_retrieval_flickr.yaml",
"chars": 1104,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/albef_vqav2.yaml",
"chars": 998,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/alpro_qa_msrvtt.yaml",
"chars": 1083,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/alpro_qa_msvd.yaml",
"chars": 1080,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/alpro_retrieval_didemo.yaml",
"chars": 921,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/alpro_retrieval_msrvtt.yaml",
"chars": 1054,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/bert_config.json",
"chars": 489,
"preview": "{\n \"architectures\": [\n \"BertModel\"\n ],\n \"attention_probs_dropout_prob\": 0.1,\n \"hidden_act\": \"gelu\",\n \"hidden_dro"
},
{
"path": "lavis/configs/models/bert_config_alpro.json",
"chars": 534,
"preview": "{\n \"architectures\": [\n \"BertModel\"\n ],\n \"attention_probs_dropout_prob\": 0.1,\n \"hidden_act\": \"gelu\",\n \"hidden_dro"
},
{
"path": "lavis/configs/models/blip2/blip2_caption_flant5xl.yaml",
"chars": 1074,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml",
"chars": 1073,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml",
"chars": 1073,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip2/blip2_coco.yaml",
"chars": 957,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip2/blip2_pretrain.yaml",
"chars": 860,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml",
"chars": 955,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml",
"chars": 958,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml",
"chars": 955,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml",
"chars": 955,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_caption_base_coco.yaml",
"chars": 1004,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_caption_large_coco.yaml",
"chars": 972,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_classification_base.yaml",
"chars": 601,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_feature_extractor_base.yaml",
"chars": 710,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_itm_base.yaml",
"chars": 759,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_itm_large.yaml",
"chars": 767,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_nlvr.yaml",
"chars": 999,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_pretrain_base.yaml",
"chars": 816,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_pretrain_large.yaml",
"chars": 492,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_retrieval_coco.yaml",
"chars": 984,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_retrieval_flickr.yaml",
"chars": 1027,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_vqa_aokvqa.yaml",
"chars": 988,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_vqa_okvqa.yaml",
"chars": 987,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/blip_vqav2.yaml",
"chars": 995,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/clip/RN101-quickgelu.json",
"chars": 389,
"preview": "{\n \"embed_dim\": 512,\n \"quick_gelu\": true,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": [\n "
},
{
"path": "lavis/configs/models/clip/RN101.json",
"chars": 365,
"preview": "{\n \"embed_dim\": 512,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": [\n 3,\n 4,"
},
{
"path": "lavis/configs/models/clip/RN50-quickgelu.json",
"chars": 389,
"preview": "{\n \"embed_dim\": 1024,\n \"quick_gelu\": true,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": [\n "
},
{
"path": "lavis/configs/models/clip/RN50.json",
"chars": 365,
"preview": "{\n \"embed_dim\": 1024,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": [\n 3,\n 4"
},
{
"path": "lavis/configs/models/clip/RN50x16.json",
"chars": 366,
"preview": "{\n \"embed_dim\": 768,\n \"vision_cfg\": {\n \"image_size\": 384,\n \"layers\": [\n 6,\n 8,"
},
{
"path": "lavis/configs/models/clip/RN50x4.json",
"chars": 366,
"preview": "{\n \"embed_dim\": 640,\n \"vision_cfg\": {\n \"image_size\": 288,\n \"layers\": [\n 4,\n 6,"
},
{
"path": "lavis/configs/models/clip/ViT-B-16-plus-240.json",
"chars": 296,
"preview": "{\n \"embed_dim\": 640,\n \"vision_cfg\": {\n \"image_size\": 240,\n \"layers\": 12,\n \"width\": 896,\n "
},
{
"path": "lavis/configs/models/clip/ViT-B-16-plus.json",
"chars": 296,
"preview": "{\n \"embed_dim\": 640,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": 12,\n \"width\": 896,\n "
},
{
"path": "lavis/configs/models/clip/ViT-B-16.json",
"chars": 295,
"preview": "{\n \"embed_dim\": 512,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": 12,\n \"width\": 768,\n "
},
{
"path": "lavis/configs/models/clip/ViT-B-32-plus-256.json",
"chars": 296,
"preview": "{\n \"embed_dim\": 640,\n \"vision_cfg\": {\n \"image_size\": 256,\n \"layers\": 12,\n \"width\": 896,\n "
},
{
"path": "lavis/configs/models/clip/ViT-B-32-quickgelu.json",
"chars": 319,
"preview": "{\n \"embed_dim\": 512,\n \"quick_gelu\": true,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": 12,\n "
},
{
"path": "lavis/configs/models/clip/ViT-B-32.json",
"chars": 295,
"preview": "{\n \"embed_dim\": 512,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": 12,\n \"width\": 768,\n "
},
{
"path": "lavis/configs/models/clip/ViT-H-14.json",
"chars": 325,
"preview": "{\n \"embed_dim\": 1024,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": 32,\n \"width\": 1280,\n "
},
{
"path": "lavis/configs/models/clip/ViT-H-16.json",
"chars": 325,
"preview": "{\n \"embed_dim\": 1024,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": 32,\n \"width\": 1280,\n "
},
{
"path": "lavis/configs/models/clip/ViT-L-14-280.json",
"chars": 297,
"preview": "{\n \"embed_dim\": 768,\n \"vision_cfg\": {\n \"image_size\": 280,\n \"layers\": 24,\n \"width\": 1024,\n "
},
{
"path": "lavis/configs/models/clip/ViT-L-14-336.json",
"chars": 297,
"preview": "{\n \"embed_dim\": 768,\n \"vision_cfg\": {\n \"image_size\": 336,\n \"layers\": 24,\n \"width\": 1024,\n "
},
{
"path": "lavis/configs/models/clip/ViT-L-14.json",
"chars": 297,
"preview": "{\n \"embed_dim\": 768,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": 24,\n \"width\": 1024,\n "
},
{
"path": "lavis/configs/models/clip/ViT-L-16-320.json",
"chars": 297,
"preview": "{\n \"embed_dim\": 768,\n \"vision_cfg\": {\n \"image_size\": 320,\n \"layers\": 24,\n \"width\": 1024,\n "
},
{
"path": "lavis/configs/models/clip/ViT-L-16.json",
"chars": 297,
"preview": "{\n \"embed_dim\": 768,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": 24,\n \"width\": 1024,\n "
},
{
"path": "lavis/configs/models/clip/ViT-g-14.json",
"chars": 354,
"preview": "{\n \"embed_dim\": 1024,\n \"vision_cfg\": {\n \"image_size\": 224,\n \"layers\": 40,\n \"width\": 1408,\n "
},
{
"path": "lavis/configs/models/clip/timm-efficientnetv2_rw_s.json",
"chars": 373,
"preview": "{\n \"embed_dim\": 768,\n \"vision_cfg\": {\n \"timm_model_name\": \"efficientnetv2_rw_s\",\n \"timm_model_pretra"
},
{
"path": "lavis/configs/models/clip/timm-resnet50d.json",
"chars": 364,
"preview": "{\n \"embed_dim\": 1024,\n \"vision_cfg\": {\n \"timm_model_name\": \"resnet50d\",\n \"timm_model_pretrained\": fa"
},
{
"path": "lavis/configs/models/clip/timm-resnetaa50d.json",
"chars": 366,
"preview": "{\n \"embed_dim\": 1024,\n \"vision_cfg\": {\n \"timm_model_name\": \"resnetaa50d\",\n \"timm_model_pretrained\": "
},
{
"path": "lavis/configs/models/clip/timm-resnetblur50.json",
"chars": 367,
"preview": "{\n \"embed_dim\": 1024,\n \"vision_cfg\": {\n \"timm_model_name\": \"resnetblur50\",\n \"timm_model_pretrained\":"
},
{
"path": "lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json",
"chars": 380,
"preview": "{\n \"embed_dim\": 512,\n \"vision_cfg\": {\n \"timm_model_name\": \"swin_base_patch4_window7_224\",\n \"timm_mod"
},
{
"path": "lavis/configs/models/clip/timm-vit_base_patch16_224.json",
"chars": 372,
"preview": "{\n \"embed_dim\": 512,\n \"vision_cfg\": {\n \"timm_model_name\": \"vit_base_patch16_224\",\n \"timm_model_pretr"
},
{
"path": "lavis/configs/models/clip/timm-vit_base_patch32_224.json",
"chars": 372,
"preview": "{\n \"embed_dim\": 512,\n \"vision_cfg\": {\n \"timm_model_name\": \"vit_base_patch32_224\",\n \"timm_model_pretr"
},
{
"path": "lavis/configs/models/clip/timm-vit_small_patch16_224.json",
"chars": 373,
"preview": "{\n \"embed_dim\": 512,\n \"vision_cfg\": {\n \"timm_model_name\": \"vit_small_patch16_224\",\n \"timm_model_pret"
},
{
"path": "lavis/configs/models/clip_resnet50.yaml",
"chars": 284,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/clip_vit_base16.yaml",
"chars": 386,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/clip_vit_base32.yaml",
"chars": 1114,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/clip_vit_large14.yaml",
"chars": 1114,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/clip_vit_large14_336.yaml",
"chars": 1118,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/gpt_dialogue_base.yaml",
"chars": 892,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml",
"chars": 1441,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/med_config.json",
"chars": 489,
"preview": "{\n \"architectures\": [\n \"BertModel\"\n ],\n \"attention_probs_dropout_prob\": 0.1,\n \"hidden_act\": \"gelu\",\n \"hidden_dro"
},
{
"path": "lavis/configs/models/med_config_albef.json",
"chars": 510,
"preview": "{\n \"architectures\": [\n \"BertModel\"\n ],\n \"attention_probs_dropout_prob\": 0.1,\n \"hidden_act\": \"gelu\",\n \"hidden_dro"
},
{
"path": "lavis/configs/models/med_large_config.json",
"chars": 490,
"preview": "{\n \"architectures\": [\n \"BertModel\"\n ],\n \"attention_probs_dropout_prob\": 0.1,\n \"hidden_act\": \"gelu\",\n \"hidden_dro"
},
{
"path": "lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml",
"chars": 1465,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml",
"chars": 1470,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml",
"chars": 1474,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json",
"chars": 1482,
"preview": "{\n \"architectures\": [\n \"T5ForConditionalGeneration\"\n ],\n \"d_ff\": 16384,\n \"d_kv\": 128,\n \"d_model\": 1024,\n \"decod"
},
{
"path": "lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json",
"chars": 1451,
"preview": "{\n \"architectures\": [\n \"T5ForConditionalGeneration\"\n ],\n \"d_ff\": 3072,\n \"d_kv\": 64,\n \"d_model\": 768,\n \"decoder_"
},
{
"path": "lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json",
"chars": 1452,
"preview": "{\n \"architectures\": [\n \"T5ForConditionalGeneration\"\n ],\n \"d_ff\": 4096,\n \"d_kv\": 64,\n \"d_model\": 1024,\n \"decoder"
},
{
"path": "lavis/configs/models/sevila.yaml",
"chars": 1112,
"preview": " # Copyright (c) 2022, salesforce.com, inc.\n # All rights reserved.\n # SPDX-License-Identifier: BSD-3-Clause\n # For full"
},
{
"path": "lavis/datasets/builders/__init__.py",
"chars": 3431,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/builders/base_dataset_builder.py",
"chars": 8178,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/builders/caption_builder.py",
"chars": 1904,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/builders/classification_builder.py",
"chars": 1009,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/builders/dialogue_builder.py",
"chars": 705,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/builders/image_text_pair_builder.py",
"chars": 2320,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/builders/imagefolder_builder.py",
"chars": 20539,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/builders/retrieval_builder.py",
"chars": 1579,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/builders/video_qa_builder.py",
"chars": 2766,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/builders/vqa_builder.py",
"chars": 1955,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/data_utils.py",
"chars": 11982,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/aok_vqa_datasets.py",
"chars": 4960,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/avsd_dialogue_datasets.py",
"chars": 5679,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/base_dataset.py",
"chars": 2479,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/caption_datasets.py",
"chars": 2539,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/coco_caption_datasets.py",
"chars": 2095,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/coco_vqa_datasets.py",
"chars": 3215,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/dataloader_utils.py",
"chars": 5249,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/dialogue_datasets.py",
"chars": 4225,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/gqa_datasets.py",
"chars": 2936,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/image_text_pair_datasets.py",
"chars": 1461,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/imagefolder_dataset.py",
"chars": 1669,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/laion_dataset.py",
"chars": 2012,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/mc_video_vqa_datasets.py",
"chars": 6447,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/multimodal_classification_datasets.py",
"chars": 699,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/nlvr_datasets.py",
"chars": 2850,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/retrieval_datasets.py",
"chars": 5123,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/snli_ve_datasets.py",
"chars": 1728,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/vg_vqa_datasets.py",
"chars": 1055,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/video_caption_datasets.py",
"chars": 1993,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/video_vqa_datasets.py",
"chars": 1839,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/datasets/vqa_datasets.py",
"chars": 1384,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE",
"chars": 1289,
"preview": "// Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved.\n// Use"
},
{
"path": "lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md",
"chars": 1306,
"preview": "<!--\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full li"
},
{
"path": "lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_12m.ipynb",
"chars": 5542,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 15,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n"
},
{
"path": "lavis/datasets/download_scripts/DownloadConceptualCaptions/create_annotation_3m.ipynb",
"chars": 5533,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 15,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n"
},
{
"path": "lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc12m.py",
"chars": 6597,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
},
{
"path": "lavis/datasets/download_scripts/DownloadConceptualCaptions/download_data_cc3m.py",
"chars": 6546,
"preview": "\"\"\"\n Copyright (c) 2022, salesforce.com, inc.\n All rights reserved.\n SPDX-License-Identifier: BSD-3-Clause\n For full lic"
}
]
// ... and 186 more files (omitted from this preview)
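The file index above is a flat JSON array of {path, chars, preview} records, which makes it easy to post-process. Below is a minimal sketch, assuming the array has been saved to a file named manifest.json (the file name is an assumption, and the trailing "// ..." comment line would need to be stripped first, since JSON does not allow comments), that totals the extracted characters per top-level directory:

import json
from collections import defaultdict

# Assumption: the JSON array above was saved (comment line removed) as manifest.json.
with open("manifest.json") as f:
    entries = json.load(f)  # list of {"path": str, "chars": int, "preview": str}

# Sum extracted characters per top-level directory of the repository.
totals = defaultdict(int)
for entry in entries:
    top_level = entry["path"].split("/", 1)[0]
    totals[top_level] += entry["chars"]

for name, chars in sorted(totals.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{name:<30} {chars:>10,} chars")

Run against this manifest, a summary like this should make it obvious at a glance that most of the extracted volume sits under lavis/.

The clip/*.json previews in the index follow the open_clip-style model config layout: a top-level embed_dim plus a vision_cfg block (the full files typically also carry a text_cfg block, truncated out of the previews here). A similarly hedged sketch for inspecting one of them, assuming a local checkout of the repository:

import json

# Path taken from the manifest above; assumes the repo is checked out locally.
with open("lavis/configs/models/clip/ViT-g-14.json") as f:
    cfg = json.load(f)

print("embedding dim:", cfg["embed_dim"])             # 1024 per the preview
print("vision layers:", cfg["vision_cfg"]["layers"])  # 40 per the preview
print("vision width: ", cfg["vision_cfg"]["width"])   # 1408 per the preview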
About this extraction
This document contains the full source code of the Yui010206/SeViLA GitHub repository, extracted and formatted as plain text: 386 files (1.5 MB, approximately 394.5k tokens), plus a symbol index of 1,548 extracted functions, classes, methods, constants, and types.