gitextract__ei5npya/

├── .github/
│   └── workflows/
│       └── docs.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── app/
│   ├── __init__.py
│   ├── calculate_coco_features.py
│   ├── caption.py
│   ├── classification.py
│   ├── dataset_browser.py
│   ├── image_text_match.py
│   ├── main.py
│   ├── multimodal_search.py
│   ├── multipage.py
│   ├── text_localization.py
│   ├── utils.py
│   └── vqa.py
├── dataset_card/
│   ├── avsd_dialogue.md
│   ├── coco_caption.md
│   ├── coco_retrieval.md
│   ├── conceptual_captions.md
│   ├── didemo_retrieval.md
│   ├── flickr_retrieval.md
│   ├── gqa.md
│   ├── msrvtt_qa.md
│   ├── msrvtt_retrieval.md
│   ├── msvd_qa.md
│   ├── nlvr2.md
│   ├── nocaps.md
│   ├── sbu_caption.md
│   ├── snli_visual_entailment.md
│   └── vqav2.md
├── docs/
│   ├── Makefile
│   ├── benchmark.rst
│   ├── build_docs.sh
│   ├── conf.py
│   ├── getting_started.rst
│   ├── index.rst
│   ├── intro.rst
│   ├── make.bat
│   ├── requirements.txt
│   ├── tutorial.configs.rst
│   ├── tutorial.datasets.rst
│   ├── tutorial.evaluation.rst
│   ├── tutorial.models.rst
│   ├── tutorial.processors.rst
│   ├── tutorial.rst
│   ├── tutorial.tasks.rst
│   └── tutorial.training-example.rst
├── evaluate.py
├── examples/
│   ├── albef_feature_extraction.ipynb
│   ├── albef_vqa.ipynb
│   ├── albef_zero_shot_classification.ipynb
│   ├── blip2_feature_extraction.ipynb
│   ├── blip2_image_text_matching.ipynb
│   ├── blip2_instructed_generation.ipynb
│   ├── blip_feature_extraction.ipynb
│   ├── blip_image_captioning.ipynb
│   ├── blip_image_text_matching.ipynb
│   ├── blip_text_localization.ipynb
│   ├── blip_vqa.ipynb
│   ├── blip_zero_shot_classification.ipynb
│   ├── clip_feature_extraction.ipynb
│   └── clip_zero_shot_classification.ipynb
├── lavis/
│   ├── __init__.py
│   ├── common/
│   │   ├── annotator/
│   │   │   ├── canny/
│   │   │   │   └── __init__.py
│   │   │   ├── ckpts/
│   │   │   │   └── download.sh
│   │   │   ├── hed/
│   │   │   │   └── __init__.py
│   │   │   ├── midas/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── api.py
│   │   │   │   ├── midas/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base_model.py
│   │   │   │   │   ├── blocks.py
│   │   │   │   │   ├── dpt_depth.py
│   │   │   │   │   ├── midas_net.py
│   │   │   │   │   ├── midas_net_custom.py
│   │   │   │   │   ├── transforms.py
│   │   │   │   │   └── vit.py
│   │   │   │   └── utils.py
│   │   │   ├── mlsd/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── models/
│   │   │   │   │   ├── mbv2_mlsd_large.py
│   │   │   │   │   └── mbv2_mlsd_tiny.py
│   │   │   │   └── utils.py
│   │   │   ├── openpose/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── body.py
│   │   │   │   ├── hand.py
│   │   │   │   ├── model.py
│   │   │   │   └── util.py
│   │   │   ├── uniformer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── configs/
│   │   │   │   │   └── _base_/
│   │   │   │   │       ├── datasets/
│   │   │   │   │       │   ├── ade20k.py
│   │   │   │   │       │   ├── chase_db1.py
│   │   │   │   │       │   ├── cityscapes.py
│   │   │   │   │       │   ├── cityscapes_769x769.py
│   │   │   │   │       │   ├── drive.py
│   │   │   │   │       │   ├── hrf.py
│   │   │   │   │       │   ├── pascal_context.py
│   │   │   │   │       │   ├── pascal_context_59.py
│   │   │   │   │       │   ├── pascal_voc12.py
│   │   │   │   │       │   ├── pascal_voc12_aug.py
│   │   │   │   │       │   └── stare.py
│   │   │   │   │       ├── default_runtime.py
│   │   │   │   │       ├── models/
│   │   │   │   │       │   ├── ann_r50-d8.py
│   │   │   │   │       │   ├── apcnet_r50-d8.py
│   │   │   │   │       │   ├── ccnet_r50-d8.py
│   │   │   │   │       │   ├── cgnet.py
│   │   │   │   │       │   ├── danet_r50-d8.py
│   │   │   │   │       │   ├── deeplabv3_r50-d8.py
│   │   │   │   │       │   ├── deeplabv3_unet_s5-d16.py
│   │   │   │   │       │   ├── deeplabv3plus_r50-d8.py
│   │   │   │   │       │   ├── dmnet_r50-d8.py
│   │   │   │   │       │   ├── dnl_r50-d8.py
│   │   │   │   │       │   ├── emanet_r50-d8.py
│   │   │   │   │       │   ├── encnet_r50-d8.py
│   │   │   │   │       │   ├── fast_scnn.py
│   │   │   │   │       │   ├── fcn_hr18.py
│   │   │   │   │       │   ├── fcn_r50-d8.py
│   │   │   │   │       │   ├── fcn_unet_s5-d16.py
│   │   │   │   │       │   ├── fpn_r50.py
│   │   │   │   │       │   ├── fpn_uniformer.py
│   │   │   │   │       │   ├── gcnet_r50-d8.py
│   │   │   │   │       │   ├── lraspp_m-v3-d8.py
│   │   │   │   │       │   ├── nonlocal_r50-d8.py
│   │   │   │   │       │   ├── ocrnet_hr18.py
│   │   │   │   │       │   ├── ocrnet_r50-d8.py
│   │   │   │   │       │   ├── pointrend_r50.py
│   │   │   │   │       │   ├── psanet_r50-d8.py
│   │   │   │   │       │   ├── pspnet_r50-d8.py
│   │   │   │   │       │   ├── pspnet_unet_s5-d16.py
│   │   │   │   │       │   ├── upernet_r50.py
│   │   │   │   │       │   └── upernet_uniformer.py
│   │   │   │   │       └── schedules/
│   │   │   │   │           ├── schedule_160k.py
│   │   │   │   │           ├── schedule_20k.py
│   │   │   │   │           ├── schedule_40k.py
│   │   │   │   │           └── schedule_80k.py
│   │   │   │   ├── exp/
│   │   │   │   │   └── upernet_global_small/
│   │   │   │   │       ├── config.py
│   │   │   │   │       ├── run.sh
│   │   │   │   │       ├── test.sh
│   │   │   │   │       ├── test_config_g.py
│   │   │   │   │       ├── test_config_h32.py
│   │   │   │   │       └── test_config_w32.py
│   │   │   │   ├── mmcv/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── arraymisc/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── quantization.py
│   │   │   │   │   ├── cnn/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── alexnet.py
│   │   │   │   │   │   ├── bricks/
│   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   ├── activation.py
│   │   │   │   │   │   │   ├── context_block.py
│   │   │   │   │   │   │   ├── conv.py
│   │   │   │   │   │   │   ├── conv2d_adaptive_padding.py
│   │   │   │   │   │   │   ├── conv_module.py
│   │   │   │   │   │   │   ├── conv_ws.py
│   │   │   │   │   │   │   ├── depthwise_separable_conv_module.py
│   │   │   │   │   │   │   ├── drop.py
│   │   │   │   │   │   │   ├── generalized_attention.py
│   │   │   │   │   │   │   ├── hsigmoid.py
│   │   │   │   │   │   │   ├── hswish.py
│   │   │   │   │   │   │   ├── non_local.py
│   │   │   │   │   │   │   ├── norm.py
│   │   │   │   │   │   │   ├── padding.py
│   │   │   │   │   │   │   ├── plugin.py
│   │   │   │   │   │   │   ├── registry.py
│   │   │   │   │   │   │   ├── scale.py
│   │   │   │   │   │   │   ├── swish.py
│   │   │   │   │   │   │   ├── transformer.py
│   │   │   │   │   │   │   ├── upsample.py
│   │   │   │   │   │   │   └── wrappers.py
│   │   │   │   │   │   ├── builder.py
│   │   │   │   │   │   ├── resnet.py
│   │   │   │   │   │   ├── utils/
│   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   ├── flops_counter.py
│   │   │   │   │   │   │   ├── fuse_conv_bn.py
│   │   │   │   │   │   │   ├── sync_bn.py
│   │   │   │   │   │   │   └── weight_init.py
│   │   │   │   │   │   └── vgg.py
│   │   │   │   │   ├── engine/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── test.py
│   │   │   │   │   ├── fileio/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── file_client.py
│   │   │   │   │   │   ├── handlers/
│   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   ├── base.py
│   │   │   │   │   │   │   ├── json_handler.py
│   │   │   │   │   │   │   ├── pickle_handler.py
│   │   │   │   │   │   │   └── yaml_handler.py
│   │   │   │   │   │   ├── io.py
│   │   │   │   │   │   └── parse.py
│   │   │   │   │   ├── image/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── colorspace.py
│   │   │   │   │   │   ├── geometric.py
│   │   │   │   │   │   ├── io.py
│   │   │   │   │   │   ├── misc.py
│   │   │   │   │   │   └── photometric.py
│   │   │   │   │   ├── model_zoo/
│   │   │   │   │   │   ├── deprecated.json
│   │   │   │   │   │   ├── mmcls.json
│   │   │   │   │   │   └── open_mmlab.json
│   │   │   │   │   ├── ops/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── assign_score_withk.py
│   │   │   │   │   │   ├── ball_query.py
│   │   │   │   │   │   ├── bbox.py
│   │   │   │   │   │   ├── border_align.py
│   │   │   │   │   │   ├── box_iou_rotated.py
│   │   │   │   │   │   ├── carafe.py
│   │   │   │   │   │   ├── cc_attention.py
│   │   │   │   │   │   ├── contour_expand.py
│   │   │   │   │   │   ├── corner_pool.py
│   │   │   │   │   │   ├── correlation.py
│   │   │   │   │   │   ├── deform_conv.py
│   │   │   │   │   │   ├── deform_roi_pool.py
│   │   │   │   │   │   ├── deprecated_wrappers.py
│   │   │   │   │   │   ├── focal_loss.py
│   │   │   │   │   │   ├── furthest_point_sample.py
│   │   │   │   │   │   ├── fused_bias_leakyrelu.py
│   │   │   │   │   │   ├── gather_points.py
│   │   │   │   │   │   ├── group_points.py
│   │   │   │   │   │   ├── info.py
│   │   │   │   │   │   ├── iou3d.py
│   │   │   │   │   │   ├── knn.py
│   │   │   │   │   │   ├── masked_conv.py
│   │   │   │   │   │   ├── merge_cells.py
│   │   │   │   │   │   ├── modulated_deform_conv.py
│   │   │   │   │   │   ├── multi_scale_deform_attn.py
│   │   │   │   │   │   ├── nms.py
│   │   │   │   │   │   ├── pixel_group.py
│   │   │   │   │   │   ├── point_sample.py
│   │   │   │   │   │   ├── points_in_boxes.py
│   │   │   │   │   │   ├── points_sampler.py
│   │   │   │   │   │   ├── psa_mask.py
│   │   │   │   │   │   ├── roi_align.py
│   │   │   │   │   │   ├── roi_align_rotated.py
│   │   │   │   │   │   ├── roi_pool.py
│   │   │   │   │   │   ├── roiaware_pool3d.py
│   │   │   │   │   │   ├── roipoint_pool3d.py
│   │   │   │   │   │   ├── saconv.py
│   │   │   │   │   │   ├── scatter_points.py
│   │   │   │   │   │   ├── sync_bn.py
│   │   │   │   │   │   ├── three_interpolate.py
│   │   │   │   │   │   ├── three_nn.py
│   │   │   │   │   │   ├── tin_shift.py
│   │   │   │   │   │   ├── upfirdn2d.py
│   │   │   │   │   │   └── voxelize.py
│   │   │   │   │   ├── parallel/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── _functions.py
│   │   │   │   │   │   ├── collate.py
│   │   │   │   │   │   ├── data_container.py
│   │   │   │   │   │   ├── data_parallel.py
│   │   │   │   │   │   ├── distributed.py
│   │   │   │   │   │   ├── distributed_deprecated.py
│   │   │   │   │   │   ├── registry.py
│   │   │   │   │   │   ├── scatter_gather.py
│   │   │   │   │   │   └── utils.py
│   │   │   │   │   ├── runner/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── base_module.py
│   │   │   │   │   │   ├── base_runner.py
│   │   │   │   │   │   ├── builder.py
│   │   │   │   │   │   ├── checkpoint.py
│   │   │   │   │   │   ├── default_constructor.py
│   │   │   │   │   │   ├── dist_utils.py
│   │   │   │   │   │   ├── epoch_based_runner.py
│   │   │   │   │   │   ├── fp16_utils.py
│   │   │   │   │   │   ├── hooks/
│   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   ├── checkpoint.py
│   │   │   │   │   │   │   ├── closure.py
│   │   │   │   │   │   │   ├── ema.py
│   │   │   │   │   │   │   ├── evaluation.py
│   │   │   │   │   │   │   ├── hook.py
│   │   │   │   │   │   │   ├── iter_timer.py
│   │   │   │   │   │   │   ├── logger/
│   │   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   │   ├── base.py
│   │   │   │   │   │   │   │   ├── dvclive.py
│   │   │   │   │   │   │   │   ├── mlflow.py
│   │   │   │   │   │   │   │   ├── neptune.py
│   │   │   │   │   │   │   │   ├── pavi.py
│   │   │   │   │   │   │   │   ├── tensorboard.py
│   │   │   │   │   │   │   │   ├── text.py
│   │   │   │   │   │   │   │   └── wandb.py
│   │   │   │   │   │   │   ├── lr_updater.py
│   │   │   │   │   │   │   ├── memory.py
│   │   │   │   │   │   │   ├── momentum_updater.py
│   │   │   │   │   │   │   ├── optimizer.py
│   │   │   │   │   │   │   ├── profiler.py
│   │   │   │   │   │   │   ├── sampler_seed.py
│   │   │   │   │   │   │   └── sync_buffer.py
│   │   │   │   │   │   ├── iter_based_runner.py
│   │   │   │   │   │   ├── log_buffer.py
│   │   │   │   │   │   ├── optimizer/
│   │   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   │   ├── builder.py
│   │   │   │   │   │   │   └── default_constructor.py
│   │   │   │   │   │   ├── priority.py
│   │   │   │   │   │   └── utils.py
│   │   │   │   │   ├── utils/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── config.py
│   │   │   │   │   │   ├── env.py
│   │   │   │   │   │   ├── ext_loader.py
│   │   │   │   │   │   ├── logging.py
│   │   │   │   │   │   ├── misc.py
│   │   │   │   │   │   ├── parrots_jit.py
│   │   │   │   │   │   ├── parrots_wrapper.py
│   │   │   │   │   │   ├── path.py
│   │   │   │   │   │   ├── progressbar.py
│   │   │   │   │   │   ├── registry.py
│   │   │   │   │   │   ├── testing.py
│   │   │   │   │   │   ├── timer.py
│   │   │   │   │   │   ├── trace.py
│   │   │   │   │   │   └── version_utils.py
│   │   │   │   │   ├── version.py
│   │   │   │   │   ├── video/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── io.py
│   │   │   │   │   │   ├── optflow.py
│   │   │   │   │   │   └── processing.py
│   │   │   │   │   └── visualization/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── color.py
│   │   │   │   │       ├── image.py
│   │   │   │   │       └── optflow.py
│   │   │   │   ├── mmcv_custom/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── checkpoint.py
│   │   │   │   └── mmseg/
│   │   │   │       ├── apis/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── inference.py
│   │   │   │       │   ├── test.py
│   │   │   │       │   └── train.py
│   │   │   │       ├── core/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── evaluation/
│   │   │   │       │   │   ├── __init__.py
│   │   │   │       │   │   ├── class_names.py
│   │   │   │       │   │   ├── eval_hooks.py
│   │   │   │       │   │   └── metrics.py
│   │   │   │       │   ├── seg/
│   │   │   │       │   │   ├── __init__.py
│   │   │   │       │   │   ├── builder.py
│   │   │   │       │   │   └── sampler/
│   │   │   │       │   │       ├── __init__.py
│   │   │   │       │   │       ├── base_pixel_sampler.py
│   │   │   │       │   │       └── ohem_pixel_sampler.py
│   │   │   │       │   └── utils/
│   │   │   │       │       ├── __init__.py
│   │   │   │       │       └── misc.py
│   │   │   │       ├── datasets/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── ade.py
│   │   │   │       │   ├── builder.py
│   │   │   │       │   ├── chase_db1.py
│   │   │   │       │   ├── cityscapes.py
│   │   │   │       │   ├── custom.py
│   │   │   │       │   ├── dataset_wrappers.py
│   │   │   │       │   ├── drive.py
│   │   │   │       │   ├── hrf.py
│   │   │   │       │   ├── pascal_context.py
│   │   │   │       │   ├── pipelines/
│   │   │   │       │   │   ├── __init__.py
│   │   │   │       │   │   ├── compose.py
│   │   │   │       │   │   ├── formating.py
│   │   │   │       │   │   ├── loading.py
│   │   │   │       │   │   ├── test_time_aug.py
│   │   │   │       │   │   └── transforms.py
│   │   │   │       │   ├── stare.py
│   │   │   │       │   └── voc.py
│   │   │   │       ├── models/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── backbones/
│   │   │   │       │   │   ├── __init__.py
│   │   │   │       │   │   ├── cgnet.py
│   │   │   │       │   │   ├── fast_scnn.py
│   │   │   │       │   │   ├── hrnet.py
│   │   │   │       │   │   ├── mobilenet_v2.py
│   │   │   │       │   │   ├── mobilenet_v3.py
│   │   │   │       │   │   ├── resnest.py
│   │   │   │       │   │   ├── resnet.py
│   │   │   │       │   │   ├── resnext.py
│   │   │   │       │   │   ├── unet.py
│   │   │   │       │   │   ├── uniformer.py
│   │   │   │       │   │   └── vit.py
│   │   │   │       │   ├── builder.py
│   │   │   │       │   ├── decode_heads/
│   │   │   │       │   │   ├── __init__.py
│   │   │   │       │   │   ├── ann_head.py
│   │   │   │       │   │   ├── apc_head.py
│   │   │   │       │   │   ├── aspp_head.py
│   │   │   │       │   │   ├── cascade_decode_head.py
│   │   │   │       │   │   ├── cc_head.py
│   │   │   │       │   │   ├── da_head.py
│   │   │   │       │   │   ├── decode_head.py
│   │   │   │       │   │   ├── dm_head.py
│   │   │   │       │   │   ├── dnl_head.py
│   │   │   │       │   │   ├── ema_head.py
│   │   │   │       │   │   ├── enc_head.py
│   │   │   │       │   │   ├── fcn_head.py
│   │   │   │       │   │   ├── fpn_head.py
│   │   │   │       │   │   ├── gc_head.py
│   │   │   │       │   │   ├── lraspp_head.py
│   │   │   │       │   │   ├── nl_head.py
│   │   │   │       │   │   ├── ocr_head.py
│   │   │   │       │   │   ├── point_head.py
│   │   │   │       │   │   ├── psa_head.py
│   │   │   │       │   │   ├── psp_head.py
│   │   │   │       │   │   ├── sep_aspp_head.py
│   │   │   │       │   │   ├── sep_fcn_head.py
│   │   │   │       │   │   └── uper_head.py
│   │   │   │       │   ├── losses/
│   │   │   │       │   │   ├── __init__.py
│   │   │   │       │   │   ├── accuracy.py
│   │   │   │       │   │   ├── cross_entropy_loss.py
│   │   │   │       │   │   ├── dice_loss.py
│   │   │   │       │   │   ├── lovasz_loss.py
│   │   │   │       │   │   └── utils.py
│   │   │   │       │   ├── necks/
│   │   │   │       │   │   ├── __init__.py
│   │   │   │       │   │   ├── fpn.py
│   │   │   │       │   │   └── multilevel_neck.py
│   │   │   │       │   ├── segmentors/
│   │   │   │       │   │   ├── __init__.py
│   │   │   │       │   │   ├── base.py
│   │   │   │       │   │   ├── cascade_encoder_decoder.py
│   │   │   │       │   │   └── encoder_decoder.py
│   │   │   │       │   └── utils/
│   │   │   │       │       ├── __init__.py
│   │   │   │       │       ├── drop.py
│   │   │   │       │       ├── inverted_residual.py
│   │   │   │       │       ├── make_divisible.py
│   │   │   │       │       ├── res_layer.py
│   │   │   │       │       ├── se_layer.py
│   │   │   │       │       ├── self_attention_block.py
│   │   │   │       │       ├── up_conv_block.py
│   │   │   │       │       └── weight_init.py
│   │   │   │       ├── ops/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   ├── encoding.py
│   │   │   │       │   └── wrappers.py
│   │   │   │       └── utils/
│   │   │   │           ├── __init__.py
│   │   │   │           ├── collect_env.py
│   │   │   │           └── logger.py
│   │   │   └── util.py
│   │   ├── config.py
│   │   ├── dist_utils.py
│   │   ├── gradcam.py
│   │   ├── logger.py
│   │   ├── optims.py
│   │   ├── registry.py
│   │   ├── utils.py
│   │   └── vqa_tools/
│   │       ├── __init__.py
│   │       ├── vqa.py
│   │       └── vqa_eval.py
│   ├── configs/
│   │   ├── datasets/
│   │   │   ├── aokvqa/
│   │   │   │   ├── defaults.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── audiocaps/
│   │   │   │   ├── defaults_mm_cap.yaml
│   │   │   │   ├── defaults_mm_cap_instruct.yaml
│   │   │   │   └── defaults_mm_qa.yaml
│   │   │   ├── audioset/
│   │   │   │   ├── defaults_mm_cap.yaml
│   │   │   │   └── defaults_mm_cap_instruct.yaml
│   │   │   ├── avsd/
│   │   │   │   ├── defaults_dial.yaml
│   │   │   │   └── defaults_mm_dial_instruct.yaml
│   │   │   ├── blip_diffusion_datasets/
│   │   │   │   └── defaults.yaml
│   │   │   ├── capfilt14m/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   └── defaults_cap_instruct.yaml
│   │   │   ├── charade/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   └── defaults_cap_instruct.yaml
│   │   │   ├── clotho/
│   │   │   │   ├── defaults_mm_cap.yaml
│   │   │   │   ├── defaults_mm_cap_instruct.yaml
│   │   │   │   └── defaults_mm_qa.yaml
│   │   │   ├── coco/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   ├── defaults_cap_instruct.yaml
│   │   │   │   ├── defaults_ret.yaml
│   │   │   │   ├── defaults_vqa.yaml
│   │   │   │   ├── defaults_vqa_instruct.yaml
│   │   │   │   └── eval_vqa.yaml
│   │   │   ├── coin/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   └── defaults_cap_instruct.yaml
│   │   │   ├── conceptual_caption/
│   │   │   │   ├── defaults_12m.yaml
│   │   │   │   ├── defaults_12m_instruct.yaml
│   │   │   │   ├── defaults_3m.yaml
│   │   │   │   └── defaults_3m_instruct.yaml
│   │   │   ├── didemo/
│   │   │   │   └── defaults_ret.yaml
│   │   │   ├── discriminatory_reasoning/
│   │   │   │   ├── defaults_mm_audio_video.yaml
│   │   │   │   ├── defaults_mm_image_pc.yaml
│   │   │   │   └── discriminatory_dataset/
│   │   │   │       ├── audiocaps_discrn.json
│   │   │   │       └── objaverse_discrn.json
│   │   │   ├── esc50/
│   │   │   │   └── defaults_mm_cls.yaml
│   │   │   ├── flickr30k/
│   │   │   │   ├── defaults.yaml
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   └── defaults_cap_instruct.yaml
│   │   │   ├── gqa/
│   │   │   │   ├── balanced_testdev.yaml
│   │   │   │   ├── balanced_testdev_instruct.yaml
│   │   │   │   ├── balanced_val.yaml
│   │   │   │   ├── balanced_val_instruct.yaml
│   │   │   │   ├── defaults.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── iconqa/
│   │   │   │   ├── defaults.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── imagenet/
│   │   │   │   └── defaults.yaml
│   │   │   ├── laion/
│   │   │   │   ├── defaults_2B_multi.yaml
│   │   │   │   ├── defaults_400M.yaml
│   │   │   │   └── defaults_400M_instruct.yaml
│   │   │   ├── llava150k/
│   │   │   │   └── defaults_dial.yaml
│   │   │   ├── modelnet40/
│   │   │   │   └── defaults_cls.yaml
│   │   │   ├── msrvtt/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   ├── defaults_cap_instruct.yaml
│   │   │   │   ├── defaults_qa.yaml
│   │   │   │   ├── defaults_qa_instruct.yaml
│   │   │   │   └── defaults_ret.yaml
│   │   │   ├── msvd/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   ├── defaults_cap_instruct.yaml
│   │   │   │   ├── defaults_qa.yaml
│   │   │   │   └── defaults_qa_instruct.yaml
│   │   │   ├── music_avqa/
│   │   │   │   ├── defaults_mm_qa.yaml
│   │   │   │   └── defaults_mm_qa_instruct.yaml
│   │   │   ├── nlvr/
│   │   │   │   └── defaults.yaml
│   │   │   ├── nocaps/
│   │   │   │   └── defaults.yaml
│   │   │   ├── objaverse/
│   │   │   │   ├── defaults_mm_cap.yaml
│   │   │   │   ├── defaults_mm_cap_instruct.yaml
│   │   │   │   └── defaults_mm_qa.yaml
│   │   │   ├── ocrvqa/
│   │   │   │   ├── defaults.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── okvqa/
│   │   │   │   ├── defaults.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── sbu_caption/
│   │   │   │   ├── defaults.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── scienceqa/
│   │   │   │   ├── defaults.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── shapenet/
│   │   │   │   ├── defaults_mm_cap.yaml
│   │   │   │   └── defaults_mm_cap_instruct.yaml
│   │   │   ├── snli_ve/
│   │   │   │   ├── defaults.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── textcaps/
│   │   │   │   ├── defaults.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── valor/
│   │   │   │   ├── defaults_mm_cap.yaml
│   │   │   │   └── defaults_mm_cap_instruct.yaml
│   │   │   ├── vatex/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   └── defaults_cap_instruct.yaml
│   │   │   ├── vg/
│   │   │   │   ├── defaults_caption.yaml
│   │   │   │   ├── defaults_caption_instruct.yaml
│   │   │   │   ├── defaults_vqa.yaml
│   │   │   │   └── defaults_vqa_instruct.yaml
│   │   │   ├── violin/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   ├── defaults_cap_instruct.yaml
│   │   │   │   ├── defaults_entail.yaml
│   │   │   │   └── defaults_entail_instruct.yaml
│   │   │   ├── visdial/
│   │   │   │   ├── defaults_dial.yaml
│   │   │   │   └── defaults_dial_instruct.yaml
│   │   │   ├── vizwiz/
│   │   │   │   └── defaults.yaml
│   │   │   ├── vlep/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   └── defaults_cap_instruct.yaml
│   │   │   ├── vsr/
│   │   │   │   ├── defaults.yaml
│   │   │   │   ├── defaults_classification.yaml
│   │   │   │   ├── defaults_classification_instruct.yaml
│   │   │   │   └── defaults_instruct.yaml
│   │   │   ├── wavcaps/
│   │   │   │   ├── defaults_mm_cap.yaml
│   │   │   │   └── defaults_mm_cap_instruct.yaml
│   │   │   ├── webvid/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   └── defaults_cap_instruct.yaml
│   │   │   ├── youcook/
│   │   │   │   ├── defaults_cap.yaml
│   │   │   │   └── defaults_cap_instruct.yaml
│   │   │   └── yt8m/
│   │   │       └── defaults_mm_dial.yaml
│   │   ├── default.yaml
│   │   └── models/
│   │       ├── albef_classification_ve.yaml
│   │       ├── albef_feature_extractor.yaml
│   │       ├── albef_nlvr.yaml
│   │       ├── albef_pretrain_base.yaml
│   │       ├── albef_retrieval_coco.yaml
│   │       ├── albef_retrieval_flickr.yaml
│   │       ├── albef_vqav2.yaml
│   │       ├── alpro_qa_msrvtt.yaml
│   │       ├── alpro_qa_msvd.yaml
│   │       ├── alpro_retrieval_didemo.yaml
│   │       ├── alpro_retrieval_msrvtt.yaml
│   │       ├── bert_config.json
│   │       ├── bert_config_alpro.json
│   │       ├── blip-diffusion/
│   │       │   ├── blip_diffusion_base.yaml
│   │       │   ├── blip_diffusion_controlnet_canny.yaml
│   │       │   ├── blip_diffusion_controlnet_depth.yaml
│   │       │   └── blip_diffusion_controlnet_hed.yaml
│   │       ├── blip2/
│   │       │   ├── blip2_caption_flant5xl.yaml
│   │       │   ├── blip2_caption_opt2.7b.yaml
│   │       │   ├── blip2_caption_opt6.7b.yaml
│   │       │   ├── blip2_coco.yaml
│   │       │   ├── blip2_instruct_flant5xl.yaml
│   │       │   ├── blip2_instruct_flant5xxl.yaml
│   │       │   ├── blip2_instruct_vicuna13b.yaml
│   │       │   ├── blip2_instruct_vicuna7b.yaml
│   │       │   ├── blip2_pretrain.yaml
│   │       │   ├── blip2_pretrain_flant5xl.yaml
│   │       │   ├── blip2_pretrain_flant5xl_vitL.yaml
│   │       │   ├── blip2_pretrain_flant5xxl.yaml
│   │       │   ├── blip2_pretrain_llama7b.yaml
│   │       │   ├── blip2_pretrain_opt2.7b.yaml
│   │       │   ├── blip2_pretrain_opt6.7b.yaml
│   │       │   ├── blip2_pretrain_vitL.yaml
│   │       │   ├── blip2_xinstruct_vicuna13b.yaml
│   │       │   └── blip2_xinstruct_vicuna7b.yaml
│   │       ├── blip_caption_base_coco.yaml
│   │       ├── blip_caption_large_coco.yaml
│   │       ├── blip_classification_base.yaml
│   │       ├── blip_feature_extractor_base.yaml
│   │       ├── blip_itm_base.yaml
│   │       ├── blip_itm_large.yaml
│   │       ├── blip_nlvr.yaml
│   │       ├── blip_pretrain_base.yaml
│   │       ├── blip_pretrain_large.yaml
│   │       ├── blip_retrieval_coco.yaml
│   │       ├── blip_retrieval_flickr.yaml
│   │       ├── blip_vqa_aokvqa.yaml
│   │       ├── blip_vqa_okvqa.yaml
│   │       ├── blip_vqav2.yaml
│   │       ├── clip/
│   │       │   ├── RN101-quickgelu.json
│   │       │   ├── RN101.json
│   │       │   ├── RN50-quickgelu.json
│   │       │   ├── RN50.json
│   │       │   ├── RN50x16.json
│   │       │   ├── RN50x4.json
│   │       │   ├── ViT-B-16-plus-240.json
│   │       │   ├── ViT-B-16-plus.json
│   │       │   ├── ViT-B-16.json
│   │       │   ├── ViT-B-32-plus-256.json
│   │       │   ├── ViT-B-32-quickgelu.json
│   │       │   ├── ViT-B-32.json
│   │       │   ├── ViT-H-14.json
│   │       │   ├── ViT-H-16.json
│   │       │   ├── ViT-L-14-280.json
│   │       │   ├── ViT-L-14-336.json
│   │       │   ├── ViT-L-14.json
│   │       │   ├── ViT-L-16-320.json
│   │       │   ├── ViT-L-16.json
│   │       │   ├── ViT-g-14.json
│   │       │   ├── timm-efficientnetv2_rw_s.json
│   │       │   ├── timm-resnet50d.json
│   │       │   ├── timm-resnetaa50d.json
│   │       │   ├── timm-resnetblur50.json
│   │       │   ├── timm-swin_base_patch4_window7_224.json
│   │       │   ├── timm-vit_base_patch16_224.json
│   │       │   ├── timm-vit_base_patch32_224.json
│   │       │   └── timm-vit_small_patch16_224.json
│   │       ├── clip_resnet50.yaml
│   │       ├── clip_vit_base16.yaml
│   │       ├── clip_vit_base32.yaml
│   │       ├── clip_vit_large14.yaml
│   │       ├── clip_vit_large14_336.yaml
│   │       ├── gpt_dialogue_base.yaml
│   │       ├── img2prompt-vqa/
│   │       │   └── img2prompt_vqa_base.yaml
│   │       ├── med_config.json
│   │       ├── med_config_albef.json
│   │       ├── med_large_config.json
│   │       └── pnp-vqa/
│   │           ├── pnp_vqa_3b.yaml
│   │           ├── pnp_vqa_base.yaml
│   │           ├── pnp_vqa_large.yaml
│   │           ├── unifiedqav2_3b_config.json
│   │           ├── unifiedqav2_base_config.json
│   │           └── unifiedqav2_large_config.json
│   ├── datasets/
│   │   ├── builders/
│   │   │   ├── __init__.py
│   │   │   ├── audio_caption_builder.py
│   │   │   ├── audio_qa_builder.py
│   │   │   ├── base_dataset_builder.py
│   │   │   ├── caption_builder.py
│   │   │   ├── classification_builder.py
│   │   │   ├── dialogue_builder.py
│   │   │   ├── discrn_builders.py
│   │   │   ├── image_text_pair_builder.py
│   │   │   ├── imagefolder_builder.py
│   │   │   ├── object3d_caption_builder.py
│   │   │   ├── object3d_classification_builder.py
│   │   │   ├── object3d_qa_builder.py
│   │   │   ├── retrieval_builder.py
│   │   │   ├── text_to_image_generation_builder.py
│   │   │   ├── video_qa_builder.py
│   │   │   └── vqa_builder.py
│   │   ├── data_utils.py
│   │   ├── datasets/
│   │   │   ├── aok_vqa_datasets.py
│   │   │   ├── audio_captioning_datasets.py
│   │   │   ├── audio_classification_datasets.py
│   │   │   ├── audio_qa_datasets.py
│   │   │   ├── avsd_dialogue_datasets.py
│   │   │   ├── base_dataset.py
│   │   │   ├── capfilt_dataset.py
│   │   │   ├── caption_datasets.py
│   │   │   ├── coco_caption_datasets.py
│   │   │   ├── coco_vqa_datasets.py
│   │   │   ├── dataloader_utils.py
│   │   │   ├── dialogue_datasets.py
│   │   │   ├── discriminatory_reasoning_datasets.py
│   │   │   ├── gqa_datasets.py
│   │   │   ├── iconqa_datasets.py
│   │   │   ├── image_text_pair_datasets.py
│   │   │   ├── imagefolder_dataset.py
│   │   │   ├── laion_dataset.py
│   │   │   ├── llava150k_dataset.py
│   │   │   ├── multimodal_classification_datasets.py
│   │   │   ├── music_avqa.py
│   │   │   ├── nlvr_datasets.py
│   │   │   ├── object3d_captioning_datasets.py
│   │   │   ├── object3d_classification_datasets.py
│   │   │   ├── object3d_qa_datasets.py
│   │   │   ├── ocr_datasets.py
│   │   │   ├── retrieval_datasets.py
│   │   │   ├── snli_ve_datasets.py
│   │   │   ├── subject_driven_t2i_dataset.py
│   │   │   ├── textcaps_datasets.py
│   │   │   ├── valor_caption.py
│   │   │   ├── vatex_captioning_datasets.py
│   │   │   ├── vg_vqa_datasets.py
│   │   │   ├── video_caption_datasets.py
│   │   │   ├── video_vqa_datasets.py
│   │   │   ├── violin_dataset.py
│   │   │   ├── visdial_dialogue_datasets.py
│   │   │   ├── vizwiz_vqa_datasets.py
│   │   │   ├── vlep_dataset.py
│   │   │   ├── vqa_datasets.py
│   │   │   ├── vsr_datasets.py
│   │   │   └── yt8m_video_dialogue_datasets.py
│   │   └── download_scripts/
│   │       ├── DownloadConceptualCaptions/
│   │       │   ├── LICENSE
│   │       │   ├── README.md
│   │       │   ├── create_annotation_12m.ipynb
│   │       │   ├── create_annotation_3m.ipynb
│   │       │   ├── download_data_cc12m.py
│   │       │   └── download_data_cc3m.py
│   │       ├── download_charade.py
│   │       ├── download_coco.py
│   │       ├── download_coin.py
│   │       ├── download_didemo.py
│   │       ├── download_flickr.py
│   │       ├── download_gqa.py
│   │       ├── download_iconqa.py
│   │       ├── download_msrvtt.py
│   │       ├── download_msvd.py
│   │       ├── download_nocaps.py
│   │       ├── download_sbu.py
│   │       ├── download_vg.py
│   │       └── download_violin.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── albef_models/
│   │   │   ├── __init__.py
│   │   │   ├── albef_classification.py
│   │   │   ├── albef_feature_extractor.py
│   │   │   ├── albef_nlvr.py
│   │   │   ├── albef_outputs.py
│   │   │   ├── albef_pretrain.py
│   │   │   ├── albef_retrieval.py
│   │   │   └── albef_vqa.py
│   │   ├── alpro_models/
│   │   │   ├── __init__.py
│   │   │   ├── alpro_outputs.py
│   │   │   ├── alpro_qa.py
│   │   │   └── alpro_retrieval.py
│   │   ├── base_model.py
│   │   ├── beats/
│   │   │   ├── BEATs.py
│   │   │   ├── LICENSE_BEATs.txt
│   │   │   ├── README.md
│   │   │   ├── Tokenizers.py
│   │   │   ├── backbone.py
│   │   │   ├── modules.py
│   │   │   └── quantizer.py
│   │   ├── beats_encoder.py
│   │   ├── blip2_models/
│   │   │   ├── Qformer.py
│   │   │   ├── __init__.py
│   │   │   ├── blip2.py
│   │   │   ├── blip2_image_text_matching.py
│   │   │   ├── blip2_opt.py
│   │   │   ├── blip2_qformer.py
│   │   │   ├── blip2_t5.py
│   │   │   ├── blip2_t5_instruct.py
│   │   │   ├── blip2_vicuna_instruct.py
│   │   │   ├── blip2_vicuna_xinstruct.py
│   │   │   ├── modeling_llama.py
│   │   │   ├── modeling_opt.py
│   │   │   └── modeling_t5.py
│   │   ├── blip_diffusion_models/
│   │   │   ├── __init__.py
│   │   │   ├── blip_diffusion.py
│   │   │   ├── modeling_ctx_clip.py
│   │   │   ├── ptp_utils.py
│   │   │   └── utils.py
│   │   ├── blip_models/
│   │   │   ├── __init__.py
│   │   │   ├── blip.py
│   │   │   ├── blip_caption.py
│   │   │   ├── blip_classification.py
│   │   │   ├── blip_feature_extractor.py
│   │   │   ├── blip_image_text_matching.py
│   │   │   ├── blip_nlvr.py
│   │   │   ├── blip_outputs.py
│   │   │   ├── blip_pretrain.py
│   │   │   ├── blip_retrieval.py
│   │   │   ├── blip_vqa.py
│   │   │   └── nlvr_encoder.py
│   │   ├── clip_models/
│   │   │   ├── __init__.py
│   │   │   ├── clip_outputs.py
│   │   │   ├── loss.py
│   │   │   ├── model.py
│   │   │   ├── pretrained.py
│   │   │   ├── timm_model.py
│   │   │   ├── tokenizer.py
│   │   │   ├── transform.py
│   │   │   └── utils.py
│   │   ├── clip_vit.py
│   │   ├── eva_vit.py
│   │   ├── gpt_models/
│   │   │   └── gpt_dialogue.py
│   │   ├── img2prompt_models/
│   │   │   ├── __init__.py
│   │   │   └── img2prompt_vqa.py
│   │   ├── med.py
│   │   ├── pnp_vqa_models/
│   │   │   ├── __init__.py
│   │   │   ├── pnp_unifiedqav2_fid.py
│   │   │   └── pnp_vqa.py
│   │   ├── timesformer/
│   │   │   ├── __init__.py
│   │   │   ├── conv2d_same.py
│   │   │   ├── features.py
│   │   │   ├── helpers.py
│   │   │   ├── linear.py
│   │   │   ├── vit.py
│   │   │   └── vit_utils.py
│   │   ├── ulip_models/
│   │   │   ├── ULIP_models.py
│   │   │   ├── losses.py
│   │   │   ├── pointbert/
│   │   │   │   ├── PointTransformer_8192point.yaml
│   │   │   │   ├── checkpoint.py
│   │   │   │   ├── dvae.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── misc.py
│   │   │   │   └── point_encoder.py
│   │   │   ├── ulip_scaled_up_config.yaml
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── build.py
│   │   │       ├── config.py
│   │   │       ├── io.py
│   │   │       ├── logger.py
│   │   │       ├── registry.py
│   │   │       ├── tokenizer.py
│   │   │       └── utils.py
│   │   └── vit.py
│   ├── processors/
│   │   ├── __init__.py
│   │   ├── alpro_processors.py
│   │   ├── audio_processors.py
│   │   ├── base_processor.py
│   │   ├── blip_diffusion_processors.py
│   │   ├── blip_processors.py
│   │   ├── clip_processors.py
│   │   ├── functional_video.py
│   │   ├── gpt_processors.py
│   │   ├── instruction_text_processors.py
│   │   ├── randaugment.py
│   │   ├── transforms_video.py
│   │   └── ulip_processors.py
│   ├── projects/
│   │   ├── albef/
│   │   │   ├── eval/
│   │   │   │   ├── nlvr_eval.yaml
│   │   │   │   ├── ret_coco_eval.yaml
│   │   │   │   ├── ret_flickr30k_eval.yaml
│   │   │   │   ├── snli_ve_eval.yaml
│   │   │   │   ├── vqa_test.yaml
│   │   │   │   └── vqa_val.yaml
│   │   │   └── train/
│   │   │       ├── aokvqa_ft.yaml
│   │   │       ├── nlvr_ft.yaml
│   │   │       ├── okvqa_ft.yaml
│   │   │       ├── pretrain.yaml
│   │   │       ├── ret_coco_ft.yaml
│   │   │       ├── ret_flickr30k_ft.yaml
│   │   │       ├── snli_ve_ft.yaml
│   │   │       └── vqa_ft.yaml
│   │   ├── alpro/
│   │   │   ├── eval/
│   │   │   │   ├── didemo_ret_eval.yaml
│   │   │   │   ├── msrvtt_qa_eval.yaml
│   │   │   │   ├── msrvtt_ret_eval.yaml
│   │   │   │   └── msvd_qa_eval.yaml
│   │   │   └── train/
│   │   │       ├── didemo_ret_ft.yaml
│   │   │       ├── msrvtt_qa_ft.yaml
│   │   │       ├── msrvtt_retrieval_ft.yaml
│   │   │       └── msvd_qa_ft.yaml
│   │   ├── blip/
│   │   │   ├── coco_cap_ft_iter.yaml
│   │   │   ├── eval/
│   │   │   │   ├── aokvqa_eval.yaml
│   │   │   │   ├── caption_coco_eval.yaml
│   │   │   │   ├── caption_coco_eval_large.yaml
│   │   │   │   ├── nlvr_eval.yaml
│   │   │   │   ├── nocaps_eval.yaml
│   │   │   │   ├── okvqa_eval.yaml
│   │   │   │   ├── ret_coco_eval.yaml
│   │   │   │   ├── ret_flickr_eval.yaml
│   │   │   │   └── vqav2_eval.yaml
│   │   │   └── train/
│   │   │       ├── aokvqa_ft.yaml
│   │   │       ├── caption_coco_ft.yaml
│   │   │       ├── caption_coco_large_ft.yaml
│   │   │       ├── nlvr_ft.yaml
│   │   │       ├── okvqa_ft.yaml
│   │   │       ├── pretrain_14m.yaml
│   │   │       ├── retrieval_coco_ft.yaml
│   │   │       ├── retrieval_flickr_ft.yaml
│   │   │       └── vqav2_ft.yaml
│   │   ├── blip2/
│   │   │   ├── eval/
│   │   │   │   ├── caption_coco_flant5xl_eval.yaml
│   │   │   │   ├── caption_coco_opt2.7b_eval.yaml
│   │   │   │   ├── caption_coco_opt6.7b_eval.yaml
│   │   │   │   ├── caption_nocaps_out_domain_flant5xl_eval.yaml
│   │   │   │   ├── caption_nocaps_out_domain_flant5xxl_eval.yaml
│   │   │   │   ├── gqa_zeroshot_flant5xl_eval.yaml
│   │   │   │   ├── okvqa_zeroshot_flant5xl_eval.yaml
│   │   │   │   ├── ret_coco_eval.yaml
│   │   │   │   ├── ret_flickr_eval.yaml
│   │   │   │   ├── vqav2_zeroshot_flant5xl_eval.yaml
│   │   │   │   └── vqav2_zeroshot_opt_eval.yaml
│   │   │   └── train/
│   │   │       ├── caption_coco_ft.yaml
│   │   │       ├── pretrain_stage1.yaml
│   │   │       ├── pretrain_stage2.yaml
│   │   │       └── retrieval_coco_ft.yaml
│   │   ├── blip_diffusion/
│   │   │   ├── finetune-db-dog.yaml
│   │   │   ├── finetune-db-pink-dress.yaml
│   │   │   ├── finetune-db-shein-jacket.yaml
│   │   │   └── finetune-db-template.yaml
│   │   ├── clip/
│   │   │   ├── exp_coco_ret_eval.yaml
│   │   │   ├── exp_flickr_ret_eval.yaml
│   │   │   └── exp_imnet_zs_eval.yaml
│   │   ├── gpt/
│   │   │   ├── eval/
│   │   │   │   └── dialogue_avsd_eval.yaml
│   │   │   └── train/
│   │   │       └── dialogue_avsd_ft.yaml
│   │   ├── instructblip/
│   │   │   ├── caption_coco_flant5xl_eval_test.yaml
│   │   │   ├── caption_coco_flant5xl_eval_val.yaml
│   │   │   ├── caption_coco_flant5xxl_eval_test.yaml
│   │   │   ├── caption_coco_flant5xxl_eval_val.yaml
│   │   │   ├── caption_coco_vicuna13b_eval_test.yaml
│   │   │   ├── caption_coco_vicuna13b_eval_val.yaml
│   │   │   ├── caption_coco_vicuna7b_eval_test.yaml
│   │   │   ├── caption_coco_vicuna7b_eval_val.yaml
│   │   │   ├── caption_msrvtt_flant5xl_eval_test.yaml
│   │   │   ├── caption_msrvtt_flant5xl_eval_val.yaml
│   │   │   ├── caption_msrvtt_flant5xxl_eval_test.yaml
│   │   │   ├── caption_msrvtt_flant5xxl_eval_val.yaml
│   │   │   ├── caption_msrvtt_vicuna13b_eval_test.yaml
│   │   │   ├── caption_msrvtt_vicuna13b_eval_val.yaml
│   │   │   ├── caption_msrvtt_vicuna7b_eval_test.yaml
│   │   │   ├── caption_msrvtt_vicuna7b_eval_val.yaml
│   │   │   ├── caption_msvd_flant5xl_eval.yaml
│   │   │   ├── caption_msvd_flant5xxl_eval.yaml
│   │   │   ├── caption_msvd_vicuna13b_eval.yaml
│   │   │   ├── caption_msvd_vicuna7b_eval.yaml
│   │   │   ├── caption_nocaps_out_domain_flant5xl_eval.yaml
│   │   │   ├── caption_nocaps_out_domain_flant5xxl_eval.yaml
│   │   │   ├── caption_nocaps_out_domain_vicuna13b_eval.yaml
│   │   │   ├── caption_nocaps_out_domain_vicuna7b_eval.yaml
│   │   │   ├── caption_vatex_flant5xl_eval.yaml
│   │   │   ├── caption_vatex_flant5xxl_eval.yaml
│   │   │   ├── caption_vatex_vicuna13b_eval.yaml
│   │   │   ├── caption_vatex_vicuna7b_eval.yaml
│   │   │   ├── classification_modelnet40_vicuna13b.yaml
│   │   │   ├── classification_modelnet40_vicuna7b.yaml
│   │   │   ├── classification_snlive_flant5xl.yaml
│   │   │   ├── classification_snlive_flant5xxl.yaml
│   │   │   ├── classification_snlive_vicuna13b.yaml
│   │   │   ├── classification_snlive_vicuna13b_test.yaml
│   │   │   ├── classification_snlive_vicuna7b_test.yaml
│   │   │   ├── classification_snlive_vicuna7b_val.yaml
│   │   │   ├── completion_modelnet40_vicuna13b.yaml
│   │   │   ├── completion_modelnet40_vicuna7b.yaml
│   │   │   ├── qa_msrvtt_flant5xl_eval_test.yaml
│   │   │   ├── qa_msrvtt_flant5xxl_eval_test.yaml
│   │   │   ├── qa_msrvtt_vicuna13b_eval_test.yaml
│   │   │   ├── qa_msrvtt_vicuna7b_eval_test.yaml
│   │   │   ├── qa_msvd_flant5xl_eval.yaml
│   │   │   ├── qa_msvd_flant5xxl_eval.yaml
│   │   │   ├── qa_msvd_vicuna13b_eval.yaml
│   │   │   ├── qa_msvd_vicuna7b_eval.yaml
│   │   │   ├── qa_okvqa_flant5xl_eval.yaml
│   │   │   ├── qa_okvqa_flant5xxl_eval.yaml
│   │   │   ├── qa_okvqa_vicuna13b_eval.yaml
│   │   │   └── qa_okvqa_vicuna7b_eval.yaml
│   │   ├── pnp-vqa/
│   │   │   └── eval/
│   │   │       ├── gqa_eval.yaml
│   │   │       ├── gqa_eval_3b.yaml
│   │   │       ├── gqa_eval_large.yaml
│   │   │       ├── okvqa_eval.yaml
│   │   │       ├── okvqa_eval_3b.yaml
│   │   │       ├── okvqa_eval_large.yaml
│   │   │       ├── vqav2_eval.yaml
│   │   │       ├── vqav2_eval_3b.yaml
│   │   │       ├── vqav2_eval_large.yaml
│   │   │       ├── vqav2_test_eval.yaml
│   │   │       ├── vqav2_test_eval_3b.yaml
│   │   │       └── vqav2_test_eval_large.yaml
│   │   └── xinstruct_blip/
│   │       ├── eval/
│   │       │   ├── discrn/
│   │       │   │   ├── audio_video_caption.yaml
│   │       │   │   ├── audio_video_caption_13b.yaml
│   │       │   │   ├── audio_video_describe.yaml
│   │       │   │   ├── audio_video_describe_13b.yaml
│   │       │   │   ├── audio_video_describe_nocue.yaml
│   │       │   │   ├── audio_video_describe_proj copy.yaml
│   │       │   │   ├── audio_video_describe_proj.yaml
│   │       │   │   ├── audio_video_describe_rand_init.yaml
│   │       │   │   ├── image_3d_caption.yaml
│   │       │   │   ├── image_3d_caption_13b.yaml
│   │       │   │   ├── image_3d_describe.yaml
│   │       │   │   ├── image_3d_describe_13b.yaml
│   │       │   │   ├── image_3d_describe_no_init.yaml
│   │       │   │   ├── image_3d_describe_nocue.yaml
│   │       │   │   └── image_3d_describe_proj.yaml
│   │       │   ├── vicuna13b/
│   │       │   │   ├── audio/
│   │       │   │   │   ├── audiocaps_captioning_qa.yaml
│   │       │   │   │   ├── audiocaps_captioning_test.yaml
│   │       │   │   │   ├── audiocaps_captioning_val.yaml
│   │       │   │   │   ├── clothoQA_captioning.yaml
│   │       │   │   │   ├── clothov1_captioning.yaml
│   │       │   │   │   ├── clothov2_captioning.yaml
│   │       │   │   │   ├── esc50_classification.yaml
│   │       │   │   │   └── esc50_classification_completion.yaml
│   │       │   │   ├── crossmodal/
│   │       │   │   │   ├── musicavqa/
│   │       │   │   │   │   ├── musicavqa_audio_eval.yaml
│   │       │   │   │   │   ├── musicavqa_joint_eval.yaml
│   │       │   │   │   │   └── musicavqa_video_eval.yaml
│   │       │   │   │   └── vatex/
│   │       │   │   │       ├── vatex_audio_captioning.yaml
│   │       │   │   │       ├── vatex_captioning.yaml
│   │       │   │   │       ├── vatex_joint_captioning.yaml
│   │       │   │   │       └── vatex_joint_captioning_interleave.yaml
│   │       │   │   ├── image/
│   │       │   │   │   ├── coco_captioning_test.yaml
│   │       │   │   │   ├── coco_captioning_val.yaml
│   │       │   │   │   ├── flickr30k_captioning.yaml
│   │       │   │   │   ├── gqa_qa.yaml
│   │       │   │   │   ├── nocaps_captioning.yaml
│   │       │   │   │   ├── nocaps_out_domain_captioning.yaml
│   │       │   │   │   ├── okvqa_qa.yaml
│   │       │   │   │   ├── snlive_classification_test.yaml
│   │       │   │   │   ├── snlive_classification_val.yaml
│   │       │   │   │   └── vizwiz_qa.yaml
│   │       │   │   ├── image_with_coco/
│   │       │   │   │   ├── coco_captioning_test.yaml
│   │       │   │   │   ├── coco_captioning_val.yaml
│   │       │   │   │   ├── flickr30k_captioning.yaml
│   │       │   │   │   ├── gqa_qa.yaml
│   │       │   │   │   ├── nocaps_captioning.yaml
│   │       │   │   │   ├── nocaps_out_domain_captioning.yaml
│   │       │   │   │   ├── okvqa_qa.yaml
│   │       │   │   │   ├── snlive_classification_test.yaml
│   │       │   │   │   ├── snlive_classification_val.yaml
│   │       │   │   │   └── vizwiz_qa.yaml
│   │       │   │   ├── pc/
│   │       │   │   │   ├── modelnet40_classification.yaml
│   │       │   │   │   ├── modelnet40_completion.yaml
│   │       │   │   │   ├── objaverse_captioning.yaml
│   │       │   │   │   └── objaverse_qa.yaml
│   │       │   │   ├── video/
│   │       │   │   │   ├── msrvtt_captioning.yaml
│   │       │   │   │   ├── msrvtt_captioning_test.yaml
│   │       │   │   │   ├── msrvtt_captioning_val.yaml
│   │       │   │   │   ├── msrvtt_qa_test.yaml
│   │       │   │   │   ├── msrvtt_qa_val.yaml
│   │       │   │   │   ├── msvd_captioning.yaml
│   │       │   │   │   ├── msvd_qa.yaml
│   │       │   │   │   ├── vatex_audio_captioning.yaml
│   │       │   │   │   ├── vatex_captioning.yaml
│   │       │   │   │   ├── vatex_joint_captioning.yaml
│   │       │   │   │   └── vatex_joint_captioning_interleave.yaml
│   │       │   │   └── video_image/
│   │       │   │       ├── msvd_captioning.yaml
│   │       │   │       ├── msvd_qa.yaml
│   │       │   │       └── vatex_captioning.yaml
│   │       │   ├── vicuna7b/
│   │       │   │   ├── audio/
│   │       │   │   │   ├── audiocaps_captioning_qa.yaml
│   │       │   │   │   ├── audiocaps_captioning_test.yaml
│   │       │   │   │   ├── audiocaps_captioning_val.yaml
│   │       │   │   │   ├── clothoQA_captioning.yaml
│   │       │   │   │   ├── clothov1_captioning.yaml
│   │       │   │   │   ├── clothov2_captioning.yaml
│   │       │   │   │   ├── esc50_classification.yaml
│   │       │   │   │   └── esc50_classification_completion.yaml
│   │       │   │   ├── audio_no_init/
│   │       │   │   │   ├── audiocaps_captioning_qa.yaml
│   │       │   │   │   ├── audiocaps_captioning_test.yaml
│   │       │   │   │   ├── audiocaps_captioning_val.yaml
│   │       │   │   │   ├── clothoQA_captioning.yaml
│   │       │   │   │   ├── clothov1_captioning.yaml
│   │       │   │   │   ├── clothov2_captioning.yaml
│   │       │   │   │   ├── esc50_classification.yaml
│   │       │   │   │   └── esc50_classification_completion.yaml
│   │       │   │   ├── audio_projection_only/
│   │       │   │   │   ├── audiocaps_captioning_qa.yaml
│   │       │   │   │   ├── audiocaps_captioning_test.yaml
│   │       │   │   │   ├── audiocaps_captioning_val.yaml
│   │       │   │   │   ├── clothoQA_captioning.yaml
│   │       │   │   │   ├── clothov1_captioning.yaml
│   │       │   │   │   ├── clothov2_captioning.yaml
│   │       │   │   │   ├── esc50_classification.yaml
│   │       │   │   │   └── esc50_classification_completion.yaml
│   │       │   │   ├── audio_projection_only_nocue/
│   │       │   │   │   ├── audiocaps_captioning_qa.yaml
│   │       │   │   │   ├── audiocaps_captioning_test.yaml
│   │       │   │   │   ├── audiocaps_captioning_val.yaml
│   │       │   │   │   ├── clothoQA_captioning.yaml
│   │       │   │   │   ├── clothov1_captioning.yaml
│   │       │   │   │   ├── clothov2_captioning.yaml
│   │       │   │   │   ├── esc50_classification.yaml
│   │       │   │   │   └── esc50_classification_completion.yaml
│   │       │   │   ├── crossmodal/
│   │       │   │   │   ├── musicavqa/
│   │       │   │   │   │   ├── musicavqa_audio_eval.yaml
│   │       │   │   │   │   ├── musicavqa_joint_eval.yaml
│   │       │   │   │   │   └── musicavqa_video_eval.yaml
│   │       │   │   │   └── vatex/
│   │       │   │   │       ├── vatex_audio_captioning.yaml
│   │       │   │   │       ├── vatex_captioning.yaml
│   │       │   │   │       ├── vatex_joint_captioning.yaml
│   │       │   │   │       └── vatex_joint_captioning_interleave.yaml
│   │       │   │   ├── image/
│   │       │   │   │   ├── coco_captioning_test.yaml
│   │       │   │   │   ├── coco_captioning_val.yaml
│   │       │   │   │   ├── flickr30k_captioning.yaml
│   │       │   │   │   ├── gqa_qa.yaml
│   │       │   │   │   ├── gqa_qa_val.yaml
│   │       │   │   │   ├── nocaps_captioning.yaml
│   │       │   │   │   ├── nocaps_out_domain_captioning.yaml
│   │       │   │   │   ├── okvqa_qa.yaml
│   │       │   │   │   ├── snlive_classification_test.yaml
│   │       │   │   │   ├── snlive_classification_val.yaml
│   │       │   │   │   └── vizwiz_qa.yaml
│   │       │   │   ├── image_full_init/
│   │       │   │   │   ├── coco_captioning_test.yaml
│   │       │   │   │   ├── coco_captioning_val.yaml
│   │       │   │   │   ├── flickr30k_captioning.yaml
│   │       │   │   │   ├── gqa_qa.yaml
│   │       │   │   │   ├── gqa_qa_val.yaml
│   │       │   │   │   ├── nocaps_captioning.yaml
│   │       │   │   │   ├── nocaps_out_domain_captioning.yaml
│   │       │   │   │   ├── okvqa_qa.yaml
│   │       │   │   │   ├── snlive_classification_test.yaml
│   │       │   │   │   ├── snlive_classification_val.yaml
│   │       │   │   │   └── vizwiz_qa.yaml
│   │       │   │   ├── image_no_init/
│   │       │   │   │   ├── coco_captioning_test.yaml
│   │       │   │   │   ├── coco_captioning_val.yaml
│   │       │   │   │   ├── flickr30k_captioning.yaml
│   │       │   │   │   ├── gqa_qa.yaml
│   │       │   │   │   ├── gqa_qa_val.yaml
│   │       │   │   │   ├── nocaps_captioning.yaml
│   │       │   │   │   ├── nocaps_out_domain_captioning.yaml
│   │       │   │   │   ├── okvqa_qa.yaml
│   │       │   │   │   ├── snlive_classification_test.yaml
│   │       │   │   │   ├── snlive_classification_val.yaml
│   │       │   │   │   └── vizwiz_qa.yaml
│   │       │   │   ├── image_pre_coco/
│   │       │   │   │   ├── coco_captioning_test.yaml
│   │       │   │   │   ├── coco_captioning_val.yaml
│   │       │   │   │   ├── flickr30k_captioning.yaml
│   │       │   │   │   ├── gqa_qa.yaml
│   │       │   │   │   ├── nocaps_captioning.yaml
│   │       │   │   │   ├── nocaps_out_domain_captioning.yaml
│   │       │   │   │   ├── okvqa_qa.yaml
│   │       │   │   │   ├── snlive_classification_test.yaml
│   │       │   │   │   ├── snlive_classification_val.yaml
│   │       │   │   │   └── vizwiz_qa.yaml
│   │       │   │   ├── image_projection_only/
│   │       │   │   │   ├── coco_captioning_test.yaml
│   │       │   │   │   ├── coco_captioning_val.yaml
│   │       │   │   │   ├── flickr30k_captioning.yaml
│   │       │   │   │   ├── gqa_qa.yaml
│   │       │   │   │   ├── gqa_qa_val.yaml
│   │       │   │   │   ├── nocaps_captioning.yaml
│   │       │   │   │   ├── nocaps_out_domain_captioning.yaml
│   │       │   │   │   ├── okvqa_qa.yaml
│   │       │   │   │   ├── snlive_classification_test.yaml
│   │       │   │   │   ├── snlive_classification_val.yaml
│   │       │   │   │   └── vizwiz_qa.yaml
│   │       │   │   ├── pc/
│   │       │   │   │   ├── modelnet40_classification.yaml
│   │       │   │   │   ├── modelnet40_completion.yaml
│   │       │   │   │   ├── objaverse_captioning.yaml
│   │       │   │   │   └── objaverse_qa.yaml
│   │       │   │   ├── pc_no_init/
│   │       │   │   │   ├── modelnet40_classification.yaml
│   │       │   │   │   ├── modelnet40_completion.yaml
│   │       │   │   │   ├── objaverse_captioning.yaml
│   │       │   │   │   └── objaverse_qa.yaml
│   │       │   │   ├── pc_projection_only/
│   │       │   │   │   ├── modelnet40_classification.yaml
│   │       │   │   │   ├── modelnet40_completion.yaml
│   │       │   │   │   ├── objaverse_captioning.yaml
│   │       │   │   │   └── objaverse_qa.yaml
│   │       │   │   ├── pc_ulip1/
│   │       │   │   │   ├── modelnet40_classification.yaml
│   │       │   │   │   ├── modelnet40_completion.yaml
│   │       │   │   │   ├── objaverse_captioning.yaml
│   │       │   │   │   └── objaverse_qa.yaml
│   │       │   │   ├── pc_ulip2_scaled_up/
│   │       │   │   │   ├── modelnet40_classification.yaml
│   │       │   │   │   ├── modelnet40_completion.yaml
│   │       │   │   │   ├── objaverse_captioning.yaml
│   │       │   │   │   └── objaverse_qa.yaml
│   │       │   │   ├── pc_ulip_objaverse/
│   │       │   │   │   ├── modelnet40_classification.yaml
│   │       │   │   │   ├── modelnet40_completion.yaml
│   │       │   │   │   ├── objaverse_captioning.yaml
│   │       │   │   │   └── objaverse_qa.yaml
│   │       │   │   ├── pc_ulip_objaverse_shapenet/
│   │       │   │   │   ├── modelnet40_classification.yaml
│   │       │   │   │   ├── modelnet40_completion.yaml
│   │       │   │   │   ├── objaverse_captioning.yaml
│   │       │   │   │   └── objaverse_qa.yaml
│   │       │   │   ├── pc_ulip_shapenet/
│   │       │   │   │   ├── modelnet40_classification.yaml
│   │       │   │   │   ├── modelnet40_completion.yaml
│   │       │   │   │   ├── objaverse_captioning.yaml
│   │       │   │   │   └── objaverse_qa.yaml
│   │       │   │   ├── video/
│   │       │   │   │   ├── msrvtt_captioning_test.yaml
│   │       │   │   │   ├── msrvtt_captioning_val.yaml
│   │       │   │   │   ├── msrvtt_qa_test.yaml
│   │       │   │   │   ├── msrvtt_qa_val.yaml
│   │       │   │   │   ├── msvd_captioning.yaml
│   │       │   │   │   ├── msvd_qa.yaml
│   │       │   │   │   └── vatex_captioning.yaml
│   │       │   │   ├── video_image/
│   │       │   │   │   ├── msvd_captioning.yaml
│   │       │   │   │   ├── msvd_qa.yaml
│   │       │   │   │   └── vatex_captioning.yaml
│   │       │   │   ├── video_image_pre_coco/
│   │       │   │   │   ├── msvd_captioning.yaml
│   │       │   │   │   ├── msvd_qa.yaml
│   │       │   │   │   └── vatex_captioning.yaml
│   │       │   │   └── video_no_upsample/
│   │       │   │       ├── msrvtt_captioning_test.yaml
│   │       │   │       ├── msrvtt_captioning_val.yaml
│   │       │   │       ├── msrvtt_qa_test.yaml
│   │       │   │       ├── msrvtt_qa_val.yaml
│   │       │   │       ├── msvd_captioning.yaml
│   │       │   │       ├── msvd_captioning_up.yaml
│   │       │   │       ├── msvd_qa.yaml
│   │       │   │       ├── msvd_qa_up.yaml
│   │       │   │       ├── vatex_captioning.yaml
│   │       │   │       └── vatex_captioning_up.yaml
│   │       │   └── vicuna7b_nocue/
│   │       │       ├── audio/
│   │       │       │   ├── audiocaps_captioning_qa.yaml
│   │       │       │   ├── audiocaps_captioning_test.yaml
│   │       │       │   ├── audiocaps_captioning_val.yaml
│   │       │       │   ├── clothoQA_captioning.yaml
│   │       │       │   ├── clothov1_captioning.yaml
│   │       │       │   ├── clothov2_captioning.yaml
│   │       │       │   ├── esc50_classification.yaml
│   │       │       │   └── esc50_classification_completion.yaml
│   │       │       ├── crossmodal/
│   │       │       │   ├── musicavqa/
│   │       │       │   │   ├── musicavqa_audio_eval.yaml
│   │       │       │   │   ├── musicavqa_joint_eval.yaml
│   │       │       │   │   └── musicavqa_video_eval.yaml
│   │       │       │   └── vatex/
│   │       │       │       ├── vatex_audio_captioning.yaml
│   │       │       │       ├── vatex_captioning.yaml
│   │       │       │       └── vatex_joint_captioning.yaml
│   │       │       ├── image/
│   │       │       │   ├── coco_captioning_test.yaml
│   │       │       │   ├── coco_captioning_val.yaml
│   │       │       │   ├── flickr30k_captioning.yaml
│   │       │       │   ├── gqa_qa.yaml
│   │       │       │   ├── nocaps_captioning.yaml
│   │       │       │   ├── nocaps_out_domain_captioning.yaml
│   │       │       │   ├── okvqa_qa.yaml
│   │       │       │   ├── snlive_classification_test.yaml
│   │       │       │   ├── snlive_classification_val.yaml
│   │       │       │   └── vizwiz_qa.yaml
│   │       │       ├── pc/
│   │       │       │   ├── modelnet40_classification.yaml
│   │       │       │   ├── modelnet40_completion.yaml
│   │       │       │   ├── objaverse_captioning.yaml
│   │       │       │   └── objaverse_qa.yaml
│   │       │       ├── video/
│   │       │       │   ├── msrvtt_captioning_test.yaml
│   │       │       │   ├── msrvtt_captioning_val.yaml
│   │       │       │   ├── msrvtt_qa_test.yaml
│   │       │       │   ├── msrvtt_qa_val.yaml
│   │       │       │   ├── msvd_captioning.yaml
│   │       │       │   ├── msvd_qa.yaml
│   │       │       │   └── vatex_captioning.yaml
│   │       │       └── video_image/
│   │       │           ├── msvd_captioning.yaml
│   │       │           ├── msvd_qa.yaml
│   │       │           └── vatex_captioning.yaml
│   │       ├── prompt_variation/
│   │       │   └── nocaps/
│   │       │       ├── instructblip/
│   │       │       │   ├── original.yaml
│   │       │       │   ├── template_1.yaml
│   │       │       │   ├── template_2.yaml
│   │       │       │   ├── template_3.yaml
│   │       │       │   ├── template_4.yaml
│   │       │       │   └── template_5.yaml
│   │       │       └── xinstructblip/
│   │       │           ├── template_1.yaml
│   │       │           ├── template_2.yaml
│   │       │           ├── template_3.yaml
│   │       │           ├── template_4.yaml
│   │       │           └── template_5.yaml
│   │       └── train/
│   │           ├── vicuna13b/
│   │           │   ├── audio_training.yaml
│   │           │   ├── audio_training_continue.yaml
│   │           │   ├── image_train.yaml
│   │           │   ├── image_train_continue.yaml
│   │           │   ├── pc_training.yaml
│   │           │   └── video_training.yaml
│   │           ├── vicuna7b/
│   │           │   ├── audio_training.yaml
│   │           │   ├── audio_training_improved.yaml
│   │           │   ├── audio_training_no_init.yaml
│   │           │   ├── audio_training_projection_only.yaml
│   │           │   ├── audio_training_projection_only_nocue.yaml
│   │           │   ├── image_train.yaml
│   │           │   ├── image_train_improved.yaml
│   │           │   ├── image_train_no_init.yaml
│   │           │   ├── image_train_projection_only.yaml
│   │           │   ├── lora_training.yaml
│   │           │   ├── pc_training.yaml
│   │           │   ├── pc_training_improved.yaml
│   │           │   ├── pc_training_no_init.yaml
│   │           │   ├── pc_training_projection_only.yaml
│   │           │   ├── pc_training_projection_only_nocue.yaml
│   │           │   ├── pc_training_scaled_up.yaml
│   │           │   ├── pc_training_ulip1.yaml
│   │           │   ├── pc_training_ulip2_objaverse_shapenet_k_1.yaml
│   │           │   ├── pc_training_ulip_objaverse.yaml
│   │           │   ├── pc_training_ulip_shapenet.yaml
│   │           │   ├── video_training.yaml
│   │           │   └── video_training_no_msrvtt_upsample.yaml
│   │           └── vicuna7b_nocue/
│   │               ├── audio_training.yaml
│   │               ├── image_train.yaml
│   │               ├── pc_training.yaml
│   │               └── video_training.yaml
│   ├── runners/
│   │   ├── __init__.py
│   │   ├── runner_base.py
│   │   └── runner_iter.py
│   └── tasks/
│       ├── __init__.py
│       ├── base_task.py
│       ├── captioning.py
│       ├── dialogue.py
│       ├── image_text_pretrain.py
│       ├── multimodal_classification.py
│       ├── retrieval.py
│       ├── text_to_image_generation.py
│       ├── vqa.py
│       └── vqa_reading_comprehension.py
├── projects/
│   ├── blip-diffusion/
│   │   ├── README.md
│   │   └── notebooks/
│   │       ├── editing_real_finetuned.ipynb
│   │       ├── editing_real_zeroshot.ipynb
│   │       ├── editing_synthetic_zeroshot.ipynb
│   │       ├── editing_tryon_zeroshot.ipynb
│   │       ├── generation_finetuned_dog.ipynb
│   │       ├── generation_zeroshot.ipynb
│   │       └── stylization.ipynb
│   ├── blip2/
│   │   └── README.md
│   ├── img2llm-vqa/
│   │   ├── README.md
│   │   ├── img2llm_vqa.ipynb
│   │   └── img2llm_vqa.py
│   ├── img2prompt-vqa/
│   │   └── README.md
│   ├── instructblip/
│   │   ├── README.md
│   │   └── run_demo.py
│   ├── pnp-vqa/
│   │   ├── README.md
│   │   └── pnp_vqa.ipynb
│   └── xinstructblip/
│       ├── README.md
│       ├── data_aug/
│       │   ├── 3d_qa_data_generation.py
│       │   └── audio_qa_data_generation.py
│       ├── demo/
│       │   ├── configs/
│       │   │   ├── vicuna13b.yaml
│       │   │   ├── vicuna7b.yaml
│       │   │   ├── vicuna7b_blip_init.yaml
│       │   │   ├── vicuna7b_no_init.yaml
│       │   │   ├── vicuna7b_nocue.yaml
│       │   │   ├── vicuna7b_projection.yaml
│       │   │   ├── vicuna7b_rand.yaml
│       │   │   └── vicuna7b_v2.yaml
│       │   ├── demo.ipynb
│       │   ├── examples/
│       │   │   └── point_cloud/
│       │   │       └── banana.glb
│       │   └── run_demo.py
│       ├── discrn/
│       │   ├── caption_baseline/
│       │   │   ├── predict_audio.py
│       │   │   ├── predict_image.py
│       │   │   ├── predict_pc.py
│       │   │   ├── predict_video.py
│       │   │   └── render_images.py
│       │   └── data_generation/
│       │       ├── audiocaps_video_audio.py
│       │       └── objaverse_img_3d.py
│       └── modelnet_baseline/
│           └── render_images.py
├── pyproject.toml
├── requirements.txt
├── run_scripts/
│   ├── albef/
│   │   ├── eval/
│   │   │   ├── eval_albef_nlvr.sh
│   │   │   ├── eval_albef_ve.sh
│   │   │   ├── eval_coco_retrieval.sh
│   │   │   ├── eval_flickr30k_retrieval.sh
│   │   │   ├── test_albef_vqa.sh
│   │   │   └── val_albef_vqa.sh
│   │   └── train/
│   │       ├── pretrain.sh
│   │       ├── train_aokvqa_albef.sh
│   │       ├── train_coco_retrieval_albef.sh
│   │       ├── train_flickr30k_retrieval_albef.sh
│   │       ├── train_nlvr_albef.sh
│   │       ├── train_okvqa_albef.sh
│   │       ├── train_ve_albef.sh
│   │       └── train_vqa_albef.sh
│   ├── alpro/
│   │   ├── eval/
│   │   │   ├── eval_didemo_ret.sh
│   │   │   ├── eval_msrvtt_qa.sh
│   │   │   ├── eval_msrvtt_ret.sh
│   │   │   └── eval_msvd_qa.sh
│   │   └── train/
│   │       ├── train_didemo_ret.sh
│   │       ├── train_msrvtt_qa.sh
│   │       ├── train_msrvtt_ret.sh
│   │       └── train_msvd_qa.sh
│   ├── blip/
│   │   ├── eval/
│   │   │   ├── eval_aokvqa.sh
│   │   │   ├── eval_coco_cap.sh
│   │   │   ├── eval_coco_cap_large.sh
│   │   │   ├── eval_nlvr.sh
│   │   │   ├── eval_nocaps.sh
│   │   │   ├── eval_okvqa.sh
│   │   │   ├── eval_ret_coco.sh
│   │   │   ├── eval_ret_flickr.sh
│   │   │   └── validate_vqa.sh
│   │   └── train/
│   │       ├── pretrain.sh
│   │       ├── train_aokvqa.sh
│   │       ├── train_caption_coco.sh
│   │       ├── train_caption_coco_large.sh
│   │       ├── train_caption_coco_large_iters.sh
│   │       ├── train_nlvr.sh
│   │       ├── train_okvqa.sh
│   │       ├── train_retrieval_coco.sh
│   │       ├── train_retrieval_flickr.sh
│   │       └── train_vqa.sh
│   ├── blip-diffusion/
│   │   ├── train_db.sh
│   │   ├── train_db_dog.sh
│   │   ├── train_db_jacket_s.sh
│   │   ├── train_db_pink_dress.sh
│   │   └── train_db_shein_jacket.sh
│   ├── blip2/
│   │   ├── eval/
│   │   │   ├── eval_cap_coco_flant5xl.sh
│   │   │   ├── eval_cap_coco_opt2.7b.sh
│   │   │   ├── eval_cap_coco_opt6.7b.sh
│   │   │   ├── eval_gqa_zeroshot_flant5xl.sh
│   │   │   ├── eval_okvqa_zeroshot_flant5xl.sh
│   │   │   ├── eval_ret_coco.sh
│   │   │   ├── eval_ret_flickr.sh
│   │   │   ├── validate_vqa_zeroshot_flant5xl.sh
│   │   │   └── validate_vqa_zeroshot_opt.sh
│   │   └── train/
│   │       ├── pretrain_stage1.sh
│   │       ├── pretrain_stage2.sh
│   │       ├── train_caption_coco.sh
│   │       └── train_retrieval_coco.sh
│   ├── clip/
│   │   └── eval/
│   │       ├── eval_clip_ret_coco.sh
│   │       ├── eval_clip_ret_flickr.sh
│   │       └── eval_clip_zs_imnet.sh
│   ├── gpt/
│   │   ├── eval/
│   │   │   └── eval_video_dialogue_avsd.sh
│   │   └── train/
│   │       └── train_video_dialogue_avsd.sh
│   ├── pnp-vqa/
│   │   └── eval/
│   │       ├── eval_gqa.sh
│   │       ├── eval_gqa_3b.sh
│   │       ├── eval_gqa_large.sh
│   │       ├── eval_okvqa.sh
│   │       ├── eval_okvqa_3b.sh
│   │       ├── eval_okvqa_large.sh
│   │       ├── eval_vqav2.sh
│   │       ├── eval_vqav2_3b.sh
│   │       ├── eval_vqav2_large.sh
│   │       ├── eval_vqav2_test.sh
│   │       ├── eval_vqav2_test_3b.sh
│   │       └── eval_vqav2_test_large.sh
│   ├── run_browser.sh
│   └── run_demo.sh
├── setup.py
├── tests/
│   └── models/
│       ├── test_albef.py
│       ├── test_blip.py
│       ├── test_blip2.py
│       └── test_pnp_vqa.py
└── train.py