gitextract_cto194sv/

├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── configs/
│   ├── _base_bicaptioning_R_50_L1_H1024.yaml
│   ├── backbone_ablations/
│   │   ├── bicaptioning_R_101_L1_H1024.yaml
│   │   ├── bicaptioning_R_50W2X_L1_H1024.yaml
│   │   └── bicaptioning_R_50_L1_H1024.yaml
│   ├── depth_ablations/
│   │   ├── bicaptioning_R_50_L1_H1024.yaml
│   │   ├── bicaptioning_R_50_L2_H1024.yaml
│   │   ├── bicaptioning_R_50_L3_H1024.yaml
│   │   └── bicaptioning_R_50_L4_H1024.yaml
│   ├── detectron2/
│   │   ├── _base_faster_rcnn_R_50_C4_BN.yaml
│   │   ├── _base_mask_rcnn_R_50_FPN.yaml
│   │   ├── coco_segm_default_init_2x.yaml
│   │   ├── lvis_segm_default_init_2x.yaml
│   │   ├── lvis_segm_imagenet_init_2x.yaml
│   │   └── voc_det_default_init_24k.yaml
│   ├── downstream/
│   │   ├── imagenet_clf.yaml
│   │   ├── inaturalist_clf.yaml
│   │   └── voc07_clf.yaml
│   ├── task_ablations/
│   │   ├── bicaptioning_R_50_L1_H2048.yaml
│   │   ├── captioning_R_50_L1_H2048.yaml
│   │   ├── masked_lm_R_50_L1_H2048.yaml
│   │   ├── multilabel_classification_R_50.yaml
│   │   └── token_classification_R_50.yaml
│   └── width_ablations/
│       ├── bicaptioning_R_50_L1_H1024.yaml
│       ├── bicaptioning_R_50_L1_H2048.yaml
│       ├── bicaptioning_R_50_L1_H512.yaml
│       └── bicaptioning_R_50_L1_H768.yaml
├── docs/
│   ├── Makefile
│   ├── _templates/
│   │   └── layout.html
│   ├── conf.py
│   ├── index.rst
│   └── virtex/
│       ├── config.rst
│       ├── data.datasets.rst
│       ├── data.rst
│       ├── data.tokenizers.rst
│       ├── data.transforms.rst
│       ├── factories.rst
│       ├── model_zoo.rst
│       ├── models.rst
│       ├── modules.embedding.rst
│       ├── modules.rst
│       ├── modules.textual_heads.rst
│       ├── modules.visual_backbones.rst
│       ├── optim.lookahead.rst
│       ├── optim.lr_scheduler.rst
│       ├── optim.rst
│       ├── usage/
│       │   ├── downstream.rst
│       │   ├── model_zoo.rst
│       │   ├── pretrain.rst
│       │   └── setup_dependencies.rst
│       ├── utils.beam_search.rst
│       ├── utils.checkpointing.rst
│       ├── utils.common.rst
│       ├── utils.distributed.rst
│       ├── utils.metrics.rst
│       ├── utils.rst
│       └── utils.timer.rst
├── hubconf.py
├── requirements.txt
├── scripts/
│   ├── build_vocabulary.py
│   ├── clf_linear.py
│   ├── clf_voc07.py
│   ├── eval_captioning.py
│   ├── eval_detectron2.py
│   └── pretrain_virtex.py
├── setup.py
└── virtex/
    ├── __init__.py
    ├── config.py
    ├── data/
    │   ├── __init__.py
    │   ├── datasets/
    │   │   ├── captioning.py
    │   │   ├── classification.py
    │   │   ├── coco_captions.py
    │   │   ├── downstream.py
    │   │   └── masked_lm.py
    │   ├── tokenizers.py
    │   └── transforms.py
    ├── factories.py
    ├── model_zoo/
    │   ├── __init__.py
    │   └── model_zoo.py
    ├── models/
    │   ├── __init__.py
    │   ├── captioning.py
    │   ├── classification.py
    │   └── masked_lm.py
    ├── modules/
    │   ├── embedding.py
    │   ├── textual_heads.py
    │   └── visual_backbones.py
    ├── optim/
    │   ├── __init__.py
    │   ├── lookahead.py
    │   └── lr_scheduler.py
    └── utils/
        ├── beam_search.py
        ├── checkpointing.py
        ├── common.py
        ├── distributed.py
        ├── metrics.py
        ├── nucleus_sampling.py
        └── timer.py