Full Code of lxtGH/CAE for AI

master d72597143e48 cached

236 files

1.2 MB

299.9k tokens

1281 symbols

1 requests

Download .txt

Showing preview only (1,272K chars total). Download the full file or copy to clipboard to get everything.

Repository: lxtGH/CAE
Branch: master
Commit: d72597143e48
Files: 236
Total size: 1.2 MB

Directory structure:
gitextract_wj4p2u9n/

├── .gitignore
├── .gitignore.swp
├── README.md
├── dall_e/
│   ├── __init__.py
│   ├── decoder.py
│   ├── encoder.py
│   └── utils.py
├── downstream_tasks/
│   ├── detection/
│   │   ├── README.md
│   │   ├── evaluation/
│   │   │   └── object_detection/
│   │   │       ├── configs/
│   │   │       │   ├── _base_/
│   │   │       │   │   ├── datasets/
│   │   │       │   │   │   └── coco_instance.py
│   │   │       │   │   ├── default_runtime.py
│   │   │       │   │   ├── models/
│   │   │       │   │   │   ├── cascade_mask_rcnn_r50_fpn.py
│   │   │       │   │   │   ├── cascade_mask_rcnn_swin_fpn.py
│   │   │       │   │   │   ├── cascade_mask_rcnn_vit_fpn.py
│   │   │       │   │   │   ├── mask_rcnn_r50_fpn.py
│   │   │       │   │   │   └── mask_rcnn_vit_fpn.py
│   │   │       │   │   └── schedules/
│   │   │       │   │       └── schedule_1x.py
│   │   │       │   └── mask_rcnn/
│   │   │       │       ├── vit_base_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00003.py
│   │   │       │       └── vit_large_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00002_lrdr0.85_dp0.2.py
│   │   │       ├── mmcv_custom/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── checkpoint.py
│   │   │       │   ├── layer_decay_optimizer_constructor.py
│   │   │       │   ├── prepare_rpe.py
│   │   │       │   ├── register_backbone.py
│   │   │       │   └── runner/
│   │   │       │       ├── __init__.py
│   │   │       │       ├── checkpoint.py
│   │   │       │       └── epoch_based_runner.py
│   │   │       ├── test.py
│   │   │       └── train.py
│   │   ├── loader.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── head.py
│   │   │   ├── swin_transformer.py
│   │   │   └── vision_transformer.py
│   │   ├── scripts/
│   │   │   ├── run_eval.sh
│   │   │   ├── run_train_maskrcnn_vit_base.sh
│   │   │   └── run_train_maskrcnn_vit_large.sh
│   │   └── utils.py
│   └── semantic_segmentation/
│       ├── README.md
│       ├── backbone/
│       │   ├── beit.py
│       │   ├── beit_fapn.py
│       │   ├── cae.py
│       │   ├── fapn.py
│       │   └── mae.py
│       ├── configs_local/
│       │   ├── _base_/
│       │   │   ├── datasets/
│       │   │   │   ├── ade20k.py
│       │   │   │   ├── ade20k_640x640.py
│       │   │   │   ├── chase_db1.py
│       │   │   │   ├── cityscapes.py
│       │   │   │   ├── cityscapes_769x769.py
│       │   │   │   ├── coco-stuff10k.py
│       │   │   │   ├── drive.py
│       │   │   │   ├── hrf.py
│       │   │   │   ├── pascal_context.py
│       │   │   │   ├── pascal_voc12.py
│       │   │   │   ├── pascal_voc12_aug.py
│       │   │   │   └── stare.py
│       │   │   ├── default_runtime.py
│       │   │   ├── models/
│       │   │   │   ├── ann_r50-d8.py
│       │   │   │   ├── apcnet_r50-d8.py
│       │   │   │   ├── ccnet_r50-d8.py
│       │   │   │   ├── cgnet.py
│       │   │   │   ├── danet_r50-d8.py
│       │   │   │   ├── deeplabv3_r50-d8.py
│       │   │   │   ├── deeplabv3_unet_s5-d16.py
│       │   │   │   ├── deeplabv3plus_r50-d8.py
│       │   │   │   ├── dmnet_r50-d8.py
│       │   │   │   ├── dnl_r50-d8.py
│       │   │   │   ├── emanet_r50-d8.py
│       │   │   │   ├── encnet_r50-d8.py
│       │   │   │   ├── fast_scnn.py
│       │   │   │   ├── fcn_hr18.py
│       │   │   │   ├── fcn_r50-d8.py
│       │   │   │   ├── fcn_unet_s5-d16.py
│       │   │   │   ├── fpn_r50.py
│       │   │   │   ├── gcnet_r50-d8.py
│       │   │   │   ├── lraspp_m-v3-d8.py
│       │   │   │   ├── nonlocal_r50-d8.py
│       │   │   │   ├── ocrnet_hr18.py
│       │   │   │   ├── ocrnet_r50-d8.py
│       │   │   │   ├── pointrend_r50.py
│       │   │   │   ├── psanet_r50-d8.py
│       │   │   │   ├── pspnet_r50-d8.py
│       │   │   │   ├── pspnet_unet_s5-d16.py
│       │   │   │   ├── upernet_cae.py
│       │   │   │   └── upernet_r50.py
│       │   │   └── schedules/
│       │   │       ├── schedule_160k.py
│       │   │       ├── schedule_20k.py
│       │   │       ├── schedule_320k.py
│       │   │       ├── schedule_40k.py
│       │   │       └── schedule_80k.py
│       │   ├── beit/
│       │   │   └── upernet_beit_base_12_512_slide_160k_ade20k_pt_4e-4.py
│       │   ├── cae/
│       │   │   └── upernet/
│       │   │       ├── upernet_cae_base_12_512_slide_160k_ade20k_pt_1e-4.py
│       │   │       ├── upernet_cae_base_12_512_slide_160k_ade20k_pt_2e-4.py
│       │   │       ├── upernet_cae_base_12_512_slide_160k_ade20k_pt_3e-4.py
│       │   │       └── upernet_cae_large_24_512_slide_160k_ade20k_pt_decay095_4e-5_dp015.py
│       │   └── mae/
│       │       └── upernet_mae_large_12_512_slide_160k_ade20k_pt_4e-4.py
│       ├── mmcv_custom/
│       │   ├── __init__.py
│       │   ├── apex_runner/
│       │   │   ├── __init__.py
│       │   │   ├── apex_iter_based_runner.py
│       │   │   ├── checkpoint.py
│       │   │   └── optimizer.py
│       │   ├── checkpoint.py
│       │   ├── checkpoint_beit.py
│       │   ├── layer_decay_optimizer_constructor.py
│       │   ├── resize_transform.py
│       │   └── train_api.py
│       ├── mmseg/
│       │   ├── __init__.py
│       │   ├── apis/
│       │   │   ├── __init__.py
│       │   │   ├── inference.py
│       │   │   ├── test.py
│       │   │   └── train.py
│       │   ├── core/
│       │   │   ├── __init__.py
│       │   │   ├── evaluation/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── class_names.py
│       │   │   │   ├── eval_hooks.py
│       │   │   │   └── metrics.py
│       │   │   ├── seg/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── builder.py
│       │   │   │   └── sampler/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── base_pixel_sampler.py
│       │   │   │       └── ohem_pixel_sampler.py
│       │   │   └── utils/
│       │   │       ├── __init__.py
│       │   │       └── misc.py
│       │   ├── datasets/
│       │   │   ├── __init__.py
│       │   │   ├── ade.py
│       │   │   ├── builder.py
│       │   │   ├── chase_db1.py
│       │   │   ├── cityscapes.py
│       │   │   ├── coco_stuff.py
│       │   │   ├── custom.py
│       │   │   ├── dataset_wrappers.py
│       │   │   ├── drive.py
│       │   │   ├── hrf.py
│       │   │   ├── pascal_context.py
│       │   │   ├── pipelines/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── compose.py
│       │   │   │   ├── formating.py
│       │   │   │   ├── loading.py
│       │   │   │   ├── test_time_aug.py
│       │   │   │   └── transforms.py
│       │   │   ├── stare.py
│       │   │   └── voc.py
│       │   ├── models/
│       │   │   ├── __init__.py
│       │   │   ├── backbones/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── cgnet.py
│       │   │   │   ├── fast_scnn.py
│       │   │   │   ├── hrnet.py
│       │   │   │   ├── mobilenet_v2.py
│       │   │   │   ├── mobilenet_v3.py
│       │   │   │   ├── resnest.py
│       │   │   │   ├── resnet.py
│       │   │   │   ├── resnext.py
│       │   │   │   └── unet.py
│       │   │   ├── builder.py
│       │   │   ├── decode_heads/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── ann_head.py
│       │   │   │   ├── apc_head.py
│       │   │   │   ├── aspp_head.py
│       │   │   │   ├── cascade_decode_head.py
│       │   │   │   ├── cc_head.py
│       │   │   │   ├── da_head.py
│       │   │   │   ├── decode_head.py
│       │   │   │   ├── dm_head.py
│       │   │   │   ├── dnl_head.py
│       │   │   │   ├── ema_head.py
│       │   │   │   ├── enc_head.py
│       │   │   │   ├── fcn_head.py
│       │   │   │   ├── fpn_head.py
│       │   │   │   ├── gc_head.py
│       │   │   │   ├── lraspp_head.py
│       │   │   │   ├── nl_head.py
│       │   │   │   ├── ocr_head.py
│       │   │   │   ├── point_head.py
│       │   │   │   ├── psa_head.py
│       │   │   │   ├── psp_head.py
│       │   │   │   ├── sep_aspp_head.py
│       │   │   │   ├── sep_fcn_head.py
│       │   │   │   └── uper_head.py
│       │   │   ├── losses/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── accuracy.py
│       │   │   │   ├── cross_entropy_loss.py
│       │   │   │   ├── lovasz_loss.py
│       │   │   │   └── utils.py
│       │   │   ├── necks/
│       │   │   │   ├── __init__.py
│       │   │   │   └── fpn.py
│       │   │   ├── segmentors/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base.py
│       │   │   │   ├── cascade_encoder_decoder.py
│       │   │   │   └── encoder_decoder.py
│       │   │   └── utils/
│       │   │       ├── __init__.py
│       │   │       ├── inverted_residual.py
│       │   │       ├── make_divisible.py
│       │   │       ├── res_layer.py
│       │   │       ├── se_layer.py
│       │   │       ├── self_attention_block.py
│       │   │       └── up_conv_block.py
│       │   ├── ops/
│       │   │   ├── __init__.py
│       │   │   ├── encoding.py
│       │   │   └── wrappers.py
│       │   ├── utils/
│       │   │   ├── __init__.py
│       │   │   ├── collect_env.py
│       │   │   └── logger.py
│       │   └── version.py
│       └── tools/
│           ├── dist_test.sh
│           ├── dist_train.sh
│           ├── test.py
│           └── train.py
├── furnace/
│   ├── dataset_folder.py
│   ├── datasets.py
│   ├── engine_for_finetuning.py
│   ├── engine_for_pretraining.py
│   ├── masking_generator.py
│   ├── optim_factory.py
│   ├── transforms.py
│   └── utils.py
├── linear_util/
│   ├── crop.py
│   ├── datasets.py
│   ├── engine_finetune.py
│   ├── lars.py
│   ├── lr_decay.py
│   ├── lr_sched.py
│   ├── misc.py
│   └── pos_embed.py
├── models/
│   ├── modeling_cae.py
│   ├── modeling_cae_helper.py
│   ├── modeling_discrete_vae.py
│   └── modeling_finetune.py
├── requirements.txt
├── scripts/
│   ├── cae_base_800e.sh
│   ├── cae_base_finetune.sh
│   ├── cae_large_1600e.sh
│   └── cae_large_finetune.sh
├── tokenizer-weights/
│   └── README
└── tools/
    ├── run_attentive.py
    ├── run_class_finetuning.py
    ├── run_linear.py
    └── run_pretraining.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.DS_Store


================================================
FILE: README.md
================================================
# CAE: Context AutoEncoder for Self-Supervised Representation Learning 

<p align="center">
  <img src='furnace/CAE.png'>
</p>

This is a PyTorch implementation of [CAE: Context AutoEncoder for Self-Supervised Representation Learning](https://arxiv.org/abs/2202.03026).

## Highlights

- State-of-the-art MIM performance. Results in the paper are successfully reproduced.

## Installation

Clone the repo and install required packages.
```bash
pip install -r requirements.txt

# install apex
git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
```

## Data Preparation
First, download ImageNet-1k from http://image-net.org/.

The directory structure is the standard layout of torchvision's datasets.ImageFolder. The training and validation data are expected to be in the train/ folder and val folder, respectively:

```
/path/to/imagenet/
  train/
    class1/
      img1.jpeg
    class2/
      img2.jpeg
  val/
    class1/
      img3.jpeg
    class/2
      img4.jpeg
```

Second, download the pretrained tokenizer.

```bash
TOKENIZER_PATH=/path/to/save/dall_e_tokenizer_weight
mkdir -p $TOKENIZER_PATH
wget -o $TOKENIZER_PATH/encoder.pkl https://cdn.openai.com/dall-e/encoder.pkl
wget -o $TOKENIZER_PATH/decoder.pkl https://cdn.openai.com/dall-e/decoder.pkl
```


## Pretraining

Here is an example that pretrains CAE-base on ImageNet-1K with 32 GPUs. Please see [scripts/cae_base_800e.sh](scripts/cae_base_800e.sh) for complete script.
```bash
OMP_NUM_THREADS=1 $PYTHON -m torch.distributed.launch \
  --nproc_per_node=8 \
  tools/run_pretraining.py \
  --data_path ${DATA_PATH} \
  --output_dir ${OUTPUT_DIR} \
  --model cae_base_patch16_224_8k_vocab --discrete_vae_weight_path ${TOKENIZER_PATH} \
  --batch_size 64 --lr 1.5e-3 --warmup_epochs 20 --epochs 800 \
  --clip_grad 3.0 --layer_scale_init_value 0.1 \
  --imagenet_default_mean_and_std \
  --color_jitter 0 \
  --drop_path 0.1 \
  --sincos_pos_emb \
  --mask_generator block \
  --num_mask_patches 98 \
  --decoder_layer_scale_init_value 0.1 \
  --no_auto_resume \
  --save_ckpt_freq 100 \
  --exp_name $my_name \
  --regressor_depth 4 \
  --decoder_depth 4 \
  --align_loss_weight 2
```
- `--num_mask_patches`: number of the input patches need be masked. 
- `--batch_size`: batch size per GPU.
- Effective batch size = `number of GPUs` * `--batch_size`. So in the above example, the effective batch size is `64*32 = 2048`.
- `--lr`: learning rate.
- `--warmup_epochs`: learning rate warmup epochs. Warm up [10, 20, 40] epochs for [300, 800, 1600] pretrain epochs respectively.
- `--epochs`: total pretraining epochs.
- `--clip_grad`: clip gradient norm.
- `--drop_path`: stochastic depth rate.
- `--imagenet_default_mean_and_std`: enable this for ImageNet-1k pretraining, i.e., `(0.485, 0.456, 0.406)` for mean and `(0.229, 0.224, 0.225)` for std. For other pretraining data, use `(0.5, 0.5, 0.5)` for mean and `(0.5, 0.5, 0.5)` for std by default.
- `--layer_scale_init_value`: 0.1 for base, 1e-5 for large, set 0 to disable layerscale. We set `--decoder_layer_scale_init_value` the same as this.
- `--sincos_pos_emb`: adopt sin-cos positional embedding during pretraining.
- `--regressor_depth`: length of the regressor.
- `--decoder_depth`: length of the decoder.
- `--align_loss_weight`: weight for alignment loss. 2 by default.

Warmup epochs for 300/800/1600 epochs pretraining are 10/20/40.

For CAE-large, please refer to [scripts/cae_large_1600e.sh](scripts/cae_large_1600e.sh). 


## Results
Here provides the results of CAE-base/CAE-large for these evaluation tasks:
- Linear probing
- Attentive probing
- Fine-tuning
- Semantic segmentation
- Object detection and instance segmentation

Pretrained weights and logs are available ([Google Drive](https://drive.google.com/drive/folders/1wwhg7nj2GQuU9uthVuQLkEEXEjx90G7g?usp=sharing), [Baidu Cloud [Code: 4kil]](https://pan.baidu.com/s/15eZGoI72iLupLrOHqmOM9w)). *: from CAE paper.

| Model      | Pretraining data | #Epoch | Linear | Attentive | Fine-tuning | ADE Seg | COCO Det | COCO InstSeg |
| ---------- | ---------------- | ------ | ------ | --------- | ----------- | ------- | -------- | ------------ |
| MAE-base*  | ImageNet-1K      | 1600   | 67.8   | 74.2      | 83.6        | 48.1    | 48.4     | 42.6         |
| MAE-large* | ImageNet-1K      | 1600   | 76.0   | 78.8      | 86.0        | 53.6    | 54.0     | 47.1         |
| CAE-base   | ImageNet-1K      | 300    | 64.5   | 74.0      | 83.6        | 48.1    | 48.3     | 42.7         |
| CAE-base   | ImageNet-1K      | 800    | 68.9   | 75.9      | 83.8        | 49.7    | 49.9     | 43.9         |
| CAE-base   | ImageNet-1K      | 1600   | 70.3   | 77.2      | 83.9        | 50.3    | 50.3     | 44.2         |
| CAE-large  | ImageNet-1K      | 1600   | 77.8   | 81.2      | 86.2        | 54.9    | 54.5     | 47.5         |


### Linear Probing
- Please refer to [scripts/cae_base_800e.sh](scripts/cae_base_800e.sh) (32 GPUs).  
- For CAE-large, just replace `--model cae_base_patch16_224` with `--model cae_large_patch16_224`.

### Attentive Probing

- Please refer to [scripts/cae_base_800e.sh](scripts/cae_base_800e.sh) (32 GPUs). 
- For CAE-large, just replace `--model cae_base_patch16_224` with `--model cae_large_patch16_224`.

### Fine-tuning
- Please refer to [scripts/cae_base_finetune.sh](scripts/cae_base_finetune.sh) (32 GPUs). 
- For CAE-large, please refer to [scripts/cae_large_finetune.sh](scripts/cae_large_finetune.sh) (32 GPUs).

### Segmentation & Detection
- Please refer to [downstream_tasks](./downstream_tasks) dir to get started.

## Acknowledgement

This repository is built using the [BEiT](https://github.com/microsoft/unilm/edit/master/beit) and [MMSelfSup](https://github.com/open-mmlab/mmselfsup), thanks for their open-source code! Thanks also to the CAE authors for their excellent work!

## Citation
```bibtex
@article{ContextAutoencoder2022,
  title={Context Autoencoder for Self-Supervised Representation Learning},
  author={Chen, Xiaokang and Ding, Mingyu and Wang, Xiaodi and Xin, Ying and Mo, Shentong and Wang, Yunhao and Han, Shumin and Luo, Ping and Zeng, Gang and Wang, Jingdong},
  journal={arXiv preprint arXiv:2202.03026},
  year={2022}
}
```


================================================
FILE: dall_e/__init__.py
================================================
import io, requests
import torch
import torch.nn as nn

from dall_e.encoder import Encoder
from dall_e.decoder import Decoder
from dall_e.utils   import map_pixels, unmap_pixels

def load_model(path: str, device: torch.device = None) -> nn.Module:
    if path.startswith('http://') or path.startswith('https://'):
        resp = requests.get(path)
        resp.raise_for_status()
            
        with io.BytesIO(resp.content) as buf:
            return torch.load(buf, map_location=device)
    else:
        with open(path, 'rb') as f:
            return torch.load(f, map_location=device)


================================================
FILE: dall_e/decoder.py
================================================
import attr
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from collections  import OrderedDict
from functools    import partial
from dall_e.utils import Conv2d

@attr.s(eq=False, repr=False)
class DecoderBlock(nn.Module):
	n_in:     int = attr.ib(validator=lambda i, a, x: x >= 1)
	n_out:    int = attr.ib(validator=lambda i, a, x: x >= 1 and x % 4 ==0)
	n_layers: int = attr.ib(validator=lambda i, a, x: x >= 1)

	device:        torch.device = attr.ib(default=None)
	requires_grad: bool         = attr.ib(default=False)

	def __attrs_post_init__(self) -> None:
		super().__init__()
		self.n_hid = self.n_out // 4
		self.post_gain = 1 / (self.n_layers ** 2)

		make_conv     = partial(Conv2d, device=self.device, requires_grad=self.requires_grad)
		self.id_path  = make_conv(self.n_in, self.n_out, 1) if self.n_in != self.n_out else nn.Identity()
		self.res_path = nn.Sequential(OrderedDict([
				('relu_1', nn.ReLU()),
				('conv_1', make_conv(self.n_in,  self.n_hid, 1)),
				('relu_2', nn.ReLU()),
				('conv_2', make_conv(self.n_hid, self.n_hid, 3)),
				('relu_3', nn.ReLU()),
				('conv_3', make_conv(self.n_hid, self.n_hid, 3)),
				('relu_4', nn.ReLU()),
				('conv_4', make_conv(self.n_hid, self.n_out, 3)),]))

	def forward(self, x: torch.Tensor) -> torch.Tensor:
		return self.id_path(x) + self.post_gain * self.res_path(x)

@attr.s(eq=False, repr=False)
class Decoder(nn.Module):
	group_count:     int = 4
	n_init:          int = attr.ib(default=128,  validator=lambda i, a, x: x >= 8)
	n_hid:           int = attr.ib(default=256,  validator=lambda i, a, x: x >= 64)
	n_blk_per_group: int = attr.ib(default=2,    validator=lambda i, a, x: x >= 1)
	output_channels: int = attr.ib(default=3,    validator=lambda i, a, x: x >= 1)
	vocab_size:      int = attr.ib(default=8192, validator=lambda i, a, x: x >= 512)

	device:              torch.device = attr.ib(default=torch.device('cpu'))
	requires_grad:       bool         = attr.ib(default=False)
	use_mixed_precision: bool         = attr.ib(default=True)

	def __attrs_post_init__(self) -> None:
		super().__init__()

		blk_range  = range(self.n_blk_per_group)
		n_layers   = self.group_count * self.n_blk_per_group
		make_conv  = partial(Conv2d, device=self.device, requires_grad=self.requires_grad)
		make_blk   = partial(DecoderBlock, n_layers=n_layers, device=self.device,
				requires_grad=self.requires_grad)

		self.blocks = nn.Sequential(OrderedDict([
			('input', make_conv(self.vocab_size, self.n_init, 1, use_float16=False)),
			('group_1', nn.Sequential(OrderedDict([
				*[(f'block_{i + 1}', make_blk(self.n_init if i == 0 else 8 * self.n_hid, 8 * self.n_hid)) for i in blk_range],
				('upsample', nn.Upsample(scale_factor=2, mode='nearest')),
			]))),
			('group_2', nn.Sequential(OrderedDict([
				*[(f'block_{i + 1}', make_blk(8 * self.n_hid if i == 0 else 4 * self.n_hid, 4 * self.n_hid)) for i in blk_range],
				('upsample', nn.Upsample(scale_factor=2, mode='nearest')),
			]))),
			('group_3', nn.Sequential(OrderedDict([
				*[(f'block_{i + 1}', make_blk(4 * self.n_hid if i == 0 else 2 * self.n_hid, 2 * self.n_hid)) for i in blk_range],
				('upsample', nn.Upsample(scale_factor=2, mode='nearest')),
			]))),
			('group_4', nn.Sequential(OrderedDict([
				*[(f'block_{i + 1}', make_blk(2 * self.n_hid if i == 0 else 1 * self.n_hid, 1 * self.n_hid)) for i in blk_range],
			]))),
			('output', nn.Sequential(OrderedDict([
				('relu', nn.ReLU()),
				('conv', make_conv(1 * self.n_hid, 2 * self.output_channels, 1)),
			]))),
		]))

	def forward(self, x: torch.Tensor) -> torch.Tensor:
		if len(x.shape) != 4:
			raise ValueError(f'input shape {x.shape} is not 4d')
		if x.shape[1] != self.vocab_size:
			raise ValueError(f'input has {x.shape[1]} channels but model built for {self.vocab_size}')
		if x.dtype != torch.float32:
			raise ValueError('input must have dtype torch.float32')

		return self.blocks(x)


================================================
FILE: dall_e/encoder.py
================================================
import attr
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from collections  import OrderedDict
from functools    import partial
from dall_e.utils import Conv2d

@attr.s(eq=False, repr=False)
class EncoderBlock(nn.Module):
	n_in:     int = attr.ib(validator=lambda i, a, x: x >= 1)
	n_out:    int = attr.ib(validator=lambda i, a, x: x >= 1 and x % 4 ==0)
	n_layers: int = attr.ib(validator=lambda i, a, x: x >= 1)

	device:        torch.device = attr.ib(default=None)
	requires_grad: bool         = attr.ib(default=False)

	def __attrs_post_init__(self) -> None:
		super().__init__()
		self.n_hid = self.n_out // 4
		self.post_gain = 1 / (self.n_layers ** 2)

		make_conv     = partial(Conv2d, device=self.device, requires_grad=self.requires_grad)
		self.id_path  = make_conv(self.n_in, self.n_out, 1) if self.n_in != self.n_out else nn.Identity()
		self.res_path = nn.Sequential(OrderedDict([
				('relu_1', nn.ReLU()),
				('conv_1', make_conv(self.n_in,  self.n_hid, 3)),
				('relu_2', nn.ReLU()),
				('conv_2', make_conv(self.n_hid, self.n_hid, 3)),
				('relu_3', nn.ReLU()),
				('conv_3', make_conv(self.n_hid, self.n_hid, 3)),
				('relu_4', nn.ReLU()),
				('conv_4', make_conv(self.n_hid, self.n_out, 1)),]))

	def forward(self, x: torch.Tensor) -> torch.Tensor:
		return self.id_path(x) + self.post_gain * self.res_path(x)

@attr.s(eq=False, repr=False)
class Encoder(nn.Module):
	group_count:     int = 4
	n_hid:           int = attr.ib(default=256,  validator=lambda i, a, x: x >= 64)
	n_blk_per_group: int = attr.ib(default=2,    validator=lambda i, a, x: x >= 1)
	input_channels:  int = attr.ib(default=3,    validator=lambda i, a, x: x >= 1)
	vocab_size:      int = attr.ib(default=8192, validator=lambda i, a, x: x >= 512)

	device:              torch.device = attr.ib(default=torch.device('cpu'))
	requires_grad:       bool         = attr.ib(default=False)
	use_mixed_precision: bool         = attr.ib(default=True)

	def __attrs_post_init__(self) -> None:
		super().__init__()

		blk_range  = range(self.n_blk_per_group)
		n_layers   = self.group_count * self.n_blk_per_group
		make_conv  = partial(Conv2d, device=self.device, requires_grad=self.requires_grad)
		make_blk   = partial(EncoderBlock, n_layers=n_layers, device=self.device,
				requires_grad=self.requires_grad)

		self.blocks = nn.Sequential(OrderedDict([
			('input', make_conv(self.input_channels, 1 * self.n_hid, 7)),
			('group_1', nn.Sequential(OrderedDict([
				*[(f'block_{i + 1}', make_blk(1 * self.n_hid, 1 * self.n_hid)) for i in blk_range],
				('pool', nn.MaxPool2d(kernel_size=2)),
			]))),
			('group_2', nn.Sequential(OrderedDict([
				*[(f'block_{i + 1}', make_blk(1 * self.n_hid if i == 0 else 2 * self.n_hid, 2 * self.n_hid)) for i in blk_range],
				('pool', nn.MaxPool2d(kernel_size=2)),
			]))),
			('group_3', nn.Sequential(OrderedDict([
				*[(f'block_{i + 1}', make_blk(2 * self.n_hid if i == 0 else 4 * self.n_hid, 4 * self.n_hid)) for i in blk_range],
				('pool', nn.MaxPool2d(kernel_size=2)),
			]))),
			('group_4', nn.Sequential(OrderedDict([
				*[(f'block_{i + 1}', make_blk(4 * self.n_hid if i == 0 else 8 * self.n_hid, 8 * self.n_hid)) for i in blk_range],
			]))),
			('output', nn.Sequential(OrderedDict([
				('relu', nn.ReLU()),
				('conv', make_conv(8 * self.n_hid, self.vocab_size, 1, use_float16=False)),
			]))),
		]))

	def forward(self, x: torch.Tensor) -> torch.Tensor:
		if len(x.shape) != 4:
			raise ValueError(f'input shape {x.shape} is not 4d')
		if x.shape[1] != self.input_channels:
			raise ValueError(f'input has {x.shape[1]} channels but model built for {self.input_channels}')
		if x.dtype != torch.float32:
			raise ValueError('input must have dtype torch.float32')

		return self.blocks(x)


================================================
FILE: dall_e/utils.py
================================================
import attr
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

logit_laplace_eps: float = 0.1

@attr.s(eq=False)
class Conv2d(nn.Module):
	n_in:  int = attr.ib(validator=lambda i, a, x: x >= 1)
	n_out: int = attr.ib(validator=lambda i, a, x: x >= 1)
	kw:    int = attr.ib(validator=lambda i, a, x: x >= 1 and x % 2 == 1)

	use_float16:   bool         = attr.ib(default=True)
	device:        torch.device = attr.ib(default=torch.device('cpu'))
	requires_grad: bool         = attr.ib(default=False)

	def __attrs_post_init__(self) -> None:
		super().__init__()

		w = torch.empty((self.n_out, self.n_in, self.kw, self.kw), dtype=torch.float32,
			device=self.device, requires_grad=self.requires_grad)
		w.normal_(std=1 / math.sqrt(self.n_in * self.kw ** 2))

		b = torch.zeros((self.n_out,), dtype=torch.float32, device=self.device,
			requires_grad=self.requires_grad)
		self.w, self.b = nn.Parameter(w), nn.Parameter(b)

	def forward(self, x: torch.Tensor) -> torch.Tensor:
		if self.use_float16 and 'cuda' in self.w.device.type:
			if x.dtype != torch.float16:
				x = x.half()

			w, b = self.w.half(), self.b.half()
		else:
			if x.dtype != torch.float32:
				x = x.float()

			w, b = self.w, self.b

		return F.conv2d(x, w, b, padding=(self.kw - 1) // 2)

def map_pixels(x: torch.Tensor) -> torch.Tensor:
	if x.dtype != torch.float:
		raise ValueError('expected input to have type float')

	return (1 - 2 * logit_laplace_eps) * x + logit_laplace_eps

def unmap_pixels(x: torch.Tensor) -> torch.Tensor:
	if len(x.shape) != 4:
		raise ValueError('expected input to be 4d')
	if x.dtype != torch.float:
		raise ValueError('expected input to have type float')

	return torch.clamp((x - logit_laplace_eps) / (1 - 2 * logit_laplace_eps), 0, 1)


================================================
FILE: downstream_tasks/detection/README.md
================================================

# COCO Detection and Instance segmentation with CAE

# Installation

Please install [PyTorch](https://pytorch.org/). This codebase has been developed with python version 3.6, PyTorch version 1.7.1, CUDA 11.0 and torchvision 0.8.2. To get the full dependencies, please run:

```bash
pip3 install -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.1/index.html mmcv-full==1.3.9
pip3 install pytest-runner scipy tensorboardX faiss-gpu==1.6.1 tqdm lmdb sklearn pyarrow==2.0.0 timm DALL-E munkres six einops

# install apex
pip3 install git+https://github.com/NVIDIA/apex \
    --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext"

# install mmdetection for object detection & instance segmentation
git clone https://github.com/SwinTransformer/Swin-Transformer-Object-Detection
cd Swin-Transformer-Object-Detection
pip3 install -r requirements/build.txt
pip3 install -v -e .
cd ..
```


## Fine-tuning with Mask R-CNN
#### We use 16 GPUs for these experiments, $NNODES = 2.

- To train ViT-B/16 with Mask R-CNN as the task layer, run:
```bash
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=$NNODES \
    --node_rank=$RANK \
    --master_addr=$ADDRESS \
    --master_port=$PORT \
    evaluation/object_detection/train.py evaluation/object_detection/configs/mask_rcnn/vit_base_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00003.py \
    --launcher pytorch \
    --work-dir $OUTPUT_DIR \
    --no-validate \
    --deterministic \
    --cfg-options model.backbone.use_checkpoint=True \
    model.pretrained=$PRETRAINED \
    ${@:6}
```

- To train ViT-L/16 with Mask R-CNN as the task layer, run:
```bash
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=$NNODES \
    --node_rank=$RANK \
    --master_addr=$ADDRESS \
    --master_port=$PORT \
    evaluation/object_detection/train.py evaluation/object_detection/configs/mask_rcnn/vit_large_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00002_lrdr0.85_dp0.2.py \
    --launcher pytorch \
    --work-dir $OUTPUT_DIR \
    --no-validate \
    --deterministic \
    --cfg-options model.backbone.use_checkpoint=True \
	model.pretrained=$PRETRAINED \
    ${@:6}
```

- To evaluate Mask R-CNN, run:
```bash
python -m torch.distributed.launch --nproc_per_node=8 \
    evaluation/object_detection/test.py \
    $CONFIG \
    $MODEL \
    --launcher pytorch \
    --eval bbox segm \
    --cfg-options model.backbone.use_checkpoint=True \
    ${@:6}
```

## Results (pretrined models are trained on ImageNet-1K without label)
| Backbone | #Pretrained Epoch | Object Det | Instance Seg |
| -------- | ----------------- | ---------- | ------------ |
| ViT-B    | 300               | 48.3       | 42.7         |
| ViT-B    | 800               | 49.9       | 43.9         |
| ViT-B    | 1600              | 50.3       | 44.2         |
| ViT-L    | 1600              | 54.5       | 47.5         |


## Acknowledgement

This repository is built using the [IBOT repository](https://github.com/bytedance/ibot). Thanks for their open-source code!


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/_base_/datasets/coco_instance.py
================================================
dataset_type = 'CocoDataset'
data_root = '/path/to/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_train2017.json',
        img_prefix=data_root + 'train2017/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline))
evaluation = dict(metric=['bbox', 'segm'])


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/_base_/default_runtime.py
================================================
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
custom_hooks = [dict(type='NumClassCheckHook')]

dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
================================================
ettings
model = dict(
    type='CascadeRCNN',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ],
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False)
        ]),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))



================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/cascade_mask_rcnn_swin_fpn.py
================================================
# model settings
model = dict(
    type='CascadeRCNN',
    pretrained=None,
    backbone=dict(
        type='SwinTransformer',
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        ape=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        use_checkpoint=False),
    neck=dict(
        type='FPN',
        in_channels=[96, 192, 384, 768],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ],
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg = dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_across_levels=False,
            nms_pre=2000,
            nms_post=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False)
        ]),
    test_cfg = dict(
        rpn=dict(
            nms_across_levels=False,
            nms_pre=1000,
            nms_post=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))

================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/cascade_mask_rcnn_vit_fpn.py
================================================
# model settings
model = dict(
    type='CascadeRCNN',
    pretrained=None,
    backbone=dict(
        type='VisionTransformer',
        img_size=[672, 1092],
        patch_size=16,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4.,
        qkv_bias=True,
        drop_path_rate=0.1,
        out_indices=(3, 5, 7, 11),
        use_checkpoint=False),
    neck=dict(
        type='FPN',
        in_channels=[384, 384, 384, 384],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ],
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg = dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_across_levels=False,
            nms_pre=2000,
            nms_post=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False)
        ]),
    test_cfg = dict(
        rpn=dict(
            nms_across_levels=False,
            nms_pre=1000,
            nms_post=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))

================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/mask_rcnn_r50_fpn.py
================================================
# model settings
model = dict(
    type='MaskRCNN',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))



================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/mask_rcnn_vit_fpn.py
================================================
# model settings
model = dict(
    type='MaskRCNN',
    pretrained=None,
    backbone=dict(
        type='VisionTransformer',
        img_size=[672, 1092],
        patch_size=16,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4.,
        qkv_bias=True,
        drop_path_rate=0.1,
        out_indices=(3, 5, 7, 11),
        use_checkpoint=False),
    neck=dict(
        type='FPN',
        in_channels=[384, 384, 384, 384],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=80,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_across_levels=False,
            nms_pre=2000,
            nms_post=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_across_levels=False,
            nms_pre=1000,
            nms_post=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))



================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/_base_/schedules/schedule_1x.py
================================================
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[8, 11])
runner = dict(type='EpochBasedRunner', max_epochs=12)

================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/mask_rcnn/vit_base_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00003.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Mostly copy-paste from timm, mmdet, and swin code bases
https://github.com/rwightman/pytorch-image-models/tree/master/timm
https://github.com/open-mmlab/mmdetection
https://github.com/SwinTransformer/Swin-Transformer-Object-Detection
"""

_base_ = [
    '../_base_/models/mask_rcnn_vit_fpn.py',
    '../_base_/datasets/coco_instance.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]

model = dict(
    backbone=dict(
        embed_dim=768,
        depth=12,
        num_heads=12,
        init_values=0.1, 
        mlp_ratio=4.,
        drop_path_rate=0.2, #see if 0.1 larger than vit-small is better
		use_abs_pos_emb=False,
		use_sincos_pos_emb=True,
		use_rel_pos_bias=False,
    ),
    neck=dict(in_channels=[768, 768, 768, 768]),
    roi_head=dict(
        bbox_head=dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0))))

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# augmentation strategy originates from DETR / Sparse RCNN
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='AutoAugment',
         policies=[
             [
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                                 (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                                 (736, 1333), (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True)
             ],
             [
                 dict(type='Resize',
                      img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True),
                 dict(type='RandomCrop',
                      crop_type='absolute_range',
                      crop_size=(384, 600),
                      allow_negative_crop=True),
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      override=True,
                      keep_ratio=True)
             ]
         ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
data = dict(
    		samples_per_gpu=2,
   			workers_per_gpu=2,
			train=dict(pipeline=train_pipeline))

optimizer = dict(_delete_=True, type='AdamW', lr=0.0003, betas=(0.9, 0.999), weight_decay=0.05,
                 constructor='LayerDecayOptimizerConstructor', 
                 paramwise_cfg=dict(num_layers=12, layer_decay_rate=0.75))

lr_config = dict(step=[9, 11])
runner = dict(type='EpochBasedRunnerAmp', max_epochs=12)

# do not use mmdet version fp16
fp16 = None
optimizer_config = dict(
    type="DistOptimizerHook",
    update_interval=1,
    grad_clip=None,
    coalesce=True,
    bucket_size_mb=-1,
    use_fp16=True,
)


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/configs/mask_rcnn/vit_large_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00002_lrdr0.85_dp0.2.py
================================================
#Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Mostly copy-paste from timm, mmdet, and swin code bases
https://github.com/rwightman/pytorch-image-models/tree/master/timm
https://github.com/open-mmlab/mmdetection
https://github.com/SwinTransformer/Swin-Transformer-Object-Detection
"""

_base_ = [
    '../_base_/models/mask_rcnn_vit_fpn.py',
    '../_base_/datasets/coco_instance.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]

find_unused_parameters = False
model = dict(
    backbone=dict(
        embed_dim=1024,
        depth=24,
        num_heads=16,
        init_values=0.00001, 
        mlp_ratio=4.,
        drop_path_rate=0.2, #see if 0.1 larger than vit-small is better
		use_abs_pos_emb=False,
		use_sincos_pos_emb=True,
		use_rel_pos_bias=False,
		out_indices=[7, 11, 15, 23],
    ),
    neck=dict(in_channels=[1024, 1024, 1024, 1024]),
    roi_head=dict(
        bbox_head=dict(
                type='ConvFCBBoxHead',
                num_shared_convs=4,
                num_shared_fcs=1,
                in_channels=256,
                conv_out_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                norm_cfg=dict(type='SyncBN', requires_grad=True),
                loss_cls=dict(
                    type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=10.0))))

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# augmentation strategy originates from DETR / Sparse RCNN
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='AutoAugment',
         policies=[
             [
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                                 (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                                 (736, 1333), (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True)
             ],
             [
                 dict(type='Resize',
                      img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                      multiscale_mode='value',
                      keep_ratio=True),
                 dict(type='RandomCrop',
                      crop_type='absolute_range',
                      crop_size=(384, 600),
                      allow_negative_crop=True),
                 dict(type='Resize',
                      img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                 (576, 1333), (608, 1333), (640, 1333),
                                 (672, 1333), (704, 1333), (736, 1333),
                                 (768, 1333), (800, 1333)],
                      multiscale_mode='value',
                      override=True,
                      keep_ratio=True)
             ]
         ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
data = dict(
    		samples_per_gpu=2,
   			workers_per_gpu=2,
			train=dict(pipeline=train_pipeline))

optimizer = dict(_delete_=True, type='AdamW', lr=0.0002, betas=(0.9, 0.999), weight_decay=0.05,
                 constructor='LayerDecayOptimizerConstructor', 
                 paramwise_cfg=dict(num_layers=24, layer_decay_rate=0.85))

lr_config = dict(step=[9, 11])
runner = dict(type='EpochBasedRunnerAmp', max_epochs=12)

# do not use mmdet version fp16
fp16 = None
optimizer_config = dict(
    type="DistOptimizerHook",
    update_interval=1,
    grad_clip=None,
    coalesce=True,
    bucket_size_mb=-1,
    use_fp16=True,
)


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/__init__.py
================================================
# -*- coding: utf-8 -*-

from .checkpoint import load_checkpoint
from .layer_decay_optimizer_constructor import LayerDecayOptimizerConstructor
from .register_backbone import VisionTransformer

__all__ = ['load_checkpoint', 'LayerDecayOptimizerConstructor']


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/checkpoint.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Copy-paste from mmcv library:
https://github.com/open-mmlab/mmcv/
"""

import io
import os
import os.path as osp
import pkgutil
import time
import warnings
from collections import OrderedDict
from importlib import import_module
from tempfile import TemporaryDirectory

import torch
import torchvision
from torch.optim import Optimizer
from torch.nn import functional as F

import mmcv
from mmcv.fileio import FileClient
from mmcv.fileio import load as load_file
from mmcv.parallel import is_module_wrapper
from mmcv.utils import mkdir_or_exist
from mmcv.runner import get_dist_info

from scipy import interpolate
import numpy as np
import math

ENV_MMCV_HOME = 'MMCV_HOME'
ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
DEFAULT_CACHE_DIR = '~/.cache'


def _get_mmcv_home():
    mmcv_home = os.path.expanduser(
        os.getenv(
            ENV_MMCV_HOME,
            os.path.join(
                os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv')))

    mkdir_or_exist(mmcv_home)
    return mmcv_home


def load_state_dict(module, state_dict, strict=False, logger=None):
    """Load state_dict to a module.

    This method is modified from :meth:`torch.nn.Module.load_state_dict`.
    Default value for ``strict`` is set to ``False`` and the message for
    param mismatch will be shown even if strict is False.

    Args:
        module (Module): Module that receives the state_dict.
        state_dict (OrderedDict): Weights.
        strict (bool): whether to strictly enforce that the keys
            in :attr:`state_dict` match the keys returned by this module's
            :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
        logger (:obj:`logging.Logger`, optional): Logger to log the error
            message. If not specified, print function will be used.
    """
    unexpected_keys = []
    all_missing_keys = []
    err_msg = []

    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    # use _load_from_state_dict to enable checkpoint version control
    def load(module, prefix=''):
        # recursively check parallel module in case that the model has a
        # complicated structure, e.g., nn.Module(nn.Module(DDP))
        if is_module_wrapper(module):
            module = module.module
        local_metadata = {} if metadata is None else metadata.get(
            prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     all_missing_keys, unexpected_keys,
                                     err_msg)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    load(module)
    load = None  # break load->load reference cycle

    # ignore "num_batches_tracked" of BN layers
    missing_keys = [
        key for key in all_missing_keys if 'num_batches_tracked' not in key
    ]

    if unexpected_keys:
        err_msg.append('unexpected key in source '
                       f'state_dict: {", ".join(unexpected_keys)}\n')
    if missing_keys:
        err_msg.append(
            f'missing keys in source state_dict: {", ".join(missing_keys)}\n')

    rank, _ = get_dist_info()
    if len(err_msg) > 0 and rank == 0:
        err_msg.insert(
            0, 'The model and loaded state dict do not match exactly\n')
        err_msg = '\n'.join(err_msg)
        if strict:
            raise RuntimeError(err_msg)
        elif logger is not None:
            logger.warning(err_msg)
        else:
            print(err_msg)


def load_url_dist(url, model_dir=None, map_location="cpu"):
    """In distributed setting, this function only download checkpoint at local
    rank 0."""
    rank, world_size = get_dist_info()
    rank = int(os.environ.get('LOCAL_RANK', rank))
    if rank == 0:
        checkpoint = model_zoo.load_url(url, model_dir=model_dir, map_location=map_location)
    if world_size > 1:
        torch.distributed.barrier()
        if rank > 0:
            checkpoint = model_zoo.load_url(url, model_dir=model_dir, map_location=map_location)
    return checkpoint


def load_pavimodel_dist(model_path, map_location=None):
    """In distributed setting, this function only download checkpoint at local
    rank 0."""
    try:
        from pavi import modelscloud
    except ImportError:
        raise ImportError(
            'Please install pavi to load checkpoint from modelcloud.')
    rank, world_size = get_dist_info()
    rank = int(os.environ.get('LOCAL_RANK', rank))
    if rank == 0:
        model = modelcloud.get(model_path)
        with TemporaryDirectory() as tmp_dir:
            downloaded_file = osp.join(tmp_dir, model.name)
            model.download(downloaded_file)
            checkpoint = torch.load(downloaded_file, map_location=map_location)
    if world_size > 1:
        torch.distributed.barrier()
        if rank > 0:
            model = modelcloud.get(model_path)
            with TemporaryDirectory() as tmp_dir:
                downloaded_file = osp.join(tmp_dir, model.name)
                model.download(downloaded_file)
                checkpoint = torch.load(
                    downloaded_file, map_location=map_location)
    return checkpoint


def load_fileclient_dist(filename, backend, map_location):
    """In distributed setting, this function only download checkpoint at local
    rank 0."""
    rank, world_size = get_dist_info()
    rank = int(os.environ.get('LOCAL_RANK', rank))
    allowed_backends = ['ceph']
    if backend not in allowed_backends:
        raise ValueError(f'Load from Backend {backend} is not supported.')
    if rank == 0:
        fileclient = FileClient(backend=backend)
        buffer = io.BytesIO(fileclient.get(filename))
        checkpoint = torch.load(buffer, map_location=map_location)
    if world_size > 1:
        torch.distributed.barrier()
        if rank > 0:
            fileclient = FileClient(backend=backend)
            buffer = io.BytesIO(fileclient.get(filename))
            checkpoint = torch.load(buffer, map_location=map_location)
    return checkpoint


def get_torchvision_models():
    model_urls = dict()
    for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
        if ispkg:
            continue
        _zoo = import_module(f'torchvision.models.{name}')
        if hasattr(_zoo, 'model_urls'):
            _urls = getattr(_zoo, 'model_urls')
            model_urls.update(_urls)
    return model_urls


def get_external_models():
    mmcv_home = _get_mmcv_home()
    default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json')
    default_urls = load_file(default_json_path)
    assert isinstance(default_urls, dict)
    external_json_path = osp.join(mmcv_home, 'open_mmlab.json')
    if osp.exists(external_json_path):
        external_urls = load_file(external_json_path)
        assert isinstance(external_urls, dict)
        default_urls.update(external_urls)

    return default_urls


def get_mmcls_models():
    mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json')
    mmcls_urls = load_file(mmcls_json_path)

    return mmcls_urls


def get_deprecated_model_names():
    deprecate_json_path = osp.join(mmcv.__path__[0],
                                   'model_zoo/deprecated.json')
    deprecate_urls = load_file(deprecate_json_path)
    assert isinstance(deprecate_urls, dict)

    return deprecate_urls


def _process_mmcls_checkpoint(checkpoint):
    state_dict = checkpoint['state_dict']
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith('backbone.'):
            new_state_dict[k[9:]] = v
    new_checkpoint = dict(state_dict=new_state_dict)

    return new_checkpoint


def _load_checkpoint(filename, map_location=None):
    """Load checkpoint from somewhere (modelzoo, file, url).

    Args:
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str | None): Same as :func:`torch.load`. Default: None.

    Returns:
        dict | OrderedDict: The loaded checkpoint. It can be either an
            OrderedDict storing model weights or a dict containing other
            information, which depends on the checkpoint.
    """
    if filename.startswith('modelzoo://'):
        warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
                      'use "torchvision://" instead')
        model_urls = get_torchvision_models()
        model_name = filename[11:]
        checkpoint = load_url_dist(model_urls[model_name])
    elif filename.startswith('torchvision://'):
        model_urls = get_torchvision_models()
        model_name = filename[14:]
        checkpoint = load_url_dist(model_urls[model_name])
    elif filename.startswith('open-mmlab://'):
        model_urls = get_external_models()
        model_name = filename[13:]
        deprecated_urls = get_deprecated_model_names()
        if model_name in deprecated_urls:
            warnings.warn(f'open-mmlab://{model_name} is deprecated in favor '
                          f'of open-mmlab://{deprecated_urls[model_name]}')
            model_name = deprecated_urls[model_name]
        model_url = model_urls[model_name]
        # check if is url
        if model_url.startswith(('http://', 'https://')):
            checkpoint = load_url_dist(model_url)
        else:
            filename = osp.join(_get_mmcv_home(), model_url)
            if not osp.isfile(filename):
                raise IOError(f'{filename} is not a checkpoint file')
            checkpoint = torch.load(filename, map_location=map_location)
    elif filename.startswith('mmcls://'):
        model_urls = get_mmcls_models()
        model_name = filename[8:]
        checkpoint = load_url_dist(model_urls[model_name])
        checkpoint = _process_mmcls_checkpoint(checkpoint)
    elif filename.startswith(('http://', 'https://')):
        checkpoint = load_url_dist(filename)
    elif filename.startswith('pavi://'):
        model_path = filename[7:]
        checkpoint = load_pavimodel_dist(model_path, map_location=map_location)
    elif filename.startswith('s3://'):
        checkpoint = load_fileclient_dist(
            filename, backend='ceph', map_location=map_location)
    else:
        if not osp.isfile(filename):
            raise IOError(f'{filename} is not a checkpoint file')
        checkpoint = torch.load(filename, map_location=map_location)
    return checkpoint


def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0,
                     start_warmup_value=0, warmup_steps=-1):
    warmup_schedule = np.array([])
    warmup_iters = warmup_epochs * niter_per_ep
    if warmup_steps > 0:
        warmup_iters = warmup_steps
    print("Set warmup steps = %d" % warmup_iters)
    if warmup_epochs > 0:
        warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)

    iters = np.arange(epochs * niter_per_ep - warmup_iters)
    schedule = np.array(
        [final_value + 0.5 * (base_value - final_value) * (1 + math.cos(math.pi * i / (len(iters)))) for i in iters])

    schedule = np.concatenate((warmup_schedule, schedule))

    assert len(schedule) == epochs * niter_per_ep
    return schedule


def load_checkpoint(model,
                    filename,
                    map_location='cpu',
                    strict=False,
                    logger=None):
    """Load checkpoint from a file or URI.

    Args:
        model (Module): Module to load checkpoint.
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str): Same as :func:`torch.load`.
        strict (bool): Whether to allow different params for the model and
            checkpoint.
        logger (:mod:`logging.Logger` or None): The logger for error message.

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """
    checkpoint = _load_checkpoint(filename, map_location)
    # OrderedDict is a subclass of dict
    if not isinstance(checkpoint, dict):
        raise RuntimeError(
            f'No state_dict found in checkpoint file {filename}')
    # get state_dict from checkpoint
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    elif 'model' in checkpoint:
        state_dict = checkpoint['model']
    elif 'module' in checkpoint:
        state_dict = checkpoint['module']
    else:
        state_dict = checkpoint
    # strip prefix of state_dict
    if list(state_dict.keys())[0].startswith('module.'):
        state_dict = {k[7:]: v for k, v in state_dict.items()}

    # for MoBY, load model of online branch
    if sorted(list(state_dict.keys()))[0].startswith('encoder'):
        state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')}


    all_keys = list(state_dict.keys())
    if all_keys[-1].startswith('encoder_to_decoder') or all_keys[-1].startswith('decoder'):
        # NOTE: remove all decoder keys
        all_keys = [key for key in all_keys if key.startswith('encoder.')]
        for key in all_keys:
            new_key = key.replace('encoder.','')
            state_dict[new_key] = state_dict[key]
            state_dict.pop(key)
            
        for key in list(state_dict.keys()):
            if key.startswith('decoder.'):
                state_dict.pop(key)

        # NOTE: replace norm with fc_norm
        for key in list(state_dict.keys()):
            if key.startswith('norm.'):
                new_key = key.replace('norm.','fc_norm.')
                state_dict[new_key] = state_dict[key]
                state_dict.pop(key)

    # reshape absolute position embedding for Swin
    if state_dict.get('absolute_pos_embed') is not None:
        absolute_pos_embed = state_dict['absolute_pos_embed']
        N1, L, C1 = absolute_pos_embed.size()
        N2, C2, H, W = model.absolute_pos_embed.size()
        if N1 != N2 or C1 != C2 or L != H*W:
            logger.warning("Error in loading absolute_pos_embed, pass")
        else:
            state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2)
    
    rank, _ = get_dist_info()
    if "rel_pos_bias.relative_position_bias_table" in state_dict:
        if rank == 0:
            rel_pos_bias = state_dict["rel_pos_bias.relative_position_bias_table"]
            state_dict["relative_position_bias_table"] = rel_pos_bias
            state_dict.pop("rel_pos_bias.relative_position_bias_table")
    all_keys = list(state_dict.keys())
    for key in all_keys:
        if "relative_position_index" in key:
            state_dict.pop(key)

        if "relative_position_bias_table" in key and key in model.state_dict():
            rel_pos_bias = state_dict[key]
            src_num_pos, num_attn_heads = rel_pos_bias.size()
            dst_num_pos, _ = model.state_dict()[key].size()
            dst_patch_shape = model.patch_embed.patch_shape
            if dst_patch_shape[0] != dst_patch_shape[1]:
                num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1)

                src_size = int((src_num_pos - num_extra_tokens) ** 0.5)   # 27 
                
                dst_size_0 = dst_patch_shape[0] * 2 - 1   # 42
                dst_size_1 = dst_patch_shape[1] * 2 - 1   # 68

                if src_size != dst_size_0:
                    if rank == 0:
                        print("Position interpolate for %s from %dx%d to %dx%d" % (
                            key, src_size, src_size, dst_size_0, dst_size_1))

                    extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
                    rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]

                    def geometric_progression(a, r, n):
                        return a * (1.0 - r ** n) / (1.0 - r)

                    left, right = 1.01, 1.5
                    while right - left > 1e-6:
                        q = (left + right) / 2.0
                        gp = geometric_progression(1, q, src_size // 2)
                        if gp > dst_size_0 // 2:
                            right = q
                        else:
                            left = q

                    dis_0 = []
                    cur = 1
                    for i in range(src_size // 2):
                        dis_0.append(cur)
                        cur += q ** (i + 1)
                    
                    r_ids_0 = [-_ for _ in reversed(dis_0)]

                    top, bottom = 1.01, 1.5
                    while bottom - top > 1e-6:
                        q = (top + bottom) / 2.0
                        gp = geometric_progression(1, q, src_size // 2)
                        if gp > dst_size_1 // 2:
                            bottom = q
                        else:
                            top = q

                    dis_1 = []
                    cur = 1
                    for i in range(src_size // 2):
                        dis_1.append(cur)
                        cur += q ** (i + 1)
                    
                    r_ids_1 = [-_ for _ in reversed(dis_1)]

                    # if q > 1.13492:
                    #     q = 1.13492

                    x = r_ids_0 + [0] + dis_0
                    y = r_ids_1 + [0] + dis_1

                    t_0 = dst_size_0 // 2.0
                    t_1 = dst_size_1 // 2.0

                    dx = np.arange(-t_0, t_0 + 0.1, 1.0)
                    dy = np.arange(-t_1, t_1 + 0.1, 1.0)

                    if rank == 0:
                        print("x = {}".format(x))
                        print("dx = {}".format(dx))

                    all_rel_pos_bias = []

                    for i in range(num_attn_heads):
                        z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy()
                        f = interpolate.interp2d(x, y, z, kind='cubic')
                        all_rel_pos_bias.append(
                            torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device))

                    rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
                    new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0)
                    state_dict[key] = new_rel_pos_bias

            else:
                num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1)

                src_size = int((src_num_pos - num_extra_tokens) ** 0.5)   # 27 
                dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)   # 

                if src_size != dst_size:
                    if rank == 0:
                        print("Position interpolate for %s from %dx%d to %dx%d" % (
                            key, src_size, src_size, dst_size, dst_size))

                    extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
                    rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]

                    def geometric_progression(a, r, n):
                        return a * (1.0 - r ** n) / (1.0 - r)

                    left, right = 1.01, 1.5
                    while right - left > 1e-6:
                        q = (left + right) / 2.0
                        gp = geometric_progression(1, q, src_size // 2)
                        if gp > dst_size // 2:
                            right = q
                        else:
                            left = q

                    # if q > 1.13492:
                    #     q = 1.13492

                    dis = []
                    cur = 1
                    for i in range(src_size // 2):
                        dis.append(cur)
                        cur += q ** (i + 1)

                    r_ids = [-_ for _ in reversed(dis)]

                    x = r_ids + [0] + dis
                    y = r_ids + [0] + dis

                    t = dst_size // 2.0
                    dx = np.arange(-t, t + 0.1, 1.0)
                    dy = np.arange(-t, t + 0.1, 1.0)
                    if rank == 0:
                        print("x = {}".format(x))
                        print("dx = {}".format(dx))

                    all_rel_pos_bias = []

                    for i in range(num_attn_heads):
                        z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy()
                        f = interpolate.interp2d(x, y, z, kind='cubic')
                        all_rel_pos_bias.append(
                            torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device))

                    rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
                    new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0)
                    state_dict[key] = new_rel_pos_bias


    if 'pos_embed' in state_dict:
        pos_embed_checkpoint = state_dict['pos_embed']
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        #new_size = int(num_patches ** 0.5)
        new_size_w = model.patch_embed.num_patches_w
        new_size_h = model.patch_embed.num_patches_h
        # class_token and dist_token are kept unchanged
        if orig_size != new_size_h or orig_size != new_size_w:
            if rank == 0:
                print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size_w, new_size_h))
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(
                pos_tokens, size=(new_size_w, new_size_h), mode='bicubic', align_corners=False)
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            state_dict['pos_embed'] = new_pos_embed

    # interpolate position bias table if needed
    relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k and  k in model.state_dict()]
    for table_key in relative_position_bias_table_keys:
        table_pretrained = state_dict[table_key]
        table_current = model.state_dict()[table_key]
        L1, nH1 = table_pretrained.size()
        L2, nH2 = table_current.size()
        if nH1 != nH2:
            logger.warning(f"Error in loading {table_key}, pass")
        else:
            if L1 != L2:
                S1 = int(L1 ** 0.5)
                S2 = int(L2 ** 0.5)
                table_pretrained_resized = F.interpolate(
                     table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
                     size=(S2, S2), mode='bicubic')
                state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0)

    # load state_dict
    load_state_dict(model, state_dict, strict, logger)
    return checkpoint


def weights_to_cpu(state_dict):
    """Copy a model state_dict to cpu.

    Args:
        state_dict (OrderedDict): Model weights on GPU.

    Returns:
        OrderedDict: Model weights on GPU.
    """
    state_dict_cpu = OrderedDict()
    for key, val in state_dict.items():
        state_dict_cpu[key] = val.cpu()
    return state_dict_cpu


def _save_to_state_dict(module, destination, prefix, keep_vars):
    """Saves module state to `destination` dictionary.

    This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.

    Args:
        module (nn.Module): The module to generate state_dict.
        destination (dict): A dict where state will be stored.
        prefix (str): The prefix for parameters and buffers used in this
            module.
    """
    for name, param in module._parameters.items():
        if param is not None:
            destination[prefix + name] = param if keep_vars else param.detach()
    for name, buf in module._buffers.items():
        # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
        if buf is not None:
            destination[prefix + name] = buf if keep_vars else buf.detach()


def get_state_dict(module, destination=None, prefix='', keep_vars=False):
    """Returns a dictionary containing a whole state of the module.

    Both parameters and persistent buffers (e.g. running averages) are
    included. Keys are corresponding parameter and buffer names.

    This method is modified from :meth:`torch.nn.Module.state_dict` to
    recursively check parallel module in case that the model has a complicated
    structure, e.g., nn.Module(nn.Module(DDP)).

    Args:
        module (nn.Module): The module to generate state_dict.
        destination (OrderedDict): Returned dict for the state of the
            module.
        prefix (str): Prefix of the key.
        keep_vars (bool): Whether to keep the variable property of the
            parameters. Default: False.

    Returns:
        dict: A dictionary containing a whole state of the module.
    """
    # recursively check parallel module in case that the model has a
    # complicated structure, e.g., nn.Module(nn.Module(DDP))
    if is_module_wrapper(module):
        module = module.module

    # below is the same as torch.nn.Module.state_dict()
    if destination is None:
        destination = OrderedDict()
        destination._metadata = OrderedDict()
    destination._metadata[prefix[:-1]] = local_metadata = dict(
        version=module._version)
    _save_to_state_dict(module, destination, prefix, keep_vars)
    for name, child in module._modules.items():
        if child is not None:
            get_state_dict(
                child, destination, prefix + name + '.', keep_vars=keep_vars)
    for hook in module._state_dict_hooks.values():
        hook_result = hook(module, destination, prefix, local_metadata)
        if hook_result is not None:
            destination = hook_result
    return destination


def save_checkpoint(model, filename, optimizer=None, meta=None):
    """Save checkpoint to file.

    The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
    ``optimizer``. By default ``meta`` will contain version and time info.

    Args:
        model (Module): Module whose params are to be saved.
        filename (str): Checkpoint filename.
        optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
        meta (dict, optional): Metadata to be saved in checkpoint.
    """
    if meta is None:
        meta = {}
    elif not isinstance(meta, dict):
        raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
    meta.update(mmcv_version=mmcv.__version__, time=time.asctime())

    if is_module_wrapper(model):
        model = model.module

    if hasattr(model, 'CLASSES') and model.CLASSES is not None:
        # save class name to the meta
        meta.update(CLASSES=model.CLASSES)

    checkpoint = {
        'meta': meta,
        'state_dict': weights_to_cpu(get_state_dict(model))
    }
    # save optimizer state dict in the checkpoint
    if isinstance(optimizer, Optimizer):
        checkpoint['optimizer'] = optimizer.state_dict()
    elif isinstance(optimizer, dict):
        checkpoint['optimizer'] = {}
        for name, optim in optimizer.items():
            checkpoint['optimizer'][name] = optim.state_dict()

    if filename.startswith('pavi://'):
        try:
            from pavi import modelscloud
            from pavi.exception import NodeNotFoundError
        except ImportError:
            raise ImportError(
                'Please install pavi to load checkpoint from modelcloud.')
        model_path = filename[7:]
        root = modelcloud.Folder()
        model_dir, model_name = osp.split(model_path)
        try:
            model = modelcloud.get(model_dir)
        except NodeNotFoundError:
            model = root.create_training_model(model_dir)
        with TemporaryDirectory() as tmp_dir:
            checkpoint_file = osp.join(tmp_dir, model_name)
            with open(checkpoint_file, 'wb') as f:
                torch.save(checkpoint, f)
                f.flush()
            model.create_file(checkpoint_file, name=model_name)
    else:
        mmcv.mkdir_or_exist(osp.dirname(filename))
        # immediately flush buffer
        with open(filename, 'wb') as f:
            torch.save(checkpoint, f)
            f.flush()


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/layer_decay_optimizer_constructor.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Mostly copy-paste from BEiT library:
https://github.com/microsoft/unilm/blob/master/beit/semantic_segmentation/mmcv_custom/layer_decay_optimizer_constructor.py
"""

import json

from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor
from mmcv.runner import get_dist_info

def get_num_layer_for_vit(var_name, num_max_layer):
    if var_name in ("backbone.cls_token", "backbone.mask_token", "backbone.pos_embed"):
        return 0
    elif var_name.startswith("backbone.patch_embed"):
        return 0
    elif var_name.startswith("backbone.blocks"):
        layer_id = int(var_name.split('.')[2])
        return layer_id + 1
    else:
        return num_max_layer - 1


@OPTIMIZER_BUILDERS.register_module()
class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
    def add_params(self, params, module, prefix='', is_dcn_module=None):
        """Add all parameters of module to the params list.
        The parameters of the given module will be added to the list of param
        groups, with specific rules defined by paramwise_cfg.
        Args:
            params (list[dict]): A list of param groups, it will be modified
                in place.
            module (nn.Module): The module to be added.
            prefix (str): The prefix of the module
            is_dcn_module (int|float|None): If the current module is a
                submodule of DCN, `is_dcn_module` will be passed to
                control conv_offset layer's learning rate. Defaults to None.
        """
        parameter_groups = {}
        print(self.paramwise_cfg)
        num_layers = self.paramwise_cfg.get('num_layers') + 2
        layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
        print("Build LayerDecayOptimizerConstructor %f - %d" % (layer_decay_rate, num_layers))
        weight_decay = self.base_wd

        for name, param in module.named_parameters():
            if not param.requires_grad:
                continue  # frozen weights
            if len(param.shape) == 1 or name.endswith(".bias") or name in ('pos_embed', 'cls_token'):
                group_name = "no_decay"
                this_weight_decay = 0.
            else:
                group_name = "decay"
                this_weight_decay = weight_decay

            layer_id = get_num_layer_for_vit(name, num_layers)
            group_name = "layer_%d_%s" % (layer_id, group_name)

            if group_name not in parameter_groups:
                scale = layer_decay_rate ** (num_layers - layer_id - 1)

                parameter_groups[group_name] = {
                    "weight_decay": this_weight_decay,
                    "params": [],
                    "param_names": [], 
                    "lr_scale": scale, 
                    "group_name": group_name, 
                    "lr": scale * self.base_lr, 
                }

            parameter_groups[group_name]["params"].append(param)
            parameter_groups[group_name]["param_names"].append(name)
        rank, _ = get_dist_info()
        if rank == 0:
            to_display = {}
            for key in parameter_groups:
                to_display[key] = {
                    "param_names": parameter_groups[key]["param_names"], 
                    "lr_scale": parameter_groups[key]["lr_scale"], 
                    "lr": parameter_groups[key]["lr"], 
                    "weight_decay": parameter_groups[key]["weight_decay"], 
                }
            print("Param groups = %s" % json.dumps(to_display, indent=2))
        
        # state_dict = module.state_dict()
        # for group_name in parameter_groups:
        #     group = parameter_groups[group_name]
        #     for name in group["param_names"]:
        #         group["params"].append(state_dict[name])
        params.extend(parameter_groups.values())


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/prepare_rpe.py
================================================
import torch

import numpy as np
from scipy import interpolate

from mmcv.runner import get_dist_info
import torch.nn as nn

def rpe_index(window_size):
    num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3

    # get pair-wise relative position index for each token inside the window
    coords_h = torch.arange(window_size[0])
    coords_w = torch.arange(window_size[1])
    coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
    coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
    relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
    relative_coords[:, :, 1] += window_size[1] - 1
    relative_coords[:, :, 0] *= 2 * window_size[1] - 1
    relative_position_index = \
        torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
    relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
    relative_position_index[0, 0:] = num_relative_distance - 3
    relative_position_index[0:, 0] = num_relative_distance - 2
    relative_position_index[0, 0] = num_relative_distance - 1

    return relative_position_index

def prepare_rpe(rel_pos_bias, src_patch_shape, dst_patch_shape):
    src_num_pos, num_attn_heads = rel_pos_bias.size()   # 732

    rank, _ = get_dist_info()
    dst_num_pos = (dst_patch_shape[0]*2 -1) * (dst_patch_shape[1]*2 -1) + 3 

    if dst_patch_shape[0] != src_patch_shape[0] or dst_patch_shape[1] != src_patch_shape[1]:

        num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1)

        # src_size = int((src_num_pos - num_extra_tokens) ** 0.5)   # 27 
        src_size_0, src_size_1 = src_patch_shape[0] * 2 - 1, src_patch_shape[1]*2 -1
        extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
        rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]

        dst_size_0 = dst_patch_shape[0] * 2 - 1   # 42
        dst_size_1 = dst_patch_shape[1] * 2 - 1   # 68

        dim = rel_pos_bias.shape[-1]
        rel_pos_bias = rel_pos_bias.reshape(1 , src_size_0, src_size_1, dim).permute(0, 3, 1, 2)
        new_rel_pos_bias = nn.functional.interpolate(rel_pos_bias, scale_factor=(dst_size_0 / src_size_0, dst_size_1 / dst_size_1), mode='bicubic',) 
        new_rel_pos_bias = new_rel_pos_bias.permute(0, 2, 3, 1).view(1, -1, dim).squeeze(0)
        new_rel_pos_bias = torch.cat((new_rel_pos_bias, extra_tokens), dim=0)
    else:
        new_rel_pos_bias = rel_pos_bias
    # get rpe_index
    relative_position_index = rpe_index(dst_patch_shape)
    new_rel_pos_bias = new_rel_pos_bias[relative_position_index.view(-1)].view(
                    dst_patch_shape[0] * dst_patch_shape[1] + 1,
                    dst_patch_shape[0] * dst_patch_shape[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
    new_rel_pos_bias = new_rel_pos_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
    return new_rel_pos_bias


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/register_backbone.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import os
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint

from mmcv_custom import load_checkpoint
from mmdet.utils import get_root_logger
from mmdet.models.builder import BACKBONES
from models import VisionTransformer
from .prepare_rpe import prepare_rpe
import time

class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """
    def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        self.num_patches_w = img_size[0] // patch_size
        self.num_patches_h = img_size[1] // patch_size

        num_patches = self.num_patches_w * self.num_patches_h
        self.patch_shape = (img_size[0] // patch_size, img_size[1] // patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
            
    def forward(self, x, mask=None):
        B, C, H, W = x.shape
        return self.proj(x)

@BACKBONES.register_module()
class VisionTransformer(VisionTransformer):
    def __init__(self,
                 img_size,
                 patch_size,
                 embed_dim,
                 in_chans=3,
                 with_fpn=True,
                 frozen_stages=-1,
                 out_indices=[3, 5, 7, 11],
                 out_with_norm=False,
                 use_checkpoint=False,
                 **kwargs):
        super(VisionTransformer, self).__init__(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim, 
            **kwargs)
        
        # support non-square image as input
        if len(img_size) == 1:
            img_size = img_size * 2
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches
        if self.use_abs_pos_emb:
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        elif self.use_sincos_pos_emb:
            self.pos_embed = self.build_2d_sincos_position_embedding(embed_dim)
        else:
            self.pos_embed = None

        
        self.patch_size = patch_size
        self.with_fpn = with_fpn
        self.frozen_stages = frozen_stages
        self.out_indices = out_indices
        self.use_checkpoint = use_checkpoint

        if not out_with_norm:
            self.norm = nn.Identity()

        if with_fpn and patch_size == 16:
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
                nn.SyncBatchNorm(embed_dim),
                nn.GELU(),
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn3 = nn.Identity()

            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
        elif with_fpn and patch_size == 8:
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Identity()

            self.fpn3 = nn.Sequential(
                nn.MaxPool2d(kernel_size=2, stride=2),
            )

            self.fpn4 = nn.Sequential(
                nn.MaxPool2d(kernel_size=4, stride=4),
            )
        else:
            logger = get_root_logger()
            logger.info('Build model without FPN.')


    def build_2d_sincos_position_embedding(self, embed_dim=768, temperature=10000., decode=False):
        h, w = self.patch_embed.patch_shape 
        grid_w = torch.arange(w, dtype=torch.float32)
        grid_h = torch.arange(h, dtype=torch.float32)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h)
        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
        omega = 1. / (temperature ** omega)
        out_w = torch.einsum('m,d->md', [grid_w.flatten(), omega])
        out_h = torch.einsum('m,d->md', [grid_h.flatten(), omega])
        pos_emb = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1)[None, :, :]

        pe_token = torch.zeros([1, 1, embed_dim], dtype=torch.float32)
        pos_embed = nn.Parameter(torch.cat([pe_token, pos_emb], dim=1))
        pos_embed.requires_grad = False
        return pos_embed

    def train(self, mode=True):
        """Convert the model into training mode while keep layers freezed."""
        super(VisionTransformer, self).train(mode)
        self._freeze_stages()
        if self.pos_embed is not None:
            if self.pos_embed.requires_grad:
                print("=================pos_embed update ================")
            else:
                print("=================pos_embed static ================")
            

    def _freeze_stages(self):
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False
            self.cls_token.requires_grad = False
            if self.pos_embed is not None and self.use_sincos_pos_emb == True:
                self.pos_embed.requires_grad = False
            self.pos_drop.eval()

        for i in range(1, self.frozen_stages + 1):
            
            if i  == len(self.blocks):
                norm_layer = getattr(self, 'norm') #f'norm{i-1}')
                norm_layer.eval()
                for param in norm_layer.parameters():
                    param.requires_grad = False

            m = self.blocks[i - 1]
            m.eval()
            for param in m.parameters():
                param.requires_grad = False
            
    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.
        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """

        if isinstance(pretrained, str):
            self.apply(self._init_weights)
            logger = get_root_logger()
            if  os.path.isfile(pretrained):
                load_checkpoint(self, pretrained, strict=False, logger=logger)
            else:
                logger.info(f"checkpoint path {pretrained} is invalid, we skip it and initialize net randomly")
        elif pretrained is None:
            self.apply(self._init_weights)
        else:
            raise TypeError('pretrained must be a str or None')

    def interpolate_pos_encoding(self, x, w, h):
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        w0 = w // self.patch_embed.patch_size
        h0 = h // self.patch_embed.patch_size
        if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h:
            return self.pos_embed
        class_pos_embed = self.pos_embed[:, 0]
        patch_pos_embed = self.pos_embed[:, 1:]
        dim = x.shape[-1]
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1
        
        tmp=patch_pos_embed.reshape(1, self.patch_embed.num_patches_w, self.patch_embed.num_patches_h, dim).permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, self.patch_embed.num_patches_w, self.patch_embed.num_patches_h, dim).permute(0, 3, 1, 2),
            scale_factor=(w0 / self.patch_embed.num_patches_w, h0 / self.patch_embed.num_patches_h),
            mode='bicubic',
        )
        assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, x):
        B, _, H, W = x.shape
        Hp, Wp = H // self.patch_size, W // self.patch_size
            
        x = self.prepare_tokens(x)
        features = []
        
        time_begin = time.time()
        if self.relative_position_bias_table is None:
            x_rpe = None
        else:
            dst_rpe_shape = (Wp, Hp) if H <= W else(Hp, Wp) 
            x_rpe = prepare_rpe(self.relative_position_bias_table, self.patch_embed.patch_shape, dst_rpe_shape)
        for i, blk in enumerate(self.blocks):
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, x_rpe)
            else:
                x = blk(x, x_rpe)
            if i in self.out_indices:
                xp = self.norm(x[:, 1:, :]).permute(0, 2, 1).reshape(B, -1, Hp, Wp)       
                features.append(xp.contiguous())
        time_backbone = time.time()
        
        if self.with_fpn:
            ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
            for i in range(len(features)):
                features[i] = ops[i](features[i])
        time_end = time.time()
        return tuple(features)


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/runner/__init__.py
================================================

# Copyright (c) Open-MMLab. All rights reserved.
from .checkpoint import save_checkpoint
from .epoch_based_runner import EpochBasedRunnerAmp


__all__ = [
    'EpochBasedRunnerAmp', 'save_checkpoint'
]

================================================
FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/runner/checkpoint.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Copy-paste from mmcv library:
https://github.com/open-mmlab/mmcv/
"""

import os.path as osp
import time
import torch
import mmcv
try:
    import apex
except:
    print('apex is not installed')

from tempfile import TemporaryDirectory
from torch.optim import Optimizer
from mmcv.parallel import is_module_wrapper
from mmcv.runner.checkpoint import weights_to_cpu, get_state_dict

def save_checkpoint(model, filename, optimizer=None, meta=None):
    """Save checkpoint to file.
    The checkpoint will have 4 fields: ``meta``, ``state_dict`` and
    ``optimizer``, ``amp``. By default ``meta`` will contain version
    and time info.
    Args:
        model (Module): Module whose params are to be saved.
        filename (str): Checkpoint filename.
        optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
        meta (dict, optional): Metadata to be saved in checkpoint.
    """
    if meta is None:
        meta = {}
    elif not isinstance(meta, dict):
        raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
    meta.update(mmcv_version=mmcv.__version__, time=time.asctime())

    if is_module_wrapper(model):
        model = model.module

    if hasattr(model, 'CLASSES') and model.CLASSES is not None:
        # save class name to the meta
        meta.update(CLASSES=model.CLASSES)

    checkpoint = {
        'meta': meta,
        'state_dict': weights_to_cpu(get_state_dict(model))
    }
    # save optimizer state dict in the checkpoint
    if isinstance(optimizer, Optimizer):
        checkpoint['optimizer'] = optimizer.state_dict()
    elif isinstance(optimizer, dict):
        checkpoint['optimizer'] = {}
        for name, optim in optimizer.items():
            checkpoint['optimizer'][name] = optim.state_dict()

    # save amp state dict in the checkpoint
    checkpoint['amp'] = apex.amp.state_dict()

    if filename.startswith('pavi://'):
        try:
            from pavi import modelscloud
            from pavi.exception import NodeNotFoundError
        except ImportError:
            raise ImportError(
                'Please install pavi to load checkpoint from modelcloud.')
        model_path = filename[7:]
        root = modelcloud.Folder()
        model_dir, model_name = osp.split(model_path)
        try:
            model = modelcloud.get(model_dir)
        except NodeNotFoundError:
            model = root.create_training_model(model_dir)
        with TemporaryDirectory() as tmp_dir:
            checkpoint_file = osp.join(tmp_dir, model_name)
            with open(checkpoint_file, 'wb') as f:
                torch.save(checkpoint, f)
                f.flush()
            model.create_file(checkpoint_file, name=model_name)
    else:
        mmcv.mkdir_or_exist(osp.dirname(filename))
        # immediately flush buffer
        with open(filename, 'wb') as f:
            torch.save(checkpoint, f)
            f.flush()


================================================
FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/runner/epoch_based_runner.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Copy-paste from mmcv library:
https://github.com/open-mmlab/mmcv/
"""

import os.path as osp
import platform
import shutil
import torch
import mmcv
try:
    import apex
except:
    print('apex is not installed')

from torch.optim import Optimizer
from mmcv.runner import RUNNERS, EpochBasedRunner
from .checkpoint import save_checkpoint

@RUNNERS.register_module()
class EpochBasedRunnerAmp(EpochBasedRunner):
    """Epoch-based Runner with AMP support.
    This runner train models epoch by epoch.
    """

    def save_checkpoint(self,
                        out_dir,
                        filename_tmpl='epoch_{}.pth',
                        save_optimizer=True,
                        meta=None,
                        create_symlink=True):
        """Save the checkpoint.
        Args:
            out_dir (str): The directory that checkpoints are saved.
            filename_tmpl (str, optional): The checkpoint filename template,
                which contains a placeholder for the epoch number.
                Defaults to 'epoch_{}.pth'.
            save_optimizer (bool, optional): Whether to save the optimizer to
                the checkpoint. Defaults to True.
            meta (dict, optional): The meta information to be saved in the
                checkpoint. Defaults to None.
            create_symlink (bool, optional): Whether to create a symlink
                "latest.pth" to point to the latest checkpoint.
                Defaults to True.
        """
        if meta is None:
            meta = dict(epoch=self.epoch + 1, iter=self.iter)
        elif isinstance(meta, dict):
            meta.update(epoch=self.epoch + 1, iter=self.iter)
        else:
            raise TypeError(
                f'meta should be a dict or None, but got {type(meta)}')
        if self.meta is not None:
            meta.update(self.meta)

        filename = filename_tmpl.format(self.epoch + 1)
        filepath = osp.join(out_dir, filename)
        optimizer = self.optimizer if save_optimizer else None
        save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta)
        # in some environments, `os.symlink` is not supported, you may need to
        # set `create_symlink` to False
        if create_symlink:
            dst_file = osp.join(out_dir, 'latest.pth')
            if platform.system() != 'Windows':
                mmcv.symlink(filename, dst_file)
            else:
                shutil.copy(filepath, dst_file)

    def resume(self,
               checkpoint,
               resume_optimizer=True,
               map_location='default'):
        if map_location == 'default':
            if torch.cuda.is_available():
                device_id = torch.cuda.current_device()
                checkpoint = self.load_checkpoint(
                    checkpoint,
                    map_location=lambda storage, loc: storage.cuda(device_id))
            else:
                checkpoint = self.load_checkpoint(checkpoint)
        else:
            checkpoint = self.load_checkpoint(
                checkpoint, map_location=map_location)

        self._epoch = checkpoint['meta']['epoch']
        self._iter = checkpoint['meta']['iter']
        if 'optimizer' in checkpoint and resume_optimizer:
            if isinstance(self.optimizer, Optimizer):
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            elif isinstance(self.optimizer, dict):
                for k in self.optimizer.keys():
                    self.optimizer[k].load_state_dict(
                        checkpoint['optimizer'][k])
            else:
                raise TypeError(
                    'Optimizer should be dict or torch.optim.Optimizer '
                    f'but got {type(self.optimizer)}')

        if 'amp' in checkpoint:
            apex.amp.load_state_dict(checkpoint['amp'])
            self.logger.info('load amp state dict')

        self.logger.info('resumed epoch %d, iter %d', self.epoch, self.iter)

================================================
FILE: downstream_tasks/detection/evaluation/object_detection/test.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Mostly copy-paste from mmdetection library:
https://github.com/open-mmlab/mmdetection/blob/master/tools/test.py
"""

import argparse
import os
import warnings
import mmcv
import torch

from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                         wrap_fp16_model)
from mmdet.apis import multi_gpu_test, single_gpu_test
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector

def parse_args():
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('--out', help='output result file in pickle format')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase'
        'the inference speed')
    parser.add_argument(
        '--format-only',
        action='store_true',
        help='Format the output results without perform evaluation. It is'
        'useful when you want to format the result to a specific format and '
        'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
        'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


def main():
    args = parse_args()

    assert args.out or args.eval or args.format_only or args.show \
        or args.show_dir, \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # import modules from string list.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None
    if cfg.model.get('neck'):
        if isinstance(cfg.model.neck, list):
            for neck_cfg in cfg.model.neck:
                if neck_cfg.get('rfp_backbone'):
                    if neck_cfg.rfp_backbone.get('pretrained'):
                        neck_cfg.rfp_backbone.pretrained = None
        elif cfg.model.neck.get('rfp_backbone'):
            if cfg.model.neck.rfp_backbone.get('pretrained'):
                cfg.model.neck.rfp_backbone.pretrained = None

    # in case the test dataset is concatenated
    samples_per_gpu = 1
    if isinstance(cfg.data.test, dict):
        cfg.data.test.test_mode = True
        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.test.pipeline = replace_ImageToTensor(
                cfg.data.test.pipeline)
    elif isinstance(cfg.data.test, list):
        for ds_cfg in cfg.data.test:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
        if samples_per_gpu > 1:
            for ds_cfg in cfg.data.test:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # build the dataloader
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # build the model and load checkpoint
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # old versions did not save class info in checkpoints, this walkaround is
    # for backward compatibility
    if 'CLASSES' in checkpoint.get('meta', {}):
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    if not distributed:
        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
                                  args.show_score_thr)
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)

    rank, _ = get_dist_info()
    if rank == 0:
        if args.out:
            print(f'\nwriting results to {args.out}')
            mmcv.dump(outputs, args.out)
        kwargs = {} if args.eval_options is None else args.eval_options
        if args.format_only:
            dataset.format_results(outputs, **kwargs)
        if args.eval:
            eval_kwargs = cfg.get('evaluation', {}).copy()
            # hard-code way to remove EvalHook args
            for key in [
                    'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
                    'rule'
            ]:
                eval_kwargs.pop(key, None)
            eval_kwargs.update(dict(metric=args.eval, **kwargs))
            print(dataset.evaluate(outputs, **eval_kwargs))


if __name__ == '__main__':
    main()

================================================
FILE: downstream_tasks/detection/evaluation/object_detection/train.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Mostly copy-paste from mmdetection library:
https://github.com/open-mmlab/mmdetection/blob/master/tools/train.py
"""

import argparse
import copy
import os
import os.path as osp
import time
import warnings
import mmcv
import torch

from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist
from mmcv.utils import get_git_hash
from mmdet import __version__
from mmdet.apis import set_random_seed, train_detector
from mmdet.datasets import build_dataset
from mmdet.models import build_detector
from mmdet.utils import collect_env, get_root_logger

def parse_args():
    parser = argparse.ArgumentParser(description='Train a detector')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument(
        '--gpus',
        type=int,
        help='number of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed training)')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file (deprecate), '
        'change to --cfg-options instead.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.cfg_options:
        raise ValueError(
            '--options and --cfg-options cannot be both '
            'specified, --options is deprecated in favor of --cfg-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --cfg-options')
        args.cfg_options = args.options

    return args


def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # import modules from string list.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    else:
        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        # re-set gpu_ids with distributed training mode
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # dump config
    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info
    meta['config'] = cfg.pretty_text
    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')

    # set random seeds
    if args.seed is not None:
        logger.info(f'Set random seed to {args.seed}, '
                    f'deterministic: {args.deterministic}')
        set_random_seed(args.seed, deterministic=args.deterministic)
    cfg.seed = args.seed
    meta['seed'] = args.seed
    meta['exp_name'] = osp.basename(args.config)

    model = build_detector(
        cfg.model,
        train_cfg=cfg.get('train_cfg'),
        test_cfg=cfg.get('test_cfg'))

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_dataset))
    if cfg.checkpoint_config is not None:
        # save mmdet version, config file content and class names in
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(
            mmdet_version=__version__ + get_git_hash()[:7],
            CLASSES=datasets[0].CLASSES)
    # add an attribute for visualization convenience
    model.CLASSES = datasets[0].CLASSES
    train_detector(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=(not args.no_validate),
        timestamp=timestamp,
        meta=meta)


if __name__ == '__main__':
    main()

================================================
FILE: downstream_tasks/detection/loader.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import random
import math
import numpy as np

from torchvision.datasets import ImageFolder

class ImageFolderInstance(ImageFolder):
    def __getitem__(self, index):
        img, target = super(ImageFolderInstance, self).__getitem__(index)
        return img, target, index

class ImageFolderMask(ImageFolder):
    def __init__(self, *args, patch_size, pred_ratio, pred_ratio_var, pred_aspect_ratio, 
                 pred_shape='block', pred_start_epoch=0, **kwargs):
        super(ImageFolderMask, self).__init__(*args, **kwargs)
        self.psz = patch_size
        self.pred_ratio = pred_ratio[0] if isinstance(pred_ratio, list) and \
            len(pred_ratio) == 1 else pred_ratio
        self.pred_ratio_var = pred_ratio_var[0] if isinstance(pred_ratio_var, list) and \
            len(pred_ratio_var) == 1 else pred_ratio_var
        if isinstance(self.pred_ratio, list) and not isinstance(self.pred_ratio_var, list):
            self.pred_ratio_var = [self.pred_ratio_var] * len(self.pred_ratio)
        self.log_aspect_ratio = tuple(map(lambda x: math.log(x), pred_aspect_ratio))
        self.pred_shape = pred_shape
        self.pred_start_epoch = pred_start_epoch

    def get_pred_ratio(self):
        if hasattr(self, 'epoch') and self.epoch < self.pred_start_epoch:
            return 0

        if isinstance(self.pred_ratio, list):
            pred_ratio = []
            for prm, prv in zip(self.pred_ratio, self.pred_ratio_var):
                assert prm >= prv
                pr = random.uniform(prm - prv, prm + prv) if prv > 0 else prm
                pred_ratio.append(pr)
            pred_ratio = random.choice(pred_ratio)
        else:
            assert self.pred_ratio >= self.pred_ratio_var
            pred_ratio = random.uniform(self.pred_ratio - self.pred_ratio_var, self.pred_ratio + \
                self.pred_ratio_var) if self.pred_ratio_var > 0 else self.pred_ratio
        
        return pred_ratio

    def set_epoch(self, epoch):
        self.epoch = epoch

    def __getitem__(self, index):
        output = super(ImageFolderMask, self).__getitem__(index)
                
        masks = []
        for img in output[0]:
            try:
                H, W = img.shape[1] // self.psz, img.shape[2] // self.psz
            except:
                # skip non-image
                continue
            
            high = self.get_pred_ratio() * H * W
            
            if self.pred_shape == 'block':
                # following BEiT (https://arxiv.org/abs/2106.08254), see at
                # https://github.com/microsoft/unilm/blob/b94ec76c36f02fb2b0bf0dcb0b8554a2185173cd/beit/masking_generator.py#L55
                mask = np.zeros((H, W), dtype=bool)
                mask_count = 0
                while mask_count < high:
                    max_mask_patches = high - mask_count

                    delta = 0
                    for attempt in range(10):
                        low = (min(H, W) // 3) ** 2 
                        target_area = random.uniform(low, max_mask_patches)
                        aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
                        h = int(round(math.sqrt(target_area * aspect_ratio)))
                        w = int(round(math.sqrt(target_area / aspect_ratio)))
                        if w < W and h < H:
                            top = random.randint(0, H - h)
                            left = random.randint(0, W - w)

                            num_masked = mask[top: top + h, left: left + w].sum()
                            if 0 < h * w - num_masked <= max_mask_patches:
                                for i in range(top, top + h):
                                    for j in range(left, left + w):
                                        if mask[i, j] == 0:
                                            mask[i, j] = 1
                                            delta += 1

                        if delta > 0:
                            break

                    if delta == 0:
                        break
                    else:
                        mask_count += delta
            
            elif self.pred_shape == 'rand':
                mask = np.hstack([
                    np.zeros(H * W - int(high)),
                    np.ones(int(high)),
                ]).astype(bool)
                np.random.shuffle(mask)
                mask = mask.reshape(H, W)

            else:
                # no implementation
                assert False

            masks.append(mask)

        return output + (masks,)

================================================
FILE: downstream_tasks/detection/models/__init__.py
================================================
from .vision_transformer import VisionTransformer, vit_tiny, vit_small, vit_base, vit_large
from .swin_transformer import SwinTransformer, swin_tiny, swin_small, swin_base, swin_large

================================================
FILE: downstream_tasks/detection/models/head.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
import utils

from utils import trunc_normal_

class CSyncBatchNorm(nn.SyncBatchNorm):
    def __init__(self,
                 *args,
                 with_var=False,
                 **kwargs):
        super(CSyncBatchNorm, self).__init__(*args, **kwargs)
        self.with_var = with_var

    def forward(self, x):
        # center norm
        self.training = False
        if not self.with_var:
            self.running_var = torch.ones_like(self.running_var)
        normed_x = super(CSyncBatchNorm, self).forward(x)
        # udpate center
        self.training = True
        _ = super(CSyncBatchNorm, self).forward(x)
        return normed_x

class PSyncBatchNorm(nn.SyncBatchNorm):
    def __init__(self,
                 *args,
                 bunch_size,
                 **kwargs):
        procs_per_bunch = min(bunch_size, utils.get_world_size())
        assert utils.get_world_size() % procs_per_bunch == 0
        n_bunch = utils.get_world_size() // procs_per_bunch
        #
        ranks = list(range(utils.get_world_size()))
        print('---ALL RANKS----\n{}'.format(ranks))
        rank_groups = [ranks[i*procs_per_bunch: (i+1)*procs_per_bunch] for i in range(n_bunch)]
        print('---RANK GROUPS----\n{}'.format(rank_groups))
        process_groups = [torch.distributed.new_group(pids) for pids in rank_groups]
        bunch_id = utils.get_rank() // procs_per_bunch
        process_group = process_groups[bunch_id]
        print('---CURRENT GROUP----\n{}'.format(process_group))
        super(PSyncBatchNorm, self).__init__(*args, process_group=process_group, **kwargs)

class CustomSequential(nn.Sequential):
    bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)

    def forward(self, input):
        for module in self:
            dim = len(input.shape)
            if isinstance(module, self.bn_types) and dim > 2:
                perm = list(range(dim - 1)); perm.insert(1, dim - 1)
                inv_perm = list(range(dim)) + [1]; inv_perm.pop(1)
                input = module(input.permute(*perm)).permute(*inv_perm)
            else:
                input = module(input)
        return input

class DINOHead(nn.Module):
    def __init__(self, in_dim, out_dim, norm=None, act='gelu', last_norm=None, 
                 nlayers=3, hidden_dim=2048, bottleneck_dim=256, norm_last_layer=True, **kwargs):
        super().__init__()
        norm = self._build_norm(norm, hidden_dim)
        last_norm = self._build_norm(last_norm, out_dim, affine=False, **kwargs)
        act = self._build_act(act)

        nlayers = max(nlayers, 1)
        if nlayers == 1:
            if bottleneck_dim > 0:
                self.mlp = nn.Linear(in_dim, bottleneck_dim)
            else:
                self.mlp = nn.Linear(in_dim, out_dim)
        else:
            layers = [nn.Linear(in_dim, hidden_dim)]
            if norm is not None:
                layers.append(norm)
            layers.append(act)
            for _ in range(nlayers - 2):
                layers.append(nn.Linear(hidden_dim, hidden_dim))
                if norm is not None:
                    layers.append(norm)
                layers.append(act)
            if bottleneck_dim > 0:
                layers.append(nn.Linear(hidden_dim, bottleneck_dim))
            else:
                layers.append(nn.Linear(hidden_dim, out_dim))
            self.mlp = CustomSequential(*layers)
        self.apply(self._init_weights)
        
        if bottleneck_dim > 0:
            self.last_layer = nn.utils.weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
            self.last_layer.weight_g.data.fill_(1)
            if norm_last_layer:
                self.last_layer.weight_g.requires_grad = False
        else:
            self.last_layer = None

        self.last_norm = last_norm

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.mlp(x)
        if self.last_layer is not None:
            x = nn.functional.normalize(x, dim=-1, p=2)
            x = self.last_layer(x)
        if self.last_norm is not None:
            x = self.last_norm(x)
        return x

    def _build_norm(self, norm, hidden_dim, **kwargs):
        if norm == 'bn':
            norm = nn.BatchNorm1d(hidden_dim, **kwargs)
        elif norm == 'syncbn':
            norm = nn.SyncBatchNorm(hidden_dim, **kwargs)
        elif norm == 'csyncbn':
            norm = CSyncBatchNorm(hidden_dim, **kwargs)
        elif norm == 'psyncbn':
            norm =  PSyncBatchNorm(hidden_dim, **kwargs)
        elif norm == 'ln':
            norm = nn.LayerNorm(hidden_dim, **kwargs)
        else:
            assert norm is None, "unknown norm type {}".format(norm)
        return norm

    def _build_act(self, act):
        if act == 'relu':
            act = nn.ReLU()
        elif act == 'gelu':
            act = nn.GELU()
        else:
            assert False, "unknown act type {}".format(act)
        return act

class iBOTHead(DINOHead):

    def __init__(self, *args, patch_out_dim=8192, norm=None, act='gelu', last_norm=None, 
                 nlayers=3, hidden_dim=2048, bottleneck_dim=256, norm_last_layer=True, 
                 shared_head=False, **kwargs):
        
        super(iBOTHead, self).__init__(*args,
                                        norm=norm,
                                        act=act,
                                        last_norm=last_norm,
                                        nlayers=nlayers,
                                        hidden_dim=hidden_dim,
                                        bottleneck_dim=bottleneck_dim,
                                        norm_last_layer=norm_last_layer, 
                                        **kwargs)

        if not shared_head:
            if bottleneck_dim > 0:
                self.last_layer2 = nn.utils.weight_norm(nn.Linear(bottleneck_dim, patch_out_dim, bias=False))
                self.last_layer2.weight_g.data.fill_(1)
                if norm_last_layer:
                    self.last_layer2.weight_g.requires_grad = False
            else:
                self.mlp2 = nn.Linear(hidden_dim, patch_out_dim)
                self.last_layer2 = None

            self.last_norm2 = self._build_norm(last_norm, patch_out_dim, affine=False, **kwargs)
        else:
            if bottleneck_dim > 0:
                self.last_layer2 = self.last_layer
            else:
                self.mlp2 = self.mlp[-1]
                self.last_layer2 = None

            self.last_norm2 = self.last_norm

    def forward(self, x):
        if len(x.shape) == 2:
            return super(iBOTHead, self).forward(x)

        if self.last_layer is not None:
            x = self.mlp(x)
            x = nn.functional.normalize(x, dim=-1, p=2)
            x1 = self.last_layer(x[:, 0])
            x2 = self.last_layer2(x[:, 1:])
        else:
            x = self.mlp[:-1](x)
            x1 = self.mlp[-1](x[:, 0])
            x2 = self.mlp2(x[:, 1:])
        
        if self.last_norm is not None:
            x1 = self.last_norm(x1)
            x2 = self.last_norm2(x2)
        
        return x1, x2


================================================
FILE: downstream_tasks/detection/models/swin_transformer.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Mostly copy-paste from Swin-Transformer libarary:
https://github.com/facebookresearch/dino
https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
"""

import os
import logging
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist

from math import sqrt
from functools import partial
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.models.registry import register_model

class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super(Mlp, self).__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class WindowAttention(nn.Module):
    r"""Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super(WindowAttention, self).__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2 Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn_out = attn
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, attn_out

    def extra_repr(self) -> str:
        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'

    def flops(self, N):
        # calculate flops for 1 window with token length of N
        flops = 0
        # qkv = self.qkv(x)
        flops += N * self.dim * 3 * self.dim
        # attn = (q @ k.transpose(-2, -1))
        flops += self.num_heads * N * (self.dim // self.num_heads) * N
        #  x = (attn @ v)
        flops += self.num_heads * N * N * (self.dim // self.num_heads)
        # x = self.proj(x)
        flops += N * self.dim * self.dim
        return flops

    @staticmethod
    def compute_macs(module, input, output):
        B, N, C = input[0].shape

        module.__flops__ += module.flops(N) * B


class SwinTransformerBlock(nn.Module):
    r"""Swin Transformer Block.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resulotion.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        self.H = input_resolution[0]
        self.W = input_resolution[1]

        self.attn_mask_dict = {} # {self.H: self.create_attn_mask(self.H, self.W)}


    def create_attn_mask(self, H, W):
        # calculate attention mask for SW-MSA

        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1))  # 1 Hp Wp 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))

        return attn_mask


    def forward(self, x):

        B, L, C = x.shape
        H = int(sqrt(L))
        W = H

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # pad feature maps to multiples of window size
        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))

            if H is self.attn_mask_dict.keys():
                attn_mask = self.attn_mask_dict[H]
            else:
                self.attn_mask_dict[H] = self.create_attn_mask(H, W).to(x.device)
                attn_mask = self.attn_mask_dict[H]

        else:
            shifted_x = x
            attn_mask = None

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows, attn = self.attn(x_windows, attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x

        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x, attn

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
               f"window_size={self.window_size}, shift_size={self.shift_size} mlp_ratio={self.mlp_ratio}"

    def flops(self):
        flops = 0
        H, W = self.input_resolution
        # norm1
        flops += self.dim * H * W
        # W-MSA/SW-MSA
        nW = H * W / self.window_size / self.window_size
        flops += nW * self.attn.flops(self.window_size * self.window_size)
        # mlp
        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
        # norm2
        flops += self.dim * H * W
        return flops


class PatchMerging(nn.Module):
    r"""Patch Merging Layer.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """
        B, L, C = x.shape
        H = int(sqrt(L))
        W = H

        x = x.view(B, H, W, C)

        # padding
        pad_input = (H % 2 == 1) or (W % 2 == 1)
        if pad_input:
            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))

        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        x = self.norm(x)
        x = self.reduction(x)

        return x


    def extra_repr(self) -> str:
        return f"input_resolution={self.input_resolution}, dim={self.dim}"

    def flops(self):
        H, W = self.input_resolution
        flops = H * W * self.dim
        flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
        return flops


class BasicLayer(nn.Module):
    """A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resulotion.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None):

        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth

        self.blocks = nn.ModuleList([
            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
                                 num_heads=num_heads, window_size=window_size,
                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
                                 drop=drop, attn_drop=attn_drop,
                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                                 norm_layer=norm_layer)
            for i in range(depth)])
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x):
        for blk in self.blocks:
            x, _ = blk(x)
        if self.downsample is not None:
            x = self.downsample(x)
        return x

    def forward_with_features(self, x):
        fea = []
        for blk in self.blocks:
            x, _ = blk(x)
            fea.append(x)
        if self.downsample is not None:
            x = self.downsample(x)
        return x, fea

    def forward_with_attention(self, x):
        attns = []
        for blk in self.blocks:
            x, attn = blk(x)
            attns.append(attn)
        if self.downsample is not None:
            x = self.downsample(x)
        return x, attns


    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

    def flops(self):
        flops = 0
        for blk in self.blocks:
            flops += blk.flops()
        if self.downsample is not None:
            flops += self.downsample.flops()
        return flops


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):

        # # FIXME look at relaxing size constraints
        # assert H == self.img_size[0] and W == self.img_size[1], \
        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."

        x = self.proj(x)
        B, C, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)  # B Ph*Pw C
        if self.norm is not None:
            x = self.norm(x)
        return x.transpose(1, 2).reshape(B, C, H, W)


    def flops(self):
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops


class SwinTransformer(nn.Module):
    r""" Swin Transformer
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030

    Args:
        img_size (int | tuple(int)): Input image size.
        patch_size (int | tuple(int)): Patch size.
        in_chans (int): Number of input channels.
        num_classes (int): Number of classes for classification head.
        embed_dim (int): Embedding dimension.
        depths (tuple(int)): Depth of Swin Transformer layers.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate.
        drop_path_rate (float): Stochastic depth rate.
        norm_layer (nn.Module): normalization layer.
        ape (bool): If True, add absolute position embedding to the patch embedding.
        patch_norm (bool): If True, add normalization after patch embedding.
    """

    def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
                 embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6), ape=False, patch_norm=True, 
                 return_all_tokens=False, use_mean_pooling=True, masked_im_modeling=False):

        super().__init__()

        self.num_classes = num_classes
        self.depths = depths
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
        self.mlp_ratio = mlp_ratio
        self.return_all_tokens = return_all_tokens

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        if self.ape:
            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
                               input_resolution=(patches_resolution[0] // (2 ** i_layer),
                                                 patches_resolution[1] // (2 ** i_layer)),
                               depth=depths[i_layer],
                               num_heads=num_heads[i_layer],
                               window_size=window_size,
                               mlp_ratio=self.mlp_ratio,
                               qkv_bias=qkv_bias, qk_scale=qk_scale,
                               drop=drop_rate, attn_drop=attn_drop_rate,
                               drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                               norm_layer=norm_layer,
                               downsample=PatchMerging if (i_layer < self.num_layers - 1) else None)
            self.layers.append(layer)

        self.norm = norm_layer(self.num_features)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)

        # masked image modeling
        self.masked_im_modeling = masked_im_modeling
        if masked_im_modeling:
            self.masked_embed = nn.Parameter(torch.zeros(1, embed_dim))

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'absolute_pos_embed'}

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        # todo: to be implemented
        return {'relative_position_bias_table'}

    def forward(self, x, return_all_tokens=None, mask=None):
        # patch linear embedding
        x = self.patch_embed(x)
        # mask image modeling
        if mask is not None:
            x = self.mask_model(x, mask)
        x = x.flatten(2).transpose(1, 2)

        if self.ape:
            x = x + self.absolute_pos_embed
        x = self.pos_drop(x)

        for layer in self.layers:
            x = layer(x)

        x_region = self.norm(x)  # B L C
        x = self.avgpool(x_region.transpose(1, 2))  # B C 1
        x = torch.flatten(x, 1)

        return_all_tokens = self.return_all_tokens if \
            return_all_tokens is None else return_all_tokens
        if return_all_tokens:
            return torch.cat([x.unsqueeze(1), x_region], dim=1)
        return x

    def get_selfattention(self, x, n=1):
        # n=1 return the last layer attn map; otherwise return attn maps in all layers
        x = self.patch_embed(x)
        x = x.flatten(2).transpose(1, 2)
        if self.ape:
            x = x + self.absolute_pos_embed
        x = self.pos_drop(x)

        if n==1:
            return self.get_last_selfattention(x)
        else:
            return self.get_all_selfattention(x)

    def get_last_selfattention(self, x):

        for i, layer in enumerate(self.layers):
            if i < len(self.layers) - 1:
                x = layer(x)
            else:
                x, attns = layer.forward_with_attention(x)
                return attns[-1]

    def get_all_selfattention(self, x):
        attn_out = []

        for layer in self.layers:
            x, attns = layer.forward_with_attention(x)
            attn_out += attns

        return attn_out

    def get_intermediate_layers(self, x, n=1, return_patch_avgpool=False):

        num_blks = sum(self.depths)
        start_idx = num_blks - n

        sum_cur = 0
        for i, d in enumerate(self.depths):
            sum_cur_new = sum_cur + d
            if start_idx >= sum_cur and start_idx < sum_cur_new:
                start_stage = i
                start_blk = start_idx - sum_cur
            sum_cur = sum_cur_new


        x = self.patch_embed(x)
        x = x.flatten(2).transpose(1, 2)
        if self.ape:
            x = x + self.absolute_pos_embed
        x = self.pos_drop(x)

        # we will return the averaged token features from the `n` last blocks
        # note: there is no [CLS] token in Swin Transformer
        output = []
        s = 0
        for i, layer in enumerate(self.layers):
            x, fea = layer.forward_with_features(x)

            if i >= start_stage:
                for x_ in fea[start_blk:]:

                    if i == len(self.layers)-1: # use the norm in the last stage
                        x_ = self.norm(x_)

                    x_avg = torch.flatten(self.avgpool(x_.transpose(1, 2)), 1)  # B C 
                    if return_patch_avgpool:
                        x_o = x_avg
                    else:
                        x_o = torch.cat((x_avg.unsqueeze(1), x_), dim=1)
                    # print(f'Stage {i},  x_o {x_o.shape}')          
                    output.append(x_o)

                start_blk = 0

        #return torch.cat(output, dim=-1)
        return output

    def flops(self):
        flops = 0
        flops += self.patch_embed.flops()
        for i, layer in enumerate(self.layers):
            flops += layer.flops()
            if dist.get_rank() == 0:
                print(f"GFLOPs layer_{i}: {layer.flops() / 1e9}")
        flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
        flops += self.num_features * self.num_classes
        return flops

    def init_weights(self, pretrained='', pretrained_layers=[], verbose=True):
        if os.path.isfile(pretrained):
            pretrained_dict = torch.load(pretrained, map_location='cpu')
            logging.info(f'=> loading pretrained model {pretrained}')
            model_dict = self.state_dict()
            pretrained_dict = {
                k: v for k, v in pretrained_dict.items()
                if k in model_dict.keys()
            }
            need_init_state_dict = {}
            for k, v in pretrained_dict.items():
                need_init = (
                        k.split('.')[0] in pretrained_layers
                        or pretrained_layers[0] is '*'
                        or 'relative_position_index' not in k
                        or 'attn_mask' not in k
                )

                if need_init:
                    if verbose:
                        logging.info(f'=> init {k} from {pretrained}')

                    if 'relative_position_bias_table' in k and v.size() != model_dict[k].size():
                        relative_position_bias_table_pretrained = v
                        relative_position_bias_table_current = model_dict[k]
                        L1, nH1 = relative_position_bias_table_pretrained.size()
                        L2, nH2 = relative_position_bias_table_current.size()
                        if nH1 != nH2:
                            logging.info(f"Error in loading {k}, passing")
                        else:
                            if L1 != L2:
                                logging.info(
                                    '=> load_pretrained: resized variant: {} to {}'
                                        .format((L1, nH1), (L2, nH2))
                                )
                                S1 = int(L1 ** 0.5)
                                S2 = int(L2 ** 0.5)
                                relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate(
                                    relative_position_bias_table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
                                    size=(S2, S2),
                                    mode='bicubic')
                                v = relative_position_bias_table_pretrained_resized.view(nH2, L2).permute(1, 0)

                    if 'absolute_pos_embed' in k and v.size() != model_dict[k].size():
                        absolute_pos_embed_pretrained = v
                        absolute_pos_embed_current = model_dict[k]
                        _, L1, C1 = absolute_pos_embed_pretrained.size()
                        _, L2, C2 = absolute_pos_embed_current.size()
                        if C1 != C1:
                            logging.info(f"Error in loading {k}, passing")
                        else:
                            if L1 != L2:
                                logging.info(
                                    '=> load_pretrained: resized variant: {} to {}'
                                        .format((1, L1, C1), (1, L2, C2))
                                )
                                S1 = int(L1 ** 0.5)
                                S2 = int(L2 ** 0.5)
                                absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.reshape(-1, S1, S1, C1)
                                absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.permute(0, 3, 1, 2)
                                absolute_pos_embed_pretrained_resized = torch.nn.functional.interpolate(
                                    absolute_pos_embed_pretrained, size=(S2, S2), mode='bicubic')
                                v = absolute_pos_embed_pretrained_resized.permute(0, 2, 3, 1).flatten(1, 2)

                    need_init_state_dict[k] = v
            self.load_state_dict(need_init_state_dict, strict=False)

    def freeze_pretrained_layers(self, frozen_layers=[]):
        for name, module in self.named_modules():
            if (
                    name.split('.')[0] in frozen_layers
                    or '.'.join(name.split('.')[0:2]) in frozen_layers
                    or (len(frozen_layers) > 0 and frozen_layers[0] is '*')
            ):
                for _name, param in module.named_parameters():
                    param.requires_grad = False
                logging.info(
                    '=> set param {} requires grad to False'
                        .format(name)
                )
        for name, param in self.named_parameters():
            if (
                    name.split('.')[0] in frozen_layers
                    or (len(frozen_layers) > 0 and frozen_layers[0] is '*')
                    and param.requires_grad is True
            ):
                param.requires_grad = False
                logging.info(
                    '=> set param {} requires grad to False'
                        .format(name)
                )
        return self

    def get_num_layers(self):
        #return len(self.layers)
        return sum(self.depths)

    def mask_model(self, x, mask):
        # extend mask for hierarchical features
        if x.shape[-2:] != mask.shape[-2:]:
            htimes, wtimes = np.array(x.shape[-2:]) // np.array(mask.shape[-2:])
            mask = mask.repeat_interleave(htimes, -2).repeat_interleave(wtimes, -1)
        
        # mask embed
        x.permute(0, 2, 3, 1)[mask, :] = self.masked_embed.to(x.dtype)

        return x

@register_model
def swin_tiny(window_size=7, **kwargs):
    model = SwinTransformer(
        window_size=window_size, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
        mlp_ratio=4, qkv_bias=True, drop_path_rate=kwargs.pop('drop_path_rate', 0.1), **kwargs)
    return model

@register_model
def swin_small(window_size=7, **kwargs):
    model = SwinTransformer(
        window_size=window_size, embed_dim=96, depths=[2, 2, 18, 2], num_heads=[3, 6, 12, 24],
        mlp_ratio=4, qkv_bias=True, drop_path_rate=kwargs.pop('drop_path_rate', 0.2), **kwargs)
    return model

@register_model
def swin_base(window_size=7, **kwargs):
    model = SwinTransformer(
        window_size=window_size, embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32],
        mlp_ratio=4, qkv_bias=True, drop_path_rate=kwargs.pop('drop_path_rate', 0.2), **kwargs)
    return model

@register_model
def swin_large(window_size=7, **kwargs):
    model = SwinTransformer(
        window_size=window_size, embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48],
        mlp_ratio=4, qkv_bias=True, drop_path_rate=kwargs.pop('drop_path_rate', 0.2), **kwargs)
    return model

================================================
FILE: downstream_tasks/detection/models/vision_transformer.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Mostly copy-paste from DINO and timm library:
https://github.com/facebookresearch/dino
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from functools import partial
from utils import trunc_normal_
from timm.models.registry import register_model

def drop_path(x, drop_prob: float = 0., training: bool = False):
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    output = x.div(keep_prob) * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
    """
    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., window_size=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        #self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        ####add by wxd
        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        all_head_dim = head_dim * self.num_heads
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None
        


        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, x_rel_pos_bias = None):
        B, N, C = x.shape
        #qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        ########add by wxd
        qkv_bias = None
        if self.q_bias is not None:
            qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale

#        if self.relative_position_bias_table is not None:
#            relative_position_bias = \
#                self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
#                    self.window_size[0] * self.window_size[1] + 1,
#                    self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
#            print("################before relative:", relative_position_bias.shape)
#            relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
#            print("################after relative:", relative_position_bias.shape)
#            relative_position_bias = intepolate_rpe(relative_position_bias)
#            print("################ater inter relative:", relative_position_bias.shape)
        if x_rel_pos_bias is not None:
            attn = attn + x_rel_pos_bias.unsqueeze(0)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, attn

class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., 
                 attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, window_size=None, init_values=0):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, window_size=window_size)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        if init_values > 0:
            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1, self.gamma_2 = None, None



    def forward(self, x, x_rel_pos_bias=None, return_attention=False):
        y, attn = self.attn(self.norm1(x), x_rel_pos_bias)
        if return_attention:
            return attn
        if self.gamma_1 is None:
            x = x + self.drop_path(y)
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            x = x + self.drop_path(self.gamma_1 * y)
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x

#class PatchEmbed(nn.Module):
#    """ Image to Patch Embedding
#    """
#    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
#        super().__init__()
#        num_patches = (img_size // patch_size) * (img_size // patch_size)
#        self.img_size = img_size
#        self.patch_size = patch_size
#        self.num_patches = num_patches
#        print("#################patch in!!!")
#        self.patch_shape = (img_size // patch_size, img_size // patch_size)
#
#        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
#            
#    def forward(self, x):
#        B, C, H, W = x.shape
#        return self.proj(x)


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """
    def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        self.num_patches_w = img_size[0] // patch_size
        self.num_patches_h = img_size[1] // patch_size

        num_patches = self.num_patches_w * self.num_patches_h
        self.patch_shape = (img_size[0] // patch_size, img_size[1] // patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        print("##############patch here!!!")
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
            
    def forward(self, x, mask=None):
        B, C, H, W = x.shape
        return self.proj(x)

class VisionTransformer(nn.Module):
    """ Vision Transformer """
    def __init__(self, img_size=[224], patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6), return_all_tokens=False, 
                 init_values=0, use_sincos_pos_emb=False, use_abs_pos_emb=False, use_rel_pos_bias=False, use_mean_pooling=False, masked_im_modeling=False):
        super().__init__()
        self.num_features = self.embed_dim = embed_dim
        self.return_all_tokens = return_all_tokens
        print("############use_abs_pos:", use_abs_pos_emb)
        print("############use_sincos_pos:", use_sincos_pos_emb)
        print("############use_rel_pos_bias:", use_rel_pos_bias)
        self.use_abs_pos_emb = use_abs_pos_emb
        self.use_sincos_pos_emb = use_sincos_pos_emb

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        if use_abs_pos_emb:
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        else:
            self.pos_embed = None

        self.pos_drop = nn.Dropout(p=drop_rate)

        self.use_rel_pos_bias = use_rel_pos_bias

        if self.use_rel_pos_bias:
            print("=================use RelativePositionBias===================")
            window_size=self.patch_embed.patch_shape
            self.window_size = window_size
            self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = nn.Parameter(
                torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
            # cls to token & token 2 cls & cls to cls

        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, 
                init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
            for i in range(depth)])

        self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
        self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
        # Classifier head
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
        
        if use_abs_pos_emb:
            trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

        # masked image modeling
        self.masked_im_modeling = masked_im_modeling
        if masked_im_modeling:
            self.masked_embed = nn.Parameter(torch.zeros(1, embed_dim))

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def interpolate_pos_encoding(self, x, w, h):
        npatch = x.shape[1] - 1
        print("############self.pos_embed:", self.pos_embed)
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        class_pos_embed = self.pos_embed[:, 0]
        patch_pos_embed = self.pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_embed.patch_size
        h0 = h // self.patch_embed.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
            scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
            mode='bicubic',
        )
        assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def prepare_tokens(self, x, mask=None):
        B, nc, w, h = x.shape
        # patch linear embedding
        x = self.patch_embed(x)

        # mask image modeling
        if mask is not None:
            x = self.mask_model(x, mask)
        x = x.flatten(2).transpose(1, 2)

        # add the [CLS] token to the embed patch tokens
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        
        # add positional encoding to each token
        if self.pos_embed is not None:
            x = x + self.interpolate_pos_encoding(x, w, h)

        return self.pos_drop(x)

    def forward(self, x, return_all_tokens=None, mask=None):
        # mim
        if self.masked_im_modeling:
            assert mask is not None
            x = self.prepare_tokens(x, mask=mask)
        else:
            x = self.prepare_tokens(x)

        for blk in self.blocks:
            x = blk(x)

        x = self.norm(x)
        if self.fc_norm is not None:
            x[:, 0] = self.fc_norm(x[:, 1:, :].mean(1))
        
        return_all_tokens = self.return_all_tokens if \
            return_all_tokens is None else return_all_tokens
        if return_all_tokens:
            return x
        return x[:, 0]

    def get_last_selfattention(self, x):
        x = self.prepare_tokens(x)
        for i, blk in enumerate(self.blocks):
            if i < len(self.blocks) - 1:
                x = blk(x)
            else:
                # return attention of the last block
                return blk(x, return_attention=True)

    def get_intermediate_layers(self, x, n=1):
        x = self.prepare_tokens(x)
        # we return the output tokens from the `n` last blocks
        output = []
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if len(self.blocks) - i <= n:
                output.append(self.norm(x))
        return output
        
    def get_num_layers(self):
        return len(self.blocks)

    def mask_model(self, x, mask):
        x.permute(0, 2, 3, 1)[mask, :] = self.masked_embed.to(x.dtype)
        return x

def vit_tiny(patch_size=16, **kwargs):
    model = VisionTransformer(
        patch_size=patch_size, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4,
        qkv_bias=True, **kwargs)
    return model

def vit_small(patch_size=16, **kwargs):
    model = VisionTransformer(
        patch_size=patch_size, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4,
        qkv_bias=True, **kwargs)
    return model

def vit_base(patch_size=16, **kwargs):
    model = VisionTransformer(
        patch_size=patch_size, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
        qkv_bias=True, **kwargs)
    return model

def vit_large(patch_size=16, **kwargs):
    model = VisionTransformer(
        patch_size=patch_size, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4,
        qkv_bias=True, **kwargs)
    return model


================================================
FILE: downstream_tasks/detection/scripts/run_eval.sh
================================================
#!/usr/bin/env bash

echo "EVAL MODEL:"$MODEL
python -m torch.distributed.launch --nproc_per_node=8 \
    evaluation/object_detection/test.py \
    $CONFIG \
    $MODEL \
    --launcher pytorch \
    --eval bbox segm \
    --cfg-options model.backbone.use_checkpoint=True \
    ${@:6}



================================================
FILE: downstream_tasks/detection/scripts/run_train_maskrcnn_vit_base.sh
================================================
#!/usr/bin/env bash

python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=$NNODES \
    --node_rank=$RANK \
    --master_addr=$ADDRESS \
    --master_port=$PORT \
    evaluation/object_detection/train.py \
    evaluation/object_detection/configs/mask_rcnn/vit_base_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00003.py \
    --launcher pytorch \
    --work-dir $OUTPUT_DIR \
    --no-validate \
    --deterministic \
    --cfg-options model.backbone.use_checkpoint=True \
    model.pretrained=$PRETRAINED \
    ${@:6}



================================================
FILE: downstream_tasks/detection/scripts/run_train_maskrcnn_vit_large.sh
================================================
#!/usr/bin/env bash

python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=$NNODES \
    --node_rank=$RANK \
    --master_addr=$ADDRESS \
    --master_port=$PORT \
    evaluation/object_detection/train.py \
    evaluation/object_detection/configs/mask_rcnn/vit_large_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00002_lrdr0.85_dp0.2.py \
    --launcher pytorch \
    --work-dir $OUTPUT_DIR \
    --no-validate \
    --deterministic \
    --cfg-options model.backbone.use_checkpoint=True \
	model.pretrained=$PRETRAINED \
    ${@:6}



================================================
FILE: downstream_tasks/detection/utils.py
================================================
# Copyright (c) ByteDance, Inc. and its affiliates.
#

Download .txt

gitextract_wj4p2u9n/

├── .gitignore
├── .gitignore.swp
├── README.md
├── dall_e/
│   ├── __init__.py
│   ├── decoder.py
│   ├── encoder.py
│   └── utils.py
├── downstream_tasks/
│   ├── detection/
│   │   ├── README.md
│   │   ├── evaluation/
│   │   │   └── object_detection/
│   │   │       ├── configs/
│   │   │       │   ├── _base_/
│   │   │       │   │   ├── datasets/
│   │   │       │   │   │   └── coco_instance.py
│   │   │       │   │   ├── default_runtime.py
│   │   │       │   │   ├── models/
│   │   │       │   │   │   ├── cascade_mask_rcnn_r50_fpn.py
│   │   │       │   │   │   ├── cascade_mask_rcnn_swin_fpn.py
│   │   │       │   │   │   ├── cascade_mask_rcnn_vit_fpn.py
│   │   │       │   │   │   ├── mask_rcnn_r50_fpn.py
│   │   │       │   │   │   └── mask_rcnn_vit_fpn.py
│   │   │       │   │   └── schedules/
│   │   │       │   │       └── schedule_1x.py
│   │   │       │   └── mask_rcnn/
│   │   │       │       ├── vit_base_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00003.py
│   │   │       │       └── vit_large_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00002_lrdr0.85_dp0.2.py
│   │   │       ├── mmcv_custom/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── checkpoint.py
│   │   │       │   ├── layer_decay_optimizer_constructor.py
│   │   │       │   ├── prepare_rpe.py
│   │   │       │   ├── register_backbone.py
│   │   │       │   └── runner/
│   │   │       │       ├── __init__.py
│   │   │       │       ├── checkpoint.py
│   │   │       │       └── epoch_based_runner.py
│   │   │       ├── test.py
│   │   │       └── train.py
│   │   ├── loader.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── head.py
│   │   │   ├── swin_transformer.py
│   │   │   └── vision_transformer.py
│   │   ├── scripts/
│   │   │   ├── run_eval.sh
│   │   │   ├── run_train_maskrcnn_vit_base.sh
│   │   │   └── run_train_maskrcnn_vit_large.sh
│   │   └── utils.py
│   └── semantic_segmentation/
│       ├── README.md
│       ├── backbone/
│       │   ├── beit.py
│       │   ├── beit_fapn.py
│       │   ├── cae.py
│       │   ├── fapn.py
│       │   └── mae.py
│       ├── configs_local/
│       │   ├── _base_/
│       │   │   ├── datasets/
│       │   │   │   ├── ade20k.py
│       │   │   │   ├── ade20k_640x640.py
│       │   │   │   ├── chase_db1.py
│       │   │   │   ├── cityscapes.py
│       │   │   │   ├── cityscapes_769x769.py
│       │   │   │   ├── coco-stuff10k.py
│       │   │   │   ├── drive.py
│       │   │   │   ├── hrf.py
│       │   │   │   ├── pascal_context.py
│       │   │   │   ├── pascal_voc12.py
│       │   │   │   ├── pascal_voc12_aug.py
│       │   │   │   └── stare.py
│       │   │   ├── default_runtime.py
│       │   │   ├── models/
│       │   │   │   ├── ann_r50-d8.py
│       │   │   │   ├── apcnet_r50-d8.py
│       │   │   │   ├── ccnet_r50-d8.py
│       │   │   │   ├── cgnet.py
│       │   │   │   ├── danet_r50-d8.py
│       │   │   │   ├── deeplabv3_r50-d8.py
│       │   │   │   ├── deeplabv3_unet_s5-d16.py
│       │   │   │   ├── deeplabv3plus_r50-d8.py
│       │   │   │   ├── dmnet_r50-d8.py
│       │   │   │   ├── dnl_r50-d8.py
│       │   │   │   ├── emanet_r50-d8.py
│       │   │   │   ├── encnet_r50-d8.py
│       │   │   │   ├── fast_scnn.py
│       │   │   │   ├── fcn_hr18.py
│       │   │   │   ├── fcn_r50-d8.py
│       │   │   │   ├── fcn_unet_s5-d16.py
│       │   │   │   ├── fpn_r50.py
│       │   │   │   ├── gcnet_r50-d8.py
│       │   │   │   ├── lraspp_m-v3-d8.py
│       │   │   │   ├── nonlocal_r50-d8.py
│       │   │   │   ├── ocrnet_hr18.py
│       │   │   │   ├── ocrnet_r50-d8.py
│       │   │   │   ├── pointrend_r50.py
│       │   │   │   ├── psanet_r50-d8.py
│       │   │   │   ├── pspnet_r50-d8.py
│       │   │   │   ├── pspnet_unet_s5-d16.py
│       │   │   │   ├── upernet_cae.py
│       │   │   │   └── upernet_r50.py
│       │   │   └── schedules/
│       │   │       ├── schedule_160k.py
│       │   │       ├── schedule_20k.py
│       │   │       ├── schedule_320k.py
│       │   │       ├── schedule_40k.py
│       │   │       └── schedule_80k.py
│       │   ├── beit/
│       │   │   └── upernet_beit_base_12_512_slide_160k_ade20k_pt_4e-4.py
│       │   ├── cae/
│       │   │   └── upernet/
│       │   │       ├── upernet_cae_base_12_512_slide_160k_ade20k_pt_1e-4.py
│       │   │       ├── upernet_cae_base_12_512_slide_160k_ade20k_pt_2e-4.py
│       │   │       ├── upernet_cae_base_12_512_slide_160k_ade20k_pt_3e-4.py
│       │   │       └── upernet_cae_large_24_512_slide_160k_ade20k_pt_decay095_4e-5_dp015.py
│       │   └── mae/
│       │       └── upernet_mae_large_12_512_slide_160k_ade20k_pt_4e-4.py
│       ├── mmcv_custom/
│       │   ├── __init__.py
│       │   ├── apex_runner/
│       │   │   ├── __init__.py
│       │   │   ├── apex_iter_based_runner.py
│       │   │   ├── checkpoint.py
│       │   │   └── optimizer.py
│       │   ├── checkpoint.py
│       │   ├── checkpoint_beit.py
│       │   ├── layer_decay_optimizer_constructor.py
│       │   ├── resize_transform.py
│       │   └── train_api.py
│       ├── mmseg/
│       │   ├── __init__.py
│       │   ├── apis/
│       │   │   ├── __init__.py
│       │   │   ├── inference.py
│       │   │   ├── test.py
│       │   │   └── train.py
│       │   ├── core/
│       │   │   ├── __init__.py
│       │   │   ├── evaluation/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── class_names.py
│       │   │   │   ├── eval_hooks.py
│       │   │   │   └── metrics.py
│       │   │   ├── seg/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── builder.py
│       │   │   │   └── sampler/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── base_pixel_sampler.py
│       │   │   │       └── ohem_pixel_sampler.py
│       │   │   └── utils/
│       │   │       ├── __init__.py
│       │   │       └── misc.py
│       │   ├── datasets/
│       │   │   ├── __init__.py
│       │   │   ├── ade.py
│       │   │   ├── builder.py
│       │   │   ├── chase_db1.py
│       │   │   ├── cityscapes.py
│       │   │   ├── coco_stuff.py
│       │   │   ├── custom.py
│       │   │   ├── dataset_wrappers.py
│       │   │   ├── drive.py
│       │   │   ├── hrf.py
│       │   │   ├── pascal_context.py
│       │   │   ├── pipelines/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── compose.py
│       │   │   │   ├── formating.py
│       │   │   │   ├── loading.py
│       │   │   │   ├── test_time_aug.py
│       │   │   │   └── transforms.py
│       │   │   ├── stare.py
│       │   │   └── voc.py
│       │   ├── models/
│       │   │   ├── __init__.py
│       │   │   ├── backbones/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── cgnet.py
│       │   │   │   ├── fast_scnn.py
│       │   │   │   ├── hrnet.py
│       │   │   │   ├── mobilenet_v2.py
│       │   │   │   ├── mobilenet_v3.py
│       │   │   │   ├── resnest.py
│       │   │   │   ├── resnet.py
│       │   │   │   ├── resnext.py
│       │   │   │   └── unet.py
│       │   │   ├── builder.py
│       │   │   ├── decode_heads/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── ann_head.py
│       │   │   │   ├── apc_head.py
│       │   │   │   ├── aspp_head.py
│       │   │   │   ├── cascade_decode_head.py
│       │   │   │   ├── cc_head.py
│       │   │   │   ├── da_head.py
│       │   │   │   ├── decode_head.py
│       │   │   │   ├── dm_head.py
│       │   │   │   ├── dnl_head.py
│       │   │   │   ├── ema_head.py
│       │   │   │   ├── enc_head.py
│       │   │   │   ├── fcn_head.py
│       │   │   │   ├── fpn_head.py
│       │   │   │   ├── gc_head.py
│       │   │   │   ├── lraspp_head.py
│       │   │   │   ├── nl_head.py
│       │   │   │   ├── ocr_head.py
│       │   │   │   ├── point_head.py
│       │   │   │   ├── psa_head.py
│       │   │   │   ├── psp_head.py
│       │   │   │   ├── sep_aspp_head.py
│       │   │   │   ├── sep_fcn_head.py
│       │   │   │   └── uper_head.py
│       │   │   ├── losses/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── accuracy.py
│       │   │   │   ├── cross_entropy_loss.py
│       │   │   │   ├── lovasz_loss.py
│       │   │   │   └── utils.py
│       │   │   ├── necks/
│       │   │   │   ├── __init__.py
│       │   │   │   └── fpn.py
│       │   │   ├── segmentors/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base.py
│       │   │   │   ├── cascade_encoder_decoder.py
│       │   │   │   └── encoder_decoder.py
│       │   │   └── utils/
│       │   │       ├── __init__.py
│       │   │       ├── inverted_residual.py
│       │   │       ├── make_divisible.py
│       │   │       ├── res_layer.py
│       │   │       ├── se_layer.py
│       │   │       ├── self_attention_block.py
│       │   │       └── up_conv_block.py
│       │   ├── ops/
│       │   │   ├── __init__.py
│       │   │   ├── encoding.py
│       │   │   └── wrappers.py
│       │   ├── utils/
│       │   │   ├── __init__.py
│       │   │   ├── collect_env.py
│       │   │   └── logger.py
│       │   └── version.py
│       └── tools/
│           ├── dist_test.sh
│           ├── dist_train.sh
│           ├── test.py
│           └── train.py
├── furnace/
│   ├── dataset_folder.py
│   ├── datasets.py
│   ├── engine_for_finetuning.py
│   ├── engine_for_pretraining.py
│   ├── masking_generator.py
│   ├── optim_factory.py
│   ├── transforms.py
│   └── utils.py
├── linear_util/
│   ├── crop.py
│   ├── datasets.py
│   ├── engine_finetune.py
│   ├── lars.py
│   ├── lr_decay.py
│   ├── lr_sched.py
│   ├── misc.py
│   └── pos_embed.py
├── models/
│   ├── modeling_cae.py
│   ├── modeling_cae_helper.py
│   ├── modeling_discrete_vae.py
│   └── modeling_finetune.py
├── requirements.txt
├── scripts/
│   ├── cae_base_800e.sh
│   ├── cae_base_finetune.sh
│   ├── cae_large_1600e.sh
│   └── cae_large_finetune.sh
├── tokenizer-weights/
│   └── README
└── tools/
    ├── run_attentive.py
    ├── run_class_finetuning.py
    ├── run_linear.py
    └── run_pretraining.py

Download .txt

SYMBOL INDEX (1281 symbols across 136 files)

FILE: dall_e/__init__.py
  function load_model (line 9) | def load_model(path: str, device: torch.device = None) -> nn.Module:

FILE: dall_e/decoder.py
  class DecoderBlock (line 13) | class DecoderBlock(nn.Module):
    method __attrs_post_init__ (line 21) | def __attrs_post_init__(self) -> None:
    method forward (line 38) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class Decoder (line 42) | class Decoder(nn.Module):
    method __attrs_post_init__ (line 54) | def __attrs_post_init__(self) -> None:
    method forward (line 86) | def forward(self, x: torch.Tensor) -> torch.Tensor:

FILE: dall_e/encoder.py
  class EncoderBlock (line 13) | class EncoderBlock(nn.Module):
    method __attrs_post_init__ (line 21) | def __attrs_post_init__(self) -> None:
    method forward (line 38) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class Encoder (line 42) | class Encoder(nn.Module):
    method __attrs_post_init__ (line 53) | def __attrs_post_init__(self) -> None:
    method forward (line 85) | def forward(self, x: torch.Tensor) -> torch.Tensor:

FILE: dall_e/utils.py
  class Conv2d (line 11) | class Conv2d(nn.Module):
    method __attrs_post_init__ (line 20) | def __attrs_post_init__(self) -> None:
    method forward (line 31) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  function map_pixels (line 45) | def map_pixels(x: torch.Tensor) -> torch.Tensor:
  function unmap_pixels (line 51) | def unmap_pixels(x: torch.Tensor) -> torch.Tensor:

FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/checkpoint.py
  function _get_mmcv_home (line 43) | def _get_mmcv_home():
  function load_state_dict (line 54) | def load_state_dict(module, state_dict, strict=False, logger=None):
  function load_url_dist (line 122) | def load_url_dist(url, model_dir=None, map_location="cpu"):
  function load_pavimodel_dist (line 136) | def load_pavimodel_dist(model_path, map_location=None):
  function load_fileclient_dist (line 164) | def load_fileclient_dist(filename, backend, map_location):
  function get_torchvision_models (line 185) | def get_torchvision_models():
  function get_external_models (line 197) | def get_external_models():
  function get_mmcls_models (line 211) | def get_mmcls_models():
  function get_deprecated_model_names (line 218) | def get_deprecated_model_names():
  function _process_mmcls_checkpoint (line 227) | def _process_mmcls_checkpoint(checkpoint):
  function _load_checkpoint (line 238) | def _load_checkpoint(filename, map_location=None):
  function cosine_scheduler (line 299) | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warm...
  function load_checkpoint (line 319) | def load_checkpoint(model,
  function weights_to_cpu (line 597) | def weights_to_cpu(state_dict):
  function _save_to_state_dict (line 612) | def _save_to_state_dict(module, destination, prefix, keep_vars):
  function get_state_dict (line 632) | def get_state_dict(module, destination=None, prefix='', keep_vars=False):
  function save_checkpoint (line 676) | def save_checkpoint(model, filename, optimizer=None, meta=None):

FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/layer_decay_optimizer_constructor.py
  function get_num_layer_for_vit (line 17) | def get_num_layer_for_vit(var_name, num_max_layer):
  class LayerDecayOptimizerConstructor (line 30) | class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
    method add_params (line 31) | def add_params(self, params, module, prefix='', is_dcn_module=None):

FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/prepare_rpe.py
  function rpe_index (line 9) | def rpe_index(window_size):
  function prepare_rpe (line 31) | def prepare_rpe(rel_pos_bias, src_patch_shape, dst_patch_shape):

FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/register_backbone.py
  class PatchEmbed (line 19) | class PatchEmbed(nn.Module):
    method __init__ (line 22) | def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, emb...
    method forward (line 34) | def forward(self, x, mask=None):
  class VisionTransformer (line 39) | class VisionTransformer(VisionTransformer):
    method __init__ (line 40) | def __init__(self,
    method build_2d_sincos_position_embedding (line 115) | def build_2d_sincos_position_embedding(self, embed_dim=768, temperatur...
    method train (line 133) | def train(self, mode=True):
    method _freeze_stages (line 144) | def _freeze_stages(self):
    method init_weights (line 167) | def init_weights(self, pretrained=None):
    method interpolate_pos_encoding (line 186) | def interpolate_pos_encoding(self, x, w, h):
    method forward (line 210) | def forward(self, x):

FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/runner/checkpoint.py
  function save_checkpoint (line 26) | def save_checkpoint(model, filename, optimizer=None, meta=None):

FILE: downstream_tasks/detection/evaluation/object_detection/mmcv_custom/runner/epoch_based_runner.py
  class EpochBasedRunnerAmp (line 27) | class EpochBasedRunnerAmp(EpochBasedRunner):
    method save_checkpoint (line 32) | def save_checkpoint(self,
    method resume (line 75) | def resume(self,

FILE: downstream_tasks/detection/evaluation/object_detection/test.py
  function parse_args (line 28) | def parse_args():
  function main (line 110) | def main():

FILE: downstream_tasks/detection/evaluation/object_detection/train.py
  function parse_args (line 30) | def parse_args():
  function main (line 95) | def main():

FILE: downstream_tasks/detection/loader.py
  class ImageFolderInstance (line 13) | class ImageFolderInstance(ImageFolder):
    method __getitem__ (line 14) | def __getitem__(self, index):
  class ImageFolderMask (line 18) | class ImageFolderMask(ImageFolder):
    method __init__ (line 19) | def __init__(self, *args, patch_size, pred_ratio, pred_ratio_var, pred...
    method get_pred_ratio (line 33) | def get_pred_ratio(self):
    method set_epoch (line 51) | def set_epoch(self, epoch):
    method __getitem__ (line 54) | def __getitem__(self, index):

FILE: downstream_tasks/detection/models/head.py
  class CSyncBatchNorm (line 13) | class CSyncBatchNorm(nn.SyncBatchNorm):
    method __init__ (line 14) | def __init__(self,
    method forward (line 21) | def forward(self, x):
  class PSyncBatchNorm (line 32) | class PSyncBatchNorm(nn.SyncBatchNorm):
    method __init__ (line 33) | def __init__(self,
  class CustomSequential (line 51) | class CustomSequential(nn.Sequential):
    method forward (line 54) | def forward(self, input):
  class DINOHead (line 65) | class DINOHead(nn.Module):
    method __init__ (line 66) | def __init__(self, in_dim, out_dim, norm=None, act='gelu', last_norm=N...
    method _init_weights (line 106) | def _init_weights(self, m):
    method forward (line 112) | def forward(self, x):
    method _build_norm (line 121) | def _build_norm(self, norm, hidden_dim, **kwargs):
    method _build_act (line 136) | def _build_act(self, act):
  class iBOTHead (line 145) | class iBOTHead(DINOHead):
    method __init__ (line 147) | def __init__(self, *args, patch_out_dim=8192, norm=None, act='gelu', l...
    method forward (line 181) | def forward(self, x):

FILE: downstream_tasks/detection/models/swin_transformer.py
  class Mlp (line 26) | class Mlp(nn.Module):
    method __init__ (line 27) | def __init__(self, in_features, hidden_features=None, out_features=Non...
    method forward (line 36) | def forward(self, x):
  function window_partition (line 45) | def window_partition(x, window_size):
  function window_reverse (line 60) | def window_reverse(windows, window_size, H, W):
  class WindowAttention (line 77) | class WindowAttention(nn.Module):
    method __init__ (line 91) | def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scal...
    method forward (line 125) | def forward(self, x, mask=None):
    method extra_repr (line 159) | def extra_repr(self) -> str:
    method flops (line 162) | def flops(self, N):
    method compute_macs (line 176) | def compute_macs(module, input, output):
  class SwinTransformerBlock (line 182) | class SwinTransformerBlock(nn.Module):
    method __init__ (line 201) | def __init__(self, dim, input_resolution, num_heads, window_size=7, sh...
    method create_attn_mask (line 233) | def create_attn_mask(self, H, W):
    method forward (line 259) | def forward(self, x):
    method extra_repr (line 318) | def extra_repr(self) -> str:
    method flops (line 322) | def flops(self):
  class PatchMerging (line 337) | class PatchMerging(nn.Module):
    method __init__ (line 346) | def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
    method forward (line 353) | def forward(self, x):
    method extra_repr (line 383) | def extra_repr(self) -> str:
    method flops (line 386) | def flops(self):
  class BasicLayer (line 393) | class BasicLayer(nn.Module):
    method __init__ (line 412) | def __init__(self, dim, input_resolution, depth, num_heads, window_size,
    method forward (line 436) | def forward(self, x):
    method forward_with_features (line 443) | def forward_with_features(self, x):
    method forward_with_attention (line 452) | def forward_with_attention(self, x):
    method extra_repr (line 462) | def extra_repr(self) -> str:
    method flops (line 465) | def flops(self):
  class PatchEmbed (line 474) | class PatchEmbed(nn.Module):
    method __init__ (line 478) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
    method forward (line 497) | def forward(self, x):
    method flops (line 511) | def flops(self):
  class SwinTransformer (line 519) | class SwinTransformer(nn.Module):
    method __init__ (line 544) | def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes...
    method _init_weights (line 604) | def _init_weights(self, m):
    method no_weight_decay (line 614) | def no_weight_decay(self):
    method no_weight_decay_keywords (line 618) | def no_weight_decay_keywords(self):
    method forward (line 622) | def forward(self, x, return_all_tokens=None, mask=None):
    method get_selfattention (line 647) | def get_selfattention(self, x, n=1):
    method get_last_selfattention (line 660) | def get_last_selfattention(self, x):
    method get_all_selfattention (line 669) | def get_all_selfattention(self, x):
    method get_intermediate_layers (line 678) | def get_intermediate_layers(self, x, n=1, return_patch_avgpool=False):
    method flops (line 724) | def flops(self):
    method init_weights (line 735) | def init_weights(self, pretrained='', pretrained_layers=[], verbose=Tr...
    method freeze_pretrained_layers (line 802) | def freeze_pretrained_layers(self, frozen_layers=[]):
    method get_num_layers (line 828) | def get_num_layers(self):
    method mask_model (line 832) | def mask_model(self, x, mask):
  function swin_tiny (line 844) | def swin_tiny(window_size=7, **kwargs):
  function swin_small (line 851) | def swin_small(window_size=7, **kwargs):
  function swin_base (line 858) | def swin_base(window_size=7, **kwargs):
  function swin_large (line 865) | def swin_large(window_size=7, **kwargs):

FILE: downstream_tasks/detection/models/vision_transformer.py
  function drop_path (line 22) | def drop_path(x, drop_prob: float = 0., training: bool = False):
  class DropPath (line 33) | class DropPath(nn.Module):
    method __init__ (line 36) | def __init__(self, drop_prob=None):
    method forward (line 40) | def forward(self, x):
  class Mlp (line 44) | class Mlp(nn.Module):
    method __init__ (line 45) | def __init__(self, in_features, hidden_features=None, out_features=Non...
    method forward (line 54) | def forward(self, x):
  class Attention (line 63) | class Attention(nn.Module):
    method __init__ (line 64) | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, at...
    method forward (line 87) | def forward(self, x, x_rel_pos_bias = None):
  class Block (line 121) | class Block(nn.Module):
    method __init__ (line 122) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method forward (line 141) | def forward(self, x, x_rel_pos_bias=None, return_attention=False):
  class PatchEmbed (line 172) | class PatchEmbed(nn.Module):
    method __init__ (line 175) | def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, emb...
    method forward (line 188) | def forward(self, x, mask=None):
  class VisionTransformer (line 192) | class VisionTransformer(nn.Module):
    method __init__ (line 194) | def __init__(self, img_size=[224], patch_size=16, in_chans=3, num_clas...
    method _init_weights (line 258) | def _init_weights(self, m):
    method interpolate_pos_encoding (line 267) | def interpolate_pos_encoding(self, x, w, h):
    method prepare_tokens (line 290) | def prepare_tokens(self, x, mask=None):
    method forward (line 310) | def forward(self, x, return_all_tokens=None, mask=None):
    method get_last_selfattention (line 331) | def get_last_selfattention(self, x):
    method get_intermediate_layers (line 340) | def get_intermediate_layers(self, x, n=1):
    method get_num_layers (line 350) | def get_num_layers(self):
    method mask_model (line 353) | def mask_model(self, x, mask):
  function vit_tiny (line 357) | def vit_tiny(patch_size=16, **kwargs):
  function vit_small (line 363) | def vit_small(patch_size=16, **kwargs):
  function vit_base (line 369) | def vit_base(patch_size=16, **kwargs):
  function vit_large (line 375) | def vit_large(patch_size=16, **kwargs):

FILE: downstream_tasks/detection/utils.py
  class GaussianBlur (line 29) | class GaussianBlur(object):
    method __init__ (line 33) | def __init__(self, p=0.5, radius_min=0.1, radius_max=2.):
    method __call__ (line 38) | def __call__(self, img):
  class Solarization (line 50) | class Solarization(object):
    method __init__ (line 54) | def __init__(self, p):
    method __call__ (line 57) | def __call__(self, img):
  class PermutePatch (line 64) | class PermutePatch(object):
    method __init__ (line 68) | def __init__(self, psz):
    method __call__ (line 71) | def __call__(self, img):
  class HideAndSeek (line 87) | class HideAndSeek(object):
    method __init__ (line 91) | def __init__(self, ratio, psz):
    method __call__ (line 95) | def __call__(self, img):
  function load_pretrained_weights (line 111) | def load_pretrained_weights(model, pretrained_weights, checkpoint_key, m...
  function clip_gradients (line 154) | def clip_gradients(model, clip):
  function cancel_gradients_last_layer (line 166) | def cancel_gradients_last_layer(epoch, model, freeze_last_layer):
  function restart_from_checkpoint (line 174) | def restart_from_checkpoint(ckp_path, run_variables=None, **kwargs):
  function cosine_scheduler (line 209) | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warm...
  function bool_flag (line 223) | def bool_flag(s):
  function fix_random_seeds (line 237) | def fix_random_seeds(seed=31):
  class SmoothedValue (line 248) | class SmoothedValue(object):
    method __init__ (line 253) | def __init__(self, window_size=20, fmt=None):
    method update (line 261) | def update(self, value, n=1):
    method synchronize_between_processes (line 266) | def synchronize_between_processes(self):
    method median (line 280) | def median(self):
    method avg (line 285) | def avg(self):
    method global_avg (line 290) | def global_avg(self):
    method max (line 294) | def max(self):
    method value (line 298) | def value(self):
    method __str__ (line 301) | def __str__(self):
  function reduce_dict (line 310) | def reduce_dict(input_dict, average=True):
  class MetricLogger (line 337) | class MetricLogger(object):
    method __init__ (line 338) | def __init__(self, delimiter="\t"):
    method update (line 342) | def update(self, **kwargs):
    method __getattr__ (line 349) | def __getattr__(self, attr):
    method __str__ (line 357) | def __str__(self):
    method synchronize_between_processes (line 365) | def synchronize_between_processes(self):
    method add_meter (line 369) | def add_meter(self, name, meter):
    method log_every (line 372) | def log_every(self, iterable, print_freq, header=None):
  function get_sha (line 427) | def get_sha():
  function is_dist_avail_and_initialized (line 447) | def is_dist_avail_and_initialized():
  function get_world_size (line 455) | def get_world_size():
  function get_rank (line 461) | def get_rank():
  function is_main_process (line 467) | def is_main_process():
  function save_on_master (line 471) | def save_on_master(*args, **kwargs):
  function setup_for_distributed (line 476) | def setup_for_distributed(is_master):
  function init_distributed_mode (line 491) | def init_distributed_mode(args):
  function accuracy (line 526) | def accuracy(output, target, topk=(1,)):
  function _no_grad_trunc_normal_ (line 536) | def _no_grad_trunc_normal_(tensor, mean, std, a, b):
  function trunc_normal_ (line 572) | def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
  class LARS (line 577) | class LARS(torch.optim.Optimizer):
    method __init__ (line 581) | def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, eta=0.001,
    method step (line 589) | def step(self):
  function create_ds_config (line 617) | def create_ds_config(args):
  class MultiCropWrapper (line 648) | class MultiCropWrapper(nn.Module):
    method __init__ (line 657) | def __init__(self, backbone, head=None):
    method forward (line 667) | def forward(self, x, mask=None, return_backbone_feat=False,
  function get_params_groups (line 698) | def get_params_groups(model):
  function has_batchnorms (line 712) | def has_batchnorms(model):
  function concat_all_gather (line 720) | def concat_all_gather(tensor):
  class PCA (line 733) | class PCA():
    method __init__ (line 737) | def __init__(self, dim=256, whit=0.5):
    method train_pca (line 742) | def train_pca(self, cov):
    method apply (line 768) | def apply(self, x):
  function compute_ap (line 787) | def compute_ap(ranks, nres):
  function compute_map (line 822) | def compute_map(ranks, gnd, kappas=[]):

FILE: downstream_tasks/semantic_segmentation/backbone/beit.py
  class DropPath (line 29) | class DropPath(nn.Module):
    method __init__ (line 32) | def __init__(self, drop_prob=None):
    method forward (line 36) | def forward(self, x):
    method extra_repr (line 39) | def extra_repr(self) -> str:
  class Mlp (line 43) | class Mlp(nn.Module):
    method __init__ (line 44) | def __init__(self, in_features, hidden_features=None, out_features=Non...
    method forward (line 53) | def forward(self, x):
  class Attention (line 63) | class Attention(nn.Module):
    method __init__ (line 64) | def __init__(
    method forward (line 120) | def forward(self, x, rel_pos_bias=None):
  class Block (line 153) | class Block(nn.Module):
    method __init__ (line 155) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method forward (line 175) | def forward(self, x, rel_pos_bias=None):
  class PatchEmbed (line 185) | class PatchEmbed(nn.Module):
    method __init__ (line 188) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
    method forward (line 200) | def forward(self, x, **kwargs):
  class HybridEmbed (line 212) | class HybridEmbed(nn.Module):
    method __init__ (line 216) | def __init__(self, backbone, img_size=224, feature_size=None, in_chans...
    method forward (line 240) | def forward(self, x):
  class RelativePositionBias (line 247) | class RelativePositionBias(nn.Module):
    method __init__ (line 249) | def __init__(self, window_size, num_heads):
    method forward (line 278) | def forward(self):
  function get_sinusoid_encoding_table (line 285) | def get_sinusoid_encoding_table(n_position, d_hid, token=False):
  class BEiT (line 301) | class BEiT(nn.Module):
    method __init__ (line 304) | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classe...
    method build_2d_sincos_position_embedding (line 398) | def build_2d_sincos_position_embedding(self, embed_dim=768, temperatur...
    method fix_init_weight (line 416) | def fix_init_weight(self):
    method _init_weights (line 424) | def _init_weights(self, m):
    method init_weights (line 433) | def init_weights(self, pretrained=None):
    method get_num_layers (line 459) | def get_num_layers(self):
    method no_weight_decay (line 463) | def no_weight_decay(self):
    method forward_features (line 466) | def forward_features(self, x):
    method forward (line 502) | def forward(self, x):

FILE: downstream_tasks/semantic_segmentation/backbone/beit_fapn.py
  class DropPath (line 28) | class DropPath(nn.Module):
    method __init__ (line 31) | def __init__(self, drop_prob=None):
    method forward (line 35) | def forward(self, x):
    method extra_repr (line 38) | def extra_repr(self) -> str:
  class Mlp (line 42) | class Mlp(nn.Module):
    method __init__ (line 43) | def __init__(self, in_features, hidden_features=None, out_features=Non...
    method forward (line 52) | def forward(self, x):
  class Attention (line 62) | class Attention(nn.Module):
    method __init__ (line 63) | def __init__(
    method forward (line 119) | def forward(self, x, rel_pos_bias=None):
  class Block (line 152) | class Block(nn.Module):
    method __init__ (line 154) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method forward (line 174) | def forward(self, x, rel_pos_bias=None):
  class PatchEmbed (line 184) | class PatchEmbed(nn.Module):
    method __init__ (line 187) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
    method forward (line 199) | def forward(self, x, **kwargs):
  class HybridEmbed (line 211) | class HybridEmbed(nn.Module):
    method __init__ (line 215) | def __init__(self, backbone, img_size=224, feature_size=None, in_chans...
    method forward (line 239) | def forward(self, x):
  class RelativePositionBias (line 246) | class RelativePositionBias(nn.Module):
    method __init__ (line 248) | def __init__(self, window_size, num_heads):
    method forward (line 277) | def forward(self):
  class FeatureSelectionModule (line 284) | class FeatureSelectionModule(nn.Module):
    method __init__ (line 285) | def __init__(self, in_c, out_c, norm="GM"):
    method forward (line 300) | def forward(self, x):
  class FeatureAlign (line 307) | class FeatureAlign(nn.Module):
    method __init__ (line 308) | def __init__(self, in_c, out_c, norm=None):
    method forward (line 314) | def forward(self, feat_l, feat_s):
  class BEiT_FaPN (line 328) | class BEiT_FaPN(nn.Module):
    method __init__ (line 331) | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classe...
    method fix_init_weight (line 415) | def fix_init_weight(self):
    method _init_weights (line 423) | def _init_weights(self, m):
    method init_weights (line 432) | def init_weights(self, pretrained=None):
    method get_num_layers (line 458) | def get_num_layers(self):
    method no_weight_decay (line 462) | def no_weight_decay(self):
    method forward_features (line 465) | def forward_features(self, x):
    method forward (line 499) | def forward(self, x):

FILE: downstream_tasks/semantic_segmentation/backbone/cae.py
  class DropPath (line 29) | class DropPath(nn.Module):
    method __init__ (line 32) | def __init__(self, drop_prob=None):
    method forward (line 36) | def forward(self, x):
    method extra_repr (line 39) | def extra_repr(self) -> str:
  class Mlp (line 43) | class Mlp(nn.Module):
    method __init__ (line 44) | def __init__(self, in_features, hidden_features=None, out_features=Non...
    method forward (line 53) | def forward(self, x):
  class Attention (line 63) | class Attention(nn.Module):
    method __init__ (line 64) | def __init__(
    method forward (line 120) | def forward(self, x, rel_pos_bias=None):
  class Block (line 153) | class Block(nn.Module):
    method __init__ (line 155) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method forward (line 175) | def forward(self, x, rel_pos_bias=None):
  class PatchEmbed (line 185) | class PatchEmbed(nn.Module):
    method __init__ (line 188) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
    method forward (line 200) | def forward(self, x, **kwargs):
  class HybridEmbed (line 212) | class HybridEmbed(nn.Module):
    method __init__ (line 216) | def __init__(self, backbone, img_size=224, feature_size=None, in_chans...
    method forward (line 240) | def forward(self, x):
  class RelativePositionBias (line 247) | class RelativePositionBias(nn.Module):
    method __init__ (line 249) | def __init__(self, window_size, num_heads):
    method forward (line 278) | def forward(self):
  function get_sinusoid_encoding_table (line 285) | def get_sinusoid_encoding_table(n_position, d_hid, token=False):
  class CAE (line 301) | class CAE(nn.Module):
    method __init__ (line 304) | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classe...
    method build_2d_sincos_position_embedding (line 398) | def build_2d_sincos_position_embedding(self, embed_dim=768, temperatur...
    method fix_init_weight (line 416) | def fix_init_weight(self):
    method _init_weights (line 424) | def _init_weights(self, m):
    method init_weights (line 433) | def init_weights(self, pretrained=None):
    method get_num_layers (line 459) | def get_num_layers(self):
    method no_weight_decay (line 463) | def no_weight_decay(self):
    method forward_features (line 466) | def forward_features(self, x):
    method forward (line 502) | def forward(self, x):

FILE: downstream_tasks/semantic_segmentation/backbone/fapn.py
  class FeatureSelectionModule (line 1) | class FeatureSelectionModule(nn.Module):
    method __init__ (line 2) | def __init__(self, in_c, out_c, norm="GM"):
  class FeatureAlign (line 24) | class FeatureAlign(nn.Module):
    method __init__ (line 25) | def __init__(self, in_c, out_c, norm=None):
    method forward (line 31) | def forward(self, teat_l, feat_s):

FILE: downstream_tasks/semantic_segmentation/backbone/mae.py
  class DropPath (line 29) | class DropPath(nn.Module):
    method __init__ (line 32) | def __init__(self, drop_prob=None):
    method forward (line 36) | def forward(self, x):
    method extra_repr (line 39) | def extra_repr(self) -> str:
  class Mlp (line 43) | class Mlp(nn.Module):
    method __init__ (line 44) | def __init__(self, in_features, hidden_features=None, out_features=Non...
    method forward (line 53) | def forward(self, x):
  class Attention (line 63) | class Attention(nn.Module):
    method __init__ (line 64) | def __init__(
    method forward (line 120) | def forward(self, x, rel_pos_bias=None):
  class Block (line 153) | class Block(nn.Module):
    method __init__ (line 155) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method forward (line 175) | def forward(self, x, rel_pos_bias=None):
  class PatchEmbed (line 185) | class PatchEmbed(nn.Module):
    method __init__ (line 188) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
    method forward (line 200) | def forward(self, x, **kwargs):
  class HybridEmbed (line 212) | class HybridEmbed(nn.Module):
    method __init__ (line 216) | def __init__(self, backbone, img_size=224, feature_size=None, in_chans...
    method forward (line 240) | def forward(self, x):
  class RelativePositionBias (line 247) | class RelativePositionBias(nn.Module):
    method __init__ (line 249) | def __init__(self, window_size, num_heads):
    method forward (line 278) | def forward(self):
  function get_sinusoid_encoding_table (line 285) | def get_sinusoid_encoding_table(n_position, d_hid, token=False):
  class MAE (line 301) | class MAE(nn.Module):
    method __init__ (line 304) | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classe...
    method build_2d_sincos_position_embedding (line 389) | def build_2d_sincos_position_embedding(self, embed_dim=768, temperatur...
    method fix_init_weight (line 407) | def fix_init_weight(self):
    method _init_weights (line 415) | def _init_weights(self, m):
    method init_weights (line 424) | def init_weights(self, pretrained=None):
    method get_num_layers (line 450) | def get_num_layers(self):
    method no_weight_decay (line 454) | def no_weight_decay(self):
    method forward_features (line 457) | def forward_features(self, x):
    method forward (line 492) | def forward(self, x):

FILE: downstream_tasks/semantic_segmentation/mmcv_custom/apex_runner/apex_iter_based_runner.py
  class IterBasedRunnerAmp (line 20) | class IterBasedRunnerAmp(IterBasedRunner):
    method save_checkpoint (line 26) | def save_checkpoint(self,
    method resume (line 68) | def resume(self,

FILE: downstream_tasks/semantic_segmentation/mmcv_custom/apex_runner/checkpoint.py
  function save_checkpoint (line 19) | def save_checkpoint(model, filename, optimizer=None, meta=None):

FILE: downstream_tasks/semantic_segmentation/mmcv_custom/apex_runner/optimizer.py
  class DistOptimizerHook (line 9) | class DistOptimizerHook(OptimizerHook):
    method __init__ (line 12) | def __init__(self, update_interval=1, grad_clip=None, coalesce=True, b...
    method before_run (line 19) | def before_run(self, runner):
    method after_train_iter (line 22) | def after_train_iter(self, runner):

FILE: downstream_tasks/semantic_segmentation/mmcv_custom/checkpoint.py
  function _get_mmcv_home (line 34) | def _get_mmcv_home():
  function load_state_dict (line 45) | def load_state_dict(module, state_dict, strict=False, logger=None):
  function load_url_dist (line 113) | def load_url_dist(url, model_dir=None, map_location="cpu"):
  function load_pavimodel_dist (line 127) | def load_pavimodel_dist(model_path, map_location=None):
  function load_fileclient_dist (line 155) | def load_fileclient_dist(filename, backend, map_location):
  function get_torchvision_models (line 176) | def get_torchvision_models():
  function get_external_models (line 188) | def get_external_models():
  function get_mmcls_models (line 202) | def get_mmcls_models():
  function get_deprecated_model_names (line 209) | def get_deprecated_model_names():
  function _process_mmcls_checkpoint (line 218) | def _process_mmcls_checkpoint(checkpoint):
  function _load_checkpoint (line 229) | def _load_checkpoint(filename, map_location=None):
  function cosine_scheduler (line 290) | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warm...
  function load_checkpoint (line 310) | def load_checkpoint(model,
  function weights_to_cpu (line 534) | def weights_to_cpu(state_dict):
  function _save_to_state_dict (line 549) | def _save_to_state_dict(module, destination, prefix, keep_vars):
  function get_state_dict (line 569) | def get_state_dict(module, destination=None, prefix='', keep_vars=False):
  function save_checkpoint (line 613) | def save_checkpoint(model, filename, optimizer=None, meta=None):

FILE: downstream_tasks/semantic_segmentation/mmcv_custom/checkpoint_beit.py
  function _get_mmcv_home (line 34) | def _get_mmcv_home():
  function load_state_dict (line 45) | def load_state_dict(module, state_dict, strict=False, logger=None):
  function load_url_dist (line 113) | def load_url_dist(url, model_dir=None, map_location="cpu"):
  function load_pavimodel_dist (line 127) | def load_pavimodel_dist(model_path, map_location=None):
  function load_fileclient_dist (line 155) | def load_fileclient_dist(filename, backend, map_location):
  function get_torchvision_models (line 176) | def get_torchvision_models():
  function get_external_models (line 188) | def get_external_models():
  function get_mmcls_models (line 202) | def get_mmcls_models():
  function get_deprecated_model_names (line 209) | def get_deprecated_model_names():
  function _process_mmcls_checkpoint (line 218) | def _process_mmcls_checkpoint(checkpoint):
  function _load_checkpoint (line 229) | def _load_checkpoint(filename, map_location=None):
  function cosine_scheduler (line 290) | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warm...
  function load_checkpoint (line 310) | def load_checkpoint(model,
  function weights_to_cpu (line 515) | def weights_to_cpu(state_dict):
  function _save_to_state_dict (line 530) | def _save_to_state_dict(module, destination, prefix, keep_vars):
  function get_state_dict (line 550) | def get_state_dict(module, destination=None, prefix='', keep_vars=False):
  function save_checkpoint (line 594) | def save_checkpoint(model, filename, optimizer=None, meta=None):

FILE: downstream_tasks/semantic_segmentation/mmcv_custom/layer_decay_optimizer_constructor.py
  function get_num_layer_for_vit (line 6) | def get_num_layer_for_vit(var_name, num_max_layer):
  class LayerDecayOptimizerConstructor (line 19) | class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
    method add_params (line 20) | def add_params(self, params, module, prefix='', is_dcn_module=None):

FILE: downstream_tasks/semantic_segmentation/mmcv_custom/resize_transform.py
  class SETR_Resize (line 8) | class SETR_Resize(object):
    method __init__ (line 35) | def __init__(self,
    method random_select (line 66) | def random_select(img_scales):
    method random_sample (line 84) | def random_sample(img_scales):
    method random_sample_ratio (line 111) | def random_sample_ratio(img_scale, ratio_range):
    method _random_scale (line 137) | def _random_scale(self, results):
    method _resize_img (line 170) | def _resize_img(self, results):
    method _resize_seg (line 206) | def _resize_seg(self, results):
    method __call__ (line 217) | def __call__(self, results):
    method __repr__ (line 235) | def __repr__(self):

FILE: downstream_tasks/semantic_segmentation/mmcv_custom/train_api.py
  function set_random_seed (line 18) | def set_random_seed(seed, deterministic=False):
  function train_segmentor (line 37) | def train_segmentor(model,

FILE: downstream_tasks/semantic_segmentation/mmseg/__init__.py
  function digit_version (line 9) | def digit_version(version_str):

FILE: downstream_tasks/semantic_segmentation/mmseg/apis/inference.py
  function init_segmentor (line 11) | def init_segmentor(config, checkpoint=None, device='cuda:0'):
  class LoadImage (line 42) | class LoadImage:
    method __call__ (line 45) | def __call__(self, results):
  function inference_segmentor (line 69) | def inference_segmentor(model, img):
  function show_result_pyplot (line 101) | def show_result_pyplot(model, img, result, palette=None, fig_size=(15, 1...

FILE: downstream_tasks/semantic_segmentation/mmseg/apis/test.py
  function np2tmp (line 14) | def np2tmp(array, temp_file_name=None):
  function single_gpu_test (line 34) | def single_gpu_test(model,
  function multi_gpu_test (line 102) | def multi_gpu_test(model,
  function collect_results_cpu (line 160) | def collect_results_cpu(result_part, size, tmpdir=None):
  function collect_results_gpu (line 203) | def collect_results_gpu(result_part, size):

FILE: downstream_tasks/semantic_segmentation/mmseg/apis/train.py
  function set_random_seed (line 14) | def set_random_seed(seed, deterministic=False):
  function train_segmentor (line 33) | def train_segmentor(model,

FILE: downstream_tasks/semantic_segmentation/mmseg/core/evaluation/class_names.py
  function cityscapes_classes (line 4) | def cityscapes_classes():
  function ade_classes (line 14) | def ade_classes():
  function voc_classes (line 44) | def voc_classes():
  function cityscapes_palette (line 54) | def cityscapes_palette():
  function ade_palette (line 63) | def ade_palette():
  function voc_palette (line 105) | def voc_palette():
  function get_classes (line 121) | def get_classes(dataset):
  function get_palette (line 138) | def get_palette(dataset):

FILE: downstream_tasks/semantic_segmentation/mmseg/core/evaluation/eval_hooks.py
  class EvalHook (line 7) | class EvalHook(Hook):
    method __init__ (line 15) | def __init__(self, dataloader, interval=1, by_epoch=False, **eval_kwar...
    method after_train_iter (line 24) | def after_train_iter(self, runner):
    method after_train_epoch (line 33) | def after_train_epoch(self, runner):
    method evaluate (line 42) | def evaluate(self, runner, results):
  class DistEvalHook (line 51) | class DistEvalHook(EvalHook):
    method __init__ (line 63) | def __init__(self,
    method after_train_iter (line 79) | def after_train_iter(self, runner):
    method after_train_epoch (line 94) | def after_train_epoch(self, runner):

FILE: downstream_tasks/semantic_segmentation/mmseg/core/evaluation/metrics.py
  function intersect_and_union (line 5) | def intersect_and_union(pred_label,
  function total_intersect_and_union (line 62) | def total_intersect_and_union(results,
  function mean_iou (line 105) | def mean_iou(results,
  function mean_dice (line 142) | def mean_dice(results,
  function eval_metrics (line 179) | def eval_metrics(results,

FILE: downstream_tasks/semantic_segmentation/mmseg/core/seg/builder.py
  function build_pixel_sampler (line 6) | def build_pixel_sampler(cfg, **default_args):

FILE: downstream_tasks/semantic_segmentation/mmseg/core/seg/sampler/base_pixel_sampler.py
  class BasePixelSampler (line 4) | class BasePixelSampler(metaclass=ABCMeta):
    method __init__ (line 7) | def __init__(self, **kwargs):
    method sample (line 11) | def sample(self, seg_logit, seg_label):

FILE: downstream_tasks/semantic_segmentation/mmseg/core/seg/sampler/ohem_pixel_sampler.py
  class OHEMPixelSampler (line 9) | class OHEMPixelSampler(BasePixelSampler):
    method __init__ (line 23) | def __init__(self, context, thresh=None, min_kept=100000):
    method sample (line 30) | def sample(self, seg_logit, seg_label):

FILE: downstream_tasks/semantic_segmentation/mmseg/core/utils/misc.py
  function add_prefix (line 1) | def add_prefix(inputs, prefix):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/ade.py
  class ADE20KDataset (line 6) | class ADE20KDataset(CustomDataset):
    method __init__ (line 79) | def __init__(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/builder.py
  function _concat_dataset (line 25) | def _concat_dataset(cfg, default_args=None):
  function build_dataset (line 61) | def build_dataset(cfg, default_args=None):
  function build_dataloader (line 78) | def build_dataloader(dataset,
  function worker_init_fn (line 155) | def worker_init_fn(worker_id, num_workers, rank, seed):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/chase_db1.py
  class ChaseDB1Dataset (line 8) | class ChaseDB1Dataset(CustomDataset):
    method __init__ (line 21) | def __init__(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/cityscapes.py
  class CityscapesDataset (line 14) | class CityscapesDataset(CustomDataset):
    method __init__ (line 32) | def __init__(self, **kwargs):
    method _convert_to_label_id (line 39) | def _convert_to_label_id(result):
    method results2img (line 50) | def results2img(self, results, imgfile_prefix, to_label_id):
    method format_results (line 91) | def format_results(self, results, imgfile_prefix=None, to_label_id=True):
    method evaluate (line 124) | def evaluate(self,
    method _evaluate_cityscapes (line 164) | def _evaluate_cityscapes(self, results, logger, imgfile_prefix):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/coco_stuff.py
  class COCOStuffDataset (line 6) | class COCOStuffDataset(CustomDataset):
    method __init__ (line 90) | def __init__(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/custom.py
  class CustomDataset (line 18) | class CustomDataset(Dataset):
    method __init__ (line 74) | def __init__(self,
    method __len__ (line 115) | def __len__(self):
    method load_annotations (line 119) | def load_annotations(self, img_dir, img_suffix, ann_dir, seg_map_suffix,
    method get_ann_info (line 157) | def get_ann_info(self, idx):
    method pre_pipeline (line 169) | def pre_pipeline(self, results):
    method __getitem__ (line 177) | def __getitem__(self, idx):
    method prepare_train_img (line 193) | def prepare_train_img(self, idx):
    method prepare_test_img (line 210) | def prepare_test_img(self, idx):
    method format_results (line 226) | def format_results(self, results, **kwargs):
    method get_gt_seg_maps (line 230) | def get_gt_seg_maps(self, efficient_test=False):
    method get_classes_and_palette (line 243) | def get_classes_and_palette(self, classes=None, palette=None):
    method get_palette_for_custom_classes (line 287) | def get_palette_for_custom_classes(self, class_names, palette=None):
    method evaluate (line 306) | def evaluate(self,

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/dataset_wrappers.py
  class ConcatDataset (line 7) | class ConcatDataset(_ConcatDataset):
    method __init__ (line 17) | def __init__(self, datasets):
  class RepeatDataset (line 24) | class RepeatDataset(object):
    method __init__ (line 37) | def __init__(self, dataset, times):
    method __getitem__ (line 44) | def __getitem__(self, idx):
    method __len__ (line 48) | def __len__(self):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/drive.py
  class DRIVEDataset (line 8) | class DRIVEDataset(CustomDataset):
    method __init__ (line 21) | def __init__(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/hrf.py
  class HRFDataset (line 8) | class HRFDataset(CustomDataset):
    method __init__ (line 21) | def __init__(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/pascal_context.py
  class PascalContextDataset (line 8) | class PascalContextDataset(CustomDataset):
    method __init__ (line 47) | def __init__(self, split, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/compose.py
  class Compose (line 9) | class Compose(object):
    method __init__ (line 17) | def __init__(self, transforms):
    method __call__ (line 29) | def __call__(self, data):
    method __repr__ (line 45) | def __repr__(self):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/formating.py
  function to_tensor (line 11) | def to_tensor(data):
  class ToTensor (line 37) | class ToTensor(object):
    method __init__ (line 44) | def __init__(self, keys):
    method __call__ (line 47) | def __call__(self, results):
    method __repr__ (line 62) | def __repr__(self):
  class ImageToTensor (line 67) | class ImageToTensor(object):
    method __init__ (line 78) | def __init__(self, keys):
    method __call__ (line 81) | def __call__(self, results):
    method __repr__ (line 100) | def __repr__(self):
  class Transpose (line 105) | class Transpose(object):
    method __init__ (line 113) | def __init__(self, keys, order):
    method __call__ (line 117) | def __call__(self, results):
    method __repr__ (line 133) | def __repr__(self):
  class ToDataContainer (line 139) | class ToDataContainer(object):
    method __init__ (line 150) | def __init__(self,
    method __call__ (line 155) | def __call__(self, results):
    method __repr__ (line 173) | def __repr__(self):
  class DefaultFormatBundle (line 178) | class DefaultFormatBundle(object):
    method __call__ (line 189) | def __call__(self, results):
    method __repr__ (line 214) | def __repr__(self):
  class Collect (line 219) | class Collect(object):
    method __init__ (line 256) | def __init__(self,
    method __call__ (line 264) | def __call__(self, results):
    method __repr__ (line 286) | def __repr__(self):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/loading.py
  class LoadImageFromFile (line 10) | class LoadImageFromFile(object):
    method __init__ (line 31) | def __init__(self,
    method __call__ (line 42) | def __call__(self, results):
    method __repr__ (line 81) | def __repr__(self):
  class LoadAnnotations (line 90) | class LoadAnnotations(object):
    method __init__ (line 104) | def __init__(self,
    method __call__ (line 113) | def __call__(self, results):
    method __repr__ (line 149) | def __repr__(self):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/test_time_aug.py
  class MultiScaleFlipAug (line 10) | class MultiScaleFlipAug(object):
    method __init__ (line 53) | def __init__(self,
    method __call__ (line 93) | def __call__(self, results):
    method __repr__ (line 128) | def __repr__(self):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/transforms.py
  class Resize (line 10) | class Resize(object):
    method __init__ (line 41) | def __init__(self,
    method random_select (line 68) | def random_select(img_scales):
    method random_sample (line 86) | def random_sample(img_scales):
    method random_sample_ratio (line 113) | def random_sample_ratio(img_scale, ratio_range):
    method _random_scale (line 139) | def _random_scale(self, results):
    method _resize_img (line 177) | def _resize_img(self, results):
    method _resize_seg (line 199) | def _resize_seg(self, results):
    method __call__ (line 210) | def __call__(self, results):
    method __repr__ (line 228) | def __repr__(self):
  class RandomFlip (line 238) | class RandomFlip(object):
    method __init__ (line 252) | def __init__(self, prob=None, direction='horizontal'):
    method __call__ (line 259) | def __call__(self, results):
    method __repr__ (line 288) | def __repr__(self):
  class Pad (line 293) | class Pad(object):
    method __init__ (line 308) | def __init__(self,
    method _pad_img (line 321) | def _pad_img(self, results):
    method _pad_seg (line 334) | def _pad_seg(self, results):
    method __call__ (line 342) | def __call__(self, results):
    method __repr__ (line 356) | def __repr__(self):
  class Normalize (line 364) | class Normalize(object):
    method __init__ (line 376) | def __init__(self, mean, std, to_rgb=True):
    method __call__ (line 381) | def __call__(self, results):
    method __repr__ (line 398) | def __repr__(self):
  class Rerange (line 406) | class Rerange(object):
    method __init__ (line 416) | def __init__(self, min_value=0, max_value=255):
    method __call__ (line 423) | def __call__(self, results):
    method __repr__ (line 445) | def __repr__(self):
  class CLAHE (line 452) | class CLAHE(object):
    method __init__ (line 465) | def __init__(self, clip_limit=40.0, tile_grid_size=(8, 8)):
    method __call__ (line 472) | def __call__(self, results):
    method __repr__ (line 489) | def __repr__(self):
  class RandomCrop (line 497) | class RandomCrop(object):
    method __init__ (line 506) | def __init__(self, crop_size, cat_max_ratio=1., ignore_index=255):
    method get_crop_bbox (line 512) | def get_crop_bbox(self, img):
    method crop (line 523) | def crop(self, img, crop_bbox):
    method __call__ (line 529) | def __call__(self, results):
    method __repr__ (line 565) | def __repr__(self):
  class RandomRotate (line 570) | class RandomRotate(object):
    method __init__ (line 588) | def __init__(self,
    method __call__ (line 609) | def __call__(self, results):
    method __repr__ (line 641) | def __repr__(self):
  class RGB2Gray (line 653) | class RGB2Gray(object):
    method __init__ (line 668) | def __init__(self, out_channels=None, weights=(0.299, 0.587, 0.114)):
    method __call__ (line 676) | def __call__(self, results):
    method __repr__ (line 700) | def __repr__(self):
  class AdjustGamma (line 708) | class AdjustGamma(object):
    method __init__ (line 716) | def __init__(self, gamma=1.0):
    method __call__ (line 724) | def __call__(self, results):
    method __repr__ (line 739) | def __repr__(self):
  class SegRescale (line 744) | class SegRescale(object):
    method __init__ (line 751) | def __init__(self, scale_factor=1):
    method __call__ (line 754) | def __call__(self, results):
    method __repr__ (line 769) | def __repr__(self):
  class PhotoMetricDistortion (line 774) | class PhotoMetricDistortion(object):
    method __init__ (line 795) | def __init__(self,
    method convert (line 805) | def convert(self, img, alpha=1, beta=0):
    method brightness (line 811) | def brightness(self, img):
    method contrast (line 820) | def contrast(self, img):
    method saturation (line 828) | def saturation(self, img):
    method hue (line 839) | def hue(self, img):
    method __call__ (line 849) | def __call__(self, results):
    method __repr__ (line 882) | def __repr__(self):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/stare.py
  class STAREDataset (line 8) | class STAREDataset(CustomDataset):
    method __init__ (line 21) | def __init__(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/datasets/voc.py
  class PascalVOCDataset (line 8) | class PascalVOCDataset(CustomDataset):
    method __init__ (line 26) | def __init__(self, split, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/backbones/cgnet.py
  class GlobalContextExtractor (line 13) | class GlobalContextExtractor(nn.Module):
    method __init__ (line 26) | def __init__(self, channel, reduction=16, with_cp=False):
    method forward (line 37) | def forward(self, x):
  class ContextGuidedBlock (line 53) | class ContextGuidedBlock(nn.Module):
    method __init__ (line 78) | def __init__(self,
    method forward (line 142) | def forward(self, x):
  class InputInjection (line 170) | class InputInjection(nn.Module):
    method __init__ (line 173) | def __init__(self, num_downsampling):
    method forward (line 179) | def forward(self, x):
  class CGNet (line 186) | class CGNet(nn.Module):
    method __init__ (line 215) | def __init__(self,
    method forward (line 309) | def forward(self, x):
    method init_weights (line 338) | def init_weights(self, pretrained=None):
    method train (line 359) | def train(self, mode=True):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/backbones/fast_scnn.py
  class LearningToDownsample (line 13) | class LearningToDownsample(nn.Module):
    method __init__ (line 29) | def __init__(self,
    method forward (line 66) | def forward(self, x):
  class GlobalFeatureExtractor (line 73) | class GlobalFeatureExtractor(nn.Module):
    method __init__ (line 106) | def __init__(self,
    method _make_layer (line 148) | def _make_layer(self,
    method forward (line 172) | def forward(self, x):
  class FeatureFusionModule (line 181) | class FeatureFusionModule(nn.Module):
    method __init__ (line 199) | def __init__(self,
    method forward (line 235) | def forward(self, higher_res_feature, lower_res_feature):
  class FastSCNN (line 250) | class FastSCNN(nn.Module):
    method __init__ (line 296) | def __init__(self,
    method init_weights (line 360) | def init_weights(self, pretrained=None):
    method forward (line 367) | def forward(self, x):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/backbones/hrnet.py
  class HRModule (line 13) | class HRModule(nn.Module):
    method __init__ (line 20) | def __init__(self,
    method _check_branches (line 46) | def _check_branches(self, num_branches, num_blocks, in_channels,
    method _make_one_branch (line 64) | def _make_one_branch(self,
    method _make_branches (line 109) | def _make_branches(self, num_branches, block, num_blocks, num_channels):
    method _make_fuse_layers (line 119) | def _make_fuse_layers(self):
    method forward (line 185) | def forward(self, x):
  class HRNet (line 212) | class HRNet(nn.Module):
    method __init__ (line 273) | def __init__(self,
    method norm1 (line 362) | def norm1(self):
    method norm2 (line 367) | def norm2(self):
    method _make_transition_layer (line 371) | def _make_transition_layer(self, num_channels_pre_layer,
    method _make_layer (line 418) | def _make_layer(self, block, inplanes, planes, blocks, stride=1):
    method _make_stage (line 454) | def _make_stage(self, layer_config, in_channels, multiscale_output=True):
    method init_weights (line 484) | def init_weights(self, pretrained=None):
    method forward (line 510) | def forward(self, x):
    method train (line 547) | def train(self, mode=True):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/backbones/mobilenet_v2.py
  class MobileNetV2 (line 13) | class MobileNetV2(nn.Module):
    method __init__ (line 45) | def __init__(self,
    method make_layer (line 107) | def make_layer(self, out_channels, num_blocks, stride, dilation,
    method init_weights (line 136) | def init_weights(self, pretrained=None):
    method forward (line 149) | def forward(self, x):
    method _freeze_stages (line 164) | def _freeze_stages(self):
    method train (line 174) | def train(self, mode=True):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/backbones/mobilenet_v3.py
  class MobileNetV3 (line 15) | class MobileNetV3(nn.Module):
    method __init__ (line 70) | def __init__(self,
    method _make_layer (line 104) | def _make_layer(self):
    method init_weights (line 220) | def init_weights(self, pretrained=None):
    method forward (line 233) | def forward(self, x):
    method _freeze_stages (line 242) | def _freeze_stages(self):
    method train (line 249) | def train(self, mode=True):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/backbones/resnest.py
  class RSoftmax (line 15) | class RSoftmax(nn.Module):
    method __init__ (line 23) | def __init__(self, radix, groups):
    method forward (line 28) | def forward(self, x):
  class SplitAttentionConv2d (line 39) | class SplitAttentionConv2d(nn.Module):
    method __init__ (line 58) | def __init__(self,
    method norm0 (line 108) | def norm0(self):
    method norm1 (line 113) | def norm1(self):
    method forward (line 117) | def forward(self, x):
  class Bottleneck (line 146) | class Bottleneck(_Bottleneck):
    method __init__ (line 165) | def __init__(self,
    method forward (line 226) | def forward(self, x):
  class ResNeSt (line 270) | class ResNeSt(ResNetV1d):
    method __init__ (line 291) | def __init__(self,
    method make_res_layer (line 305) | def make_res_layer(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/backbones/resnet.py
  class BasicBlock (line 13) | class BasicBlock(nn.Module):
    method __init__ (line 18) | def __init__(self,
    method norm1 (line 58) | def norm1(self):
    method norm2 (line 63) | def norm2(self):
    method forward (line 67) | def forward(self, x):
  class Bottleneck (line 97) | class Bottleneck(nn.Module):
    method __init__ (line 106) | def __init__(self,
    method make_block_plugins (line 219) | def make_block_plugins(self, in_channels, plugins):
    method forward_plugin (line 242) | def forward_plugin(self, x, plugin_names):
    method norm1 (line 250) | def norm1(self):
    method norm2 (line 255) | def norm2(self):
    method norm3 (line 260) | def norm3(self):
    method forward (line 264) | def forward(self, x):
  class ResNet (line 308) | class ResNet(nn.Module):
    method __init__ (line 373) | def __init__(self,
    method make_stage_plugins (line 470) | def make_stage_plugins(self, plugins, stage_idx):
    method make_res_layer (line 523) | def make_res_layer(self, **kwargs):
    method norm1 (line 528) | def norm1(self):
    method _make_stem_layer (line 532) | def _make_stem_layer(self, in_channels, stem_channels):
    method _freeze_stages (line 581) | def _freeze_stages(self):
    method init_weights (line 600) | def init_weights(self, pretrained=None):
    method forward (line 632) | def forward(self, x):
    method train (line 649) | def train(self, mode=True):
  class ResNetV1c (line 662) | class ResNetV1c(ResNet):
    method __init__ (line 672) | def __init__(self, **kwargs):
  class ResNetV1d (line 678) | class ResNetV1d(ResNet):
    method __init__ (line 686) | def __init__(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/backbones/resnext.py
  class Bottleneck (line 11) | class Bottleneck(_Bottleneck):
    method __init__ (line 18) | def __init__(self,
  class ResNeXt (line 87) | class ResNeXt(ResNet):
    method __init__ (line 134) | def __init__(self, groups=1, base_width=4, **kwargs):
    method make_res_layer (line 139) | def make_res_layer(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/backbones/unet.py
  class BasicConvBlock (line 13) | class BasicConvBlock(nn.Module):
    method __init__ (line 43) | def __init__(self,
    method forward (line 76) | def forward(self, x):
  class DeconvModule (line 87) | class DeconvModule(nn.Module):
    method __init__ (line 105) | def __init__(self,
    method forward (line 137) | def forward(self, x):
  class InterpConv (line 148) | class InterpConv(nn.Module):
    method __init__ (line 179) | def __init__(self,
    method forward (line 211) | def forward(self, x):
  class UNet (line 222) | class UNet(nn.Module):
    method __init__ (line 277) | def __init__(self,
    method forward (line 375) | def forward(self, x):
    method train (line 388) | def train(self, mode=True):
    method _check_input_devisible (line 398) | def _check_input_devisible(self, x):
    method init_weights (line 411) | def init_weights(self, pretrained=None):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/builder.py
  function build (line 13) | def build(cfg, registry, default_args=None):
  function build_backbone (line 36) | def build_backbone(cfg):
  function build_neck (line 41) | def build_neck(cfg):
  function build_head (line 46) | def build_head(cfg):
  function build_loss (line 51) | def build_loss(cfg):
  function build_segmentor (line 56) | def build_segmentor(cfg, train_cfg=None, test_cfg=None):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/ann_head.py
  class PPMConcat (line 10) | class PPMConcat(nn.ModuleList):
    method __init__ (line 18) | def __init__(self, pool_scales=(1, 3, 6, 8)):
    method forward (line 22) | def forward(self, feats):
  class SelfAttentionBlock (line 32) | class SelfAttentionBlock(_SelfAttentionBlock):
    method __init__ (line 52) | def __init__(self, low_in_channels, high_in_channels, channels,
  class AFNB (line 79) | class AFNB(nn.Module):
    method __init__ (line 99) | def __init__(self, low_in_channels, high_in_channels, channels,
    method forward (line 125) | def forward(self, low_feats, high_feats):
  class APNB (line 133) | class APNB(nn.Module):
    method __init__ (line 150) | def __init__(self, in_channels, channels, out_channels, query_scales,
    method forward (line 175) | def forward(self, feats):
  class ANNHead (line 184) | class ANNHead(BaseDecodeHead):
    method __init__ (line 198) | def __init__(self,
    method forward (line 236) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/apc_head.py
  class ACM (line 11) | class ACM(nn.Module):
    method __init__ (line 25) | def __init__(self, pool_scale, fusion, in_channels, channels, conv_cfg,
    method forward (line 78) | def forward(self, x):
  class APCHead (line 110) | class APCHead(BaseDecodeHead):
    method __init__ (line 124) | def __init__(self, pool_scales=(1, 2, 3, 6), fusion=True, **kwargs):
    method forward (line 149) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/aspp_head.py
  class ASPPModule (line 10) | class ASPPModule(nn.ModuleList):
    method __init__ (line 22) | def __init__(self, dilations, in_channels, channels, conv_cfg, norm_cfg,
    method forward (line 43) | def forward(self, x):
  class ASPPHead (line 53) | class ASPPHead(BaseDecodeHead):
    method __init__ (line 64) | def __init__(self, dilations=(1, 6, 12, 18), **kwargs):
    method forward (line 93) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/cascade_decode_head.py
  class BaseCascadeDecodeHead (line 6) | class BaseCascadeDecodeHead(BaseDecodeHead, metaclass=ABCMeta):
    method __init__ (line 10) | def __init__(self, *args, **kwargs):
    method forward (line 14) | def forward(self, inputs, prev_output):
    method forward_train (line 18) | def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg,
    method forward_test (line 41) | def forward_test(self, inputs, prev_output, img_metas, test_cfg):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/cc_head.py
  class CCHead (line 13) | class CCHead(FCNHead):
    method __init__ (line 24) | def __init__(self, recurrence=2, **kwargs):
    method forward (line 32) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/da_head.py
  class PAM (line 12) | class PAM(_SelfAttentionBlock):
    method __init__ (line 20) | def __init__(self, in_channels, channels):
    method forward (line 41) | def forward(self, x):
  class CAM (line 49) | class CAM(nn.Module):
    method __init__ (line 52) | def __init__(self):
    method forward (line 56) | def forward(self, x):
  class DAHead (line 75) | class DAHead(BaseDecodeHead):
    method __init__ (line 85) | def __init__(self, pam_channels, **kwargs):
    method pam_cls_seg (line 128) | def pam_cls_seg(self, feat):
    method cam_cls_seg (line 135) | def cam_cls_seg(self, feat):
    method forward (line 142) | def forward(self, inputs):
    method forward_test (line 160) | def forward_test(self, inputs, img_metas, test_cfg):
    method losses (line 164) | def losses(self, seg_logit, seg_label):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/decode_head.py
  class BaseDecodeHead (line 14) | class BaseDecodeHead(nn.Module, metaclass=ABCMeta):
    method __init__ (line 46) | def __init__(self,
    method extra_repr (line 88) | def extra_repr(self):
    method _init_inputs (line 95) | def _init_inputs(self, in_channels, in_index, input_transform):
    method init_weights (line 133) | def init_weights(self):
    method _transform_inputs (line 137) | def _transform_inputs(self, inputs):
    method forward (line 166) | def forward(self, inputs):
    method forward_train (line 170) | def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
    method forward_test (line 190) | def forward_test(self, inputs, img_metas, test_cfg):
    method cls_seg (line 207) | def cls_seg(self, feat):
    method losses (line 215) | def losses(self, seg_logit, seg_label):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/dm_head.py
  class DCM (line 10) | class DCM(nn.Module):
    method __init__ (line 24) | def __init__(self, filter_size, fusion, in_channels, channels, conv_cfg,
    method forward (line 60) | def forward(self, x):
  class DMHead (line 92) | class DMHead(BaseDecodeHead):
    method __init__ (line 106) | def __init__(self, filter_sizes=(1, 3, 5, 7), fusion=False, **kwargs):
    method forward (line 131) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/dnl_head.py
  class DisentangledNonLocal2d (line 9) | class DisentangledNonLocal2d(NonLocal2d):
    method __init__ (line 16) | def __init__(self, *arg, temperature, **kwargs):
    method embedded_gaussian (line 21) | def embedded_gaussian(self, theta_x, phi_x):
    method forward (line 33) | def forward(self, x):
  class DNLHead (line 87) | class DNLHead(FCNHead):
    method __init__ (line 102) | def __init__(self,
    method forward (line 122) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/ema_head.py
  function reduce_mean (line 13) | def reduce_mean(tensor):
  class EMAModule (line 22) | class EMAModule(nn.Module):
    method __init__ (line 31) | def __init__(self, channels, num_bases, num_stages, momentum):
    method forward (line 44) | def forward(self, feats):
  class EMAHead (line 79) | class EMAHead(BaseDecodeHead):
    method __init__ (line 94) | def __init__(self,
    method forward (line 154) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/enc_head.py
  class EncModule (line 11) | class EncModule(nn.Module):
    method __init__ (line 22) | def __init__(self, in_channels, num_codes, conv_cfg, norm_cfg, act_cfg):
    method forward (line 50) | def forward(self, x):
  class EncHead (line 62) | class EncHead(BaseDecodeHead):
    method __init__ (line 78) | def __init__(self,
    method forward (line 129) | def forward(self, inputs):
    method forward_test (line 151) | def forward_test(self, inputs, img_metas, test_cfg):
    method _convert_to_onehot_labels (line 159) | def _convert_to_onehot_labels(seg_label, num_classes):
    method losses (line 178) | def losses(self, seg_logit, seg_label):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/fcn_head.py
  class FCNHead (line 10) | class FCNHead(BaseDecodeHead):
    method __init__ (line 22) | def __init__(self,
    method forward (line 69) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/fpn_head.py
  class FPNHead (line 11) | class FPNHead(BaseDecodeHead):
    method __init__ (line 23) | def __init__(self, feature_strides, **kwargs):
    method forward (line 54) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/gc_head.py
  class GCHead (line 9) | class GCHead(FCNHead):
    method __init__ (line 23) | def __init__(self,
    method forward (line 38) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/lraspp_head.py
  class LRASPPHead (line 12) | class LRASPPHead(BaseDecodeHead):
    method __init__ (line 23) | def __init__(self, branch_channels=(32, 64), **kwargs):
    method forward (line 68) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/nl_head.py
  class NLHead (line 9) | class NLHead(FCNHead):
    method __init__ (line 23) | def __init__(self,
    method forward (line 40) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/ocr_head.py
  class SpatialGatherModule (line 12) | class SpatialGatherModule(nn.Module):
    method __init__ (line 19) | def __init__(self, scale):
    method forward (line 23) | def forward(self, feats, probs):
  class ObjectAttentionBlock (line 39) | class ObjectAttentionBlock(_SelfAttentionBlock):
    method __init__ (line 42) | def __init__(self, in_channels, channels, scale, conv_cfg, norm_cfg,
    method forward (line 73) | def forward(self, query_feats, key_feats):
  class OCRHead (line 85) | class OCRHead(BaseCascadeDecodeHead):
    method __init__ (line 97) | def __init__(self, ocr_channels, scale=1, **kwargs):
    method forward (line 119) | def forward(self, inputs, prev_output):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/point_head.py
  function calculate_uncertainty (line 14) | def calculate_uncertainty(seg_logits):
  class PointHead (line 35) | class PointHead(BaseCascadeDecodeHead):
    method __init__ (line 60) | def __init__(self,
    method init_weights (line 104) | def init_weights(self):
    method cls_seg (line 108) | def cls_seg(self, feat):
    method forward (line 115) | def forward(self, fine_grained_point_feats, coarse_point_feats):
    method _get_fine_grained_point_feats (line 123) | def _get_fine_grained_point_feats(self, x, points):
    method _get_coarse_point_feats (line 147) | def _get_coarse_point_feats(self, prev_output, points):
    method forward_train (line 165) | def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg,
    method forward_test (line 203) | def forward_test(self, inputs, prev_output, img_metas, test_cfg):
    method losses (line 248) | def losses(self, point_logits, point_label):
    method get_points_train (line 256) | def get_points_train(self, seg_logits, uncertainty_func, cfg):
    method get_points_test (line 310) | def get_points_test(self, seg_logits, uncertainty_func, cfg):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/psa_head.py
  class PSAHead (line 17) | class PSAHead(BaseDecodeHead):
    method __init__ (line 35) | def __init__(self,
    method forward (line 113) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/psp_head.py
  class PPM (line 10) | class PPM(nn.ModuleList):
    method __init__ (line 24) | def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_...
    method forward (line 46) | def forward(self, x):
  class PSPHead (line 61) | class PSPHead(BaseDecodeHead):
    method __init__ (line 72) | def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs):
    method forward (line 93) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/sep_aspp_head.py
  class DepthwiseSeparableASPPModule (line 10) | class DepthwiseSeparableASPPModule(ASPPModule):
    method __init__ (line 14) | def __init__(self, **kwargs):
  class DepthwiseSeparableASPPHead (line 29) | class DepthwiseSeparableASPPHead(ASPPHead):
    method __init__ (line 42) | def __init__(self, c1_in_channels, c1_channels, **kwargs):
    method forward (line 78) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/sep_fcn_head.py
  class DepthwiseSeparableFCNHead (line 8) | class DepthwiseSeparableFCNHead(FCNHead):
    method __init__ (line 29) | def __init__(self, **kwargs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/uper_head.py
  class UPerHead (line 12) | class UPerHead(BaseDecodeHead):
    method __init__ (line 23) | def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs):
    method psp_forward (line 76) | def psp_forward(self, inputs):
    method forward (line 86) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/losses/accuracy.py
  function accuracy (line 4) | def accuracy(pred, target, topk=1, thresh=None):
  class Accuracy (line 52) | class Accuracy(nn.Module):
    method __init__ (line 55) | def __init__(self, topk=(1, ), thresh=None):
    method forward (line 68) | def forward(self, pred, target):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/losses/cross_entropy_loss.py
  function cross_entropy (line 9) | def cross_entropy(pred,
  function _expand_onehot_labels (line 35) | def _expand_onehot_labels(labels, label_weights, target_shape, ignore_in...
  function binary_cross_entropy (line 57) | def binary_cross_entropy(pred,
  function mask_cross_entropy (line 100) | def mask_cross_entropy(pred,
  class CrossEntropyLoss (line 139) | class CrossEntropyLoss(nn.Module):
    method __init__ (line 154) | def __init__(self,
    method forward (line 175) | def forward(self,

FILE: downstream_tasks/semantic_segmentation/mmseg/models/losses/lovasz_loss.py
  function lovasz_grad (line 14) | def lovasz_grad(gt_sorted):
  function flatten_binary_logits (line 29) | def flatten_binary_logits(logits, labels, ignore_index=None):
  function flatten_probs (line 42) | def flatten_probs(probs, labels, ignore_index=None):
  function lovasz_hinge_flat (line 59) | def lovasz_hinge_flat(logits, labels):
  function lovasz_hinge (line 83) | def lovasz_hinge(logits,
  function lovasz_softmax_flat (line 128) | def lovasz_softmax_flat(probs, labels, classes='present', class_weight=N...
  function lovasz_softmax (line 171) | def lovasz_softmax(probs,
  class LovaszLoss (line 225) | class LovaszLoss(nn.Module):
    method __init__ (line 248) | def __init__(self,
    method forward (line 274) | def forward(self,

FILE: downstream_tasks/semantic_segmentation/mmseg/models/losses/utils.py
  function reduce_loss (line 6) | def reduce_loss(loss, reduction):
  function weight_reduce_loss (line 26) | def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=N...
  function weighted_loss (line 58) | def weighted_loss(loss_func):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/necks/fpn.py
  class FPN (line 9) | class FPN(nn.Module):
    method __init__ (line 63) | def __init__(self,
    method init_weights (line 157) | def init_weights(self):
    method forward (line 162) | def forward(self, inputs):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/segmentors/base.py
  class BaseSegmentor (line 14) | class BaseSegmentor(nn.Module):
    method __init__ (line 19) | def __init__(self):
    method with_neck (line 24) | def with_neck(self):
    method with_auxiliary_head (line 29) | def with_auxiliary_head(self):
    method with_decode_head (line 35) | def with_decode_head(self):
    method extract_feat (line 40) | def extract_feat(self, imgs):
    method encode_decode (line 45) | def encode_decode(self, img, img_metas):
    method forward_train (line 51) | def forward_train(self, imgs, img_metas, **kwargs):
    method simple_test (line 56) | def simple_test(self, img, img_meta, **kwargs):
    method aug_test (line 61) | def aug_test(self, imgs, img_metas, **kwargs):
    method init_weights (line 65) | def init_weights(self, pretrained=None):
    method forward_test (line 76) | def forward_test(self, imgs, img_metas, **kwargs):
    method forward (line 111) | def forward(self, img, img_metas, return_loss=True, **kwargs):
    method train_step (line 126) | def train_step(self, data_batch, optimizer, **kwargs):
    method val_step (line 162) | def val_step(self, data_batch, **kwargs):
    method _parse_losses (line 173) | def _parse_losses(losses):
    method show_result (line 208) | def show_result(self,

FILE: downstream_tasks/semantic_segmentation/mmseg/models/segmentors/cascade_encoder_decoder.py
  class CascadeEncoderDecoder (line 11) | class CascadeEncoderDecoder(EncoderDecoder):
    method __init__ (line 19) | def __init__(self,
    method _init_decode_head (line 38) | def _init_decode_head(self, decode_head):
    method init_weights (line 48) | def init_weights(self, pretrained=None):
    method encode_decode (line 65) | def encode_decode(self, img, img_metas):
    method _decode_head_forward_train (line 80) | def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/segmentors/encoder_decoder.py
  class EncoderDecoder (line 13) | class EncoderDecoder(BaseSegmentor):
    method __init__ (line 21) | def __init__(self,
    method _init_decode_head (line 43) | def _init_decode_head(self, decode_head):
    method _init_auxiliary_head (line 49) | def _init_auxiliary_head(self, auxiliary_head):
    method init_weights (line 59) | def init_weights(self, pretrained=None):
    method extract_feat (line 77) | def extract_feat(self, img):
    method encode_decode (line 84) | def encode_decode(self, img, img_metas):
    method _decode_head_forward_train (line 96) | def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg):
    method _decode_head_forward_test (line 107) | def _decode_head_forward_test(self, x, img_metas):
    method _auxiliary_head_forward_train (line 113) | def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg):
    method forward_dummy (line 130) | def forward_dummy(self, img):
    method forward_train (line 136) | def forward_train(self, img, img_metas, gt_semantic_seg):
    method slide_inference (line 169) | def slide_inference(self, img, img_meta, rescale):
    method whole_inference (line 214) | def whole_inference(self, img, img_meta, rescale):
    method inference (line 228) | def inference(self, img, img_meta, rescale):
    method simple_test (line 263) | def simple_test(self, img, img_meta, rescale=True):
    method aug_test (line 276) | def aug_test(self, imgs, img_metas, rescale=True):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/utils/inverted_residual.py
  class InvertedResidual (line 8) | class InvertedResidual(nn.Module):
    method __init__ (line 31) | def __init__(self,
    method forward (line 81) | def forward(self, x):
  class InvertedResidualV3 (line 97) | class InvertedResidualV3(nn.Module):
    method __init__ (line 124) | def __init__(self,
    method forward (line 183) | def forward(self, x):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/utils/make_divisible.py
  function make_divisible (line 1) | def make_divisible(value, divisor, min_value=None, min_ratio=0.9):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/utils/res_layer.py
  class ResLayer (line 5) | class ResLayer(nn.Sequential):
    method __init__ (line 26) | def __init__(self,

FILE: downstream_tasks/semantic_segmentation/mmseg/models/utils/se_layer.py
  class SELayer (line 8) | class SELayer(nn.Module):
    method __init__ (line 26) | def __init__(self,
    method forward (line 53) | def forward(self, x):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/utils/self_attention_block.py
  class SelfAttentionBlock (line 7) | class SelfAttentionBlock(nn.Module):
    method __init__ (line 32) | def __init__(self, key_in_channels, query_in_channels, channels,
    method init_weights (line 93) | def init_weights(self):
    method build_project (line 99) | def build_project(self, in_channels, channels, num_convs, use_conv_mod...
    method forward (line 131) | def forward(self, query_feats, key_feats):

FILE: downstream_tasks/semantic_segmentation/mmseg/models/utils/up_conv_block.py
  class UpConvBlock (line 6) | class UpConvBlock(nn.Module):
    method __init__ (line 44) | def __init__(self,
    method forward (line 94) | def forward(self, skip, x):

FILE: downstream_tasks/semantic_segmentation/mmseg/ops/encoding.py
  class Encoding (line 6) | class Encoding(nn.Module):
    method __init__ (line 17) | def __init__(self, channels, num_codes):
    method scaled_l2 (line 33) | def scaled_l2(x, codewords, scale):
    method aggregate (line 46) | def aggregate(assigment_weights, x, codewords):
    method forward (line 57) | def forward(self, x):
    method __repr__ (line 70) | def __repr__(self):

FILE: downstream_tasks/semantic_segmentation/mmseg/ops/wrappers.py
  function resize (line 8) | def resize(input,
  class Upsample (line 32) | class Upsample(nn.Module):
    method __init__ (line 34) | def __init__(self,
    method forward (line 48) | def forward(self, x):

FILE: downstream_tasks/semantic_segmentation/mmseg/utils/collect_env.py
  function collect_env (line 7) | def collect_env():

FILE: downstream_tasks/semantic_segmentation/mmseg/utils/logger.py
  function get_root_logger (line 6) | def get_root_logger(log_file=None, log_level=logging.INFO):

FILE: downstream_tasks/semantic_segmentation/mmseg/version.py
  function parse_version_info (line 6) | def parse_version_info(version_str):

FILE: downstream_tasks/semantic_segmentation/tools/test.py
  function parse_args (line 17) | def parse_args():
  function main (line 67) | def main():

FILE: downstream_tasks/semantic_segmentation/tools/train.py
  function parse_args (line 26) | def parse_args():
  function main (line 70) | def main():

FILE: furnace/dataset_folder.py
  function has_file_allowed_extension (line 11) | def has_file_allowed_extension(filename: str, extensions: Tuple[str, ......
  function is_image_file (line 24) | def is_image_file(filename: str) -> bool:
  function make_dataset (line 36) | def make_dataset(
  class DatasetFolder (line 66) | class DatasetFolder(VisionDataset):
    method __init__ (line 98) | def __init__(
    method _find_classes (line 125) | def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]:
    method __getitem__ (line 143) | def __getitem__(self, index: int) -> Tuple[Any, Any]:
    method __len__ (line 167) | def __len__(self) -> int:
  function pil_loader (line 174) | def pil_loader(path: str) -> Image.Image:
  function accimage_loader (line 182) | def accimage_loader(path: str) -> Any:
  function default_loader (line 191) | def default_loader(path: str) -> Any:
  class ImageFolder (line 199) | class ImageFolder(DatasetFolder):
    method __init__ (line 226) | def __init__(

FILE: furnace/datasets.py
  function preprocess_vqgan (line 15) | def preprocess_vqgan(x):
  class DataAugmentationForCAE (line 19) | class DataAugmentationForCAE(object):
    method __init__ (line 20) | def __init__(self, args):
    method __call__ (line 85) | def __call__(self, image):
    method __repr__ (line 92) | def __repr__(self):
  function build_cae_pretraining_dataset (line 101) | def build_cae_pretraining_dataset(args):
  function build_dataset (line 107) | def build_dataset(is_train, args):
  function build_transform (line 141) | def build_transform(is_train, args):

FILE: furnace/engine_for_finetuning.py
  function train_class_batch (line 14) | def train_class_batch(model, samples, target, criterion):
  function get_loss_scale_for_deepspeed (line 20) | def get_loss_scale_for_deepspeed(model):
  function train_one_epoch (line 25) | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
  function evaluate (line 150) | def evaluate(data_loader, model, device):

FILE: furnace/engine_for_pretraining.py
  function loss_selector (line 13) | def loss_selector(loss_type, pred, target):
  function train_one_epoch (line 19) | def train_one_epoch(model: torch.nn.Module, d_vae: torch.nn.Module,

FILE: furnace/masking_generator.py
  class MaskingGenerator (line 5) | class MaskingGenerator:
    method __init__ (line 6) | def __init__(
    method __repr__ (line 22) | def __repr__(self):
    method get_shape (line 28) | def get_shape(self):
    method _mask (line 31) | def _mask(self, mask, max_mask_patches):
    method __call__ (line 55) | def __call__(self):
  class RandomMaskingGenerator (line 68) | class RandomMaskingGenerator:
    method __init__ (line 69) | def __init__(
    method __repr__ (line 78) | def __repr__(self):
    method __call__ (line 85) | def __call__(self):

FILE: furnace/optim_factory.py
  function get_num_layer_for_vit (line 24) | def get_num_layer_for_vit(var_name, num_max_layer):
  class LayerDecayValueAssigner (line 38) | class LayerDecayValueAssigner(object):
    method __init__ (line 39) | def __init__(self, values):
    method get_scale (line 42) | def get_scale(self, layer_id):
    method get_layer_id (line 45) | def get_layer_id(self, var_name):
  function get_parameter_groups (line 49) | def get_parameter_groups(model, weight_decay=1e-5, skip_list=(), get_num...
  function create_optimizer (line 91) | def create_optimizer(args, model, get_num_layer=None, get_layer_scale=No...

FILE: furnace/transforms.py
  class ToNumpy (line 10) | class ToNumpy:
    method __call__ (line 12) | def __call__(self, pil_img):
  class ToTensor (line 20) | class ToTensor:
    method __init__ (line 22) | def __init__(self, dtype=torch.float32):
    method __call__ (line 25) | def __call__(self, pil_img):
  function _pil_interp (line 43) | def _pil_interp(method):
  class RandomResizedCropAndInterpolationWithTwoPic (line 58) | class RandomResizedCropAndInterpolationWithTwoPic:
    method __init__ (line 73) | def __init__(self, size, second_size=None, scale=(0.08, 1.0), ratio=(3...
    method get_params (line 98) | def get_params(img, scale, ratio):
    method __call__ (line 140) | def __call__(self, img):
    method __repr__ (line 159) | def __repr__(self):

FILE: furnace/utils.py
  class SmoothedValue (line 34) | class SmoothedValue(object):
    method __init__ (line 39) | def __init__(self, window_size=20, fmt=None):
    method update (line 47) | def update(self, value, n=1):
    method synchronize_between_processes (line 52) | def synchronize_between_processes(self):
    method median (line 66) | def median(self):
    method avg (line 71) | def avg(self):
    method global_avg (line 76) | def global_avg(self):
    method max (line 80) | def max(self):
    method value (line 84) | def value(self):
    method __str__ (line 87) | def __str__(self):
  class MetricLogger (line 96) | class MetricLogger(object):
    method __init__ (line 97) | def __init__(self, delimiter="\t"):
    method update (line 101) | def update(self, **kwargs):
    method __getattr__ (line 110) | def __getattr__(self, attr):
    method __str__ (line 118) | def __str__(self):
    method synchronize_between_processes (line 126) | def synchronize_between_processes(self):
    method add_meter (line 130) | def add_meter(self, name, meter):
    method log_every (line 133) | def log_every(self, iterable, print_freq, header=None):
  class TensorboardLogger (line 180) | class TensorboardLogger(object):
    method __init__ (line 181) | def __init__(self, log_dir):
    method set_step (line 185) | def set_step(self, step=None):
    method update (line 191) | def update(self, head='scalar', step=None, **kwargs):
    method flush (line 200) | def flush(self):
  function _load_checkpoint_for_ema (line 204) | def _load_checkpoint_for_ema(model_ema, checkpoint):
  function setup_for_distributed_each_gpu (line 213) | def setup_for_distributed_each_gpu(rank):
  function setup_for_distributed (line 225) | def setup_for_distributed(is_master):
  function is_dist_avail_and_initialized (line 242) | def is_dist_avail_and_initialized():
  function get_world_size (line 250) | def get_world_size():
  function get_rank (line 256) | def get_rank():
  function is_main_process (line 262) | def is_main_process():
  function save_on_master (line 266) | def save_on_master(*args, **kwargs):
  function init_distributed_mode (line 271) | def init_distributed_mode(args):
  function load_state_dict (line 308) | def load_state_dict(model, state_dict, prefix='', ignore_missing="relati...
  class NativeScalerWithGradNormCount (line 357) | class NativeScalerWithGradNormCount:
    method __init__ (line 360) | def __init__(self):
    method __call__ (line 363) | def __call__(self, loss, optimizer, clip_grad=None, parameters=None, c...
    method state_dict (line 379) | def state_dict(self):
    method load_state_dict (line 382) | def load_state_dict(self, state_dict):
  function get_grad_norm_ (line 386) | def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
  function cosine_scheduler (line 401) | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warm...
  function save_model (line 421) | def save_model(args, epoch, model, model_without_ddp, optimizer, loss_sc...
  function auto_load_model (line 459) | def auto_load_model(args, model, model_without_ddp, optimizer, loss_scal...
  function create_d_vae (line 529) | def create_d_vae(weight_path, d_vae_type, image_size, device, args=None):
  function get_vqgan_gumbel_f8_8192 (line 541) | def get_vqgan_gumbel_f8_8192(weight_path, image_size, device):
  function get_dalle_vae (line 549) | def get_dalle_vae(weight_path, image_size, device):
  function get_d_vae (line 555) | def get_d_vae(weight_path, image_size, device, args):
  function create_ds_config (line 575) | def create_ds_config(args):
  class LP_BatchNorm (line 607) | class LP_BatchNorm(_NormBase):
    method __init__ (line 614) | def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True,
    method _check_input_dim (line 619) | def _check_input_dim(self, input):
    method forward (line 624) | def forward(self, input, is_train):

FILE: linear_util/crop.py
  class RandomResizedCrop (line 9) | class RandomResizedCrop(transforms.RandomResizedCrop):
    method get_params (line 17) | def get_params(img, scale, ratio):

FILE: linear_util/datasets.py
  class DataAugmentationMySelf (line 10) | class DataAugmentationMySelf(object):
    method __init__ (line 11) | def __init__(self, args):
    method __call__ (line 29) | def __call__(self, image):
    method __repr__ (line 32) | def __repr__(self):
  function build_dataset (line 39) | def build_dataset(is_train, args):
  function build_dataset_finetune (line 49) | def build_dataset_finetune(is_train, args):
  function build_transform_finetune (line 59) | def build_transform_finetune(is_train, args):
  function build_transform (line 95) | def build_transform(is_train, args):

FILE: linear_util/engine_finetune.py
  function train_one_epoch (line 14) | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
  function evaluate (line 88) | def evaluate(data_loader, model, device):

FILE: linear_util/lars.py
  class LARS (line 4) | class LARS(torch.optim.Optimizer):
    method __init__ (line 8) | def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, trust_c...
    method step (line 13) | def step(self):

FILE: linear_util/lr_decay.py
  function param_groups_lrd (line 4) | def param_groups_lrd(model, weight_decay=0.05, no_weight_decay_list=[], ...
  function get_layer_id_for_vit (line 49) | def get_layer_id_for_vit(name, num_layers):

FILE: linear_util/lr_sched.py
  function adjust_learning_rate (line 3) | def adjust_learning_rate(optimizer, epoch, args):

FILE: linear_util/misc.py
  class SmoothedValue (line 13) | class SmoothedValue(object):
    method __init__ (line 18) | def __init__(self, window_size=20, fmt=None):
    method update (line 26) | def update(self, value, n=1):
    method synchronize_between_processes (line 31) | def synchronize_between_processes(self):
    method median (line 45) | def median(self):
    method avg (line 50) | def avg(self):
    method global_avg (line 55) | def global_avg(self):
    method max (line 59) | def max(self):
    method value (line 63) | def value(self):
    method __str__ (line 66) | def __str__(self):
  class MetricLogger (line 75) | class MetricLogger(object):
    method __init__ (line 76) | def __init__(self, delimiter="\t"):
    method update (line 80) | def update(self, **kwargs):
    method __getattr__ (line 89) | def __getattr__(self, attr):
    method __str__ (line 97) | def __str__(self):
    method synchronize_between_processes (line 105) | def synchronize_between_processes(self):
    method add_meter (line 109) | def add_meter(self, name, meter):
    method log_every (line 112) | def log_every(self, iterable, print_freq, header=None):
  function setup_for_distributed (line 159) | def setup_for_distributed(is_master):
  function is_dist_avail_and_initialized (line 176) | def is_dist_avail_and_initialized():
  function get_world_size (line 184) | def get_world_size():
  function get_rank (line 190) | def get_rank():
  function is_main_process (line 196) | def is_main_process():
  function save_on_master (line 200) | def save_on_master(*args, **kwargs):
  function init_distributed_mode (line 205) | def init_distributed_mode(args):
  class NativeScalerWithGradNormCount (line 240) | class NativeScalerWithGradNormCount:
    method __init__ (line 243) | def __init__(self):
    method __call__ (line 246) | def __call__(self, loss, optimizer, clip_grad=None, parameters=None, c...
    method state_dict (line 262) | def state_dict(self):
    method load_state_dict (line 265) | def load_state_dict(self, state_dict):
  function get_grad_norm_ (line 269) | def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
  function save_model (line 284) | def save_model(args, epoch, model, model_without_ddp, optimizer, loss_sc...
  function load_model (line 311) | def load_model(args, model_without_ddp, optimizer, loss_scaler):
  function all_reduce_mean (line 330) | def all_reduce_mean(x):

FILE: linear_util/pos_embed.py
  function get_2d_sincos_pos_embed (line 5) | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
  function get_2d_sincos_pos_embed_from_grid (line 23) | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
  function get_1d_sincos_pos_embed_from_grid (line 34) | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
  function interpolate_pos_embed (line 60) | def interpolate_pos_embed(model, checkpoint_model):

FILE: models/modeling_cae.py
  function trunc_normal_ (line 12) | def trunc_normal_(tensor, mean=0., std=1.):
  class VisionTransformerForMaskedImageModeling (line 16) | class VisionTransformerForMaskedImageModeling(nn.Module):
    method __init__ (line 17) | def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size...
    method _init_teacher (line 62) | def _init_teacher(self):
    method momentum_update (line 69) | def momentum_update(self, base_momentum=0):
    method _init_weights (line 76) | def _init_weights(self, m):
    method forward (line 90) | def forward(self, x, bool_masked_pos, return_all_tokens=None):
  function cae_small_patch16_224_8k_vocab (line 144) | def cae_small_patch16_224_8k_vocab(pretrained=False, **kwargs):
  function cae_base_patch16_224_8k_vocab (line 158) | def cae_base_patch16_224_8k_vocab(pretrained=False, **kwargs):
  function cae_large_patch16_224_8k_vocab (line 172) | def cae_large_patch16_224_8k_vocab(pretrained=False, **kwargs):

FILE: models/modeling_cae_helper.py
  function trunc_normal_ (line 12) | def trunc_normal_(tensor, mean=0., std=1.):
  class Attention (line 15) | class Attention(nn.Module):
    method __init__ (line 16) | def __init__(
    method forward (line 39) | def forward(self, x, bool_masked_pos=None):
  class CrossAttention (line 65) | class CrossAttention(nn.Module):
    method __init__ (line 66) | def __init__(
    method forward (line 93) | def forward(self, x, bool_masked_pos=None, k=None, v=None):
  class Block (line 126) | class Block(nn.Module):
    method __init__ (line 128) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method forward (line 147) | def forward(self, x, bool_masked_pos=None):
  class RegressorBlock (line 158) | class RegressorBlock(nn.Module):
    method __init__ (line 159) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method forward (line 183) | def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos):
  class VisionTransformerEncoder (line 195) | class VisionTransformerEncoder(nn.Module):
    method __init__ (line 196) | def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size...
    method build_2d_sincos_position_embedding (line 233) | def build_2d_sincos_position_embedding(self, embed_dim=768, temperatur...
    method fix_init_weight (line 254) | def fix_init_weight(self):
    method _init_weights (line 262) | def _init_weights(self, m):
    method no_weight_decay (line 276) | def no_weight_decay(self):
    method get_num_layers (line 279) | def get_num_layers(self):
    method forward_features (line 282) | def forward_features(self, x, bool_masked_pos):
    method forward (line 307) | def forward(self, x, bool_masked_pos, return_all_tokens=False):
  class VisionTransformerNeck (line 314) | class VisionTransformerNeck(nn.Module):
    method __init__ (line 315) | def __init__(self, patch_size=16, num_classes=8192, embed_dim=768, dep...
    method fix_init_weight (line 353) | def fix_init_weight(self):
    method _init_weights (line 365) | def _init_weights(self, m):
    method no_weight_decay (line 379) | def no_weight_decay(self):
    method forward (line 382) | def forward(self, x_masked, x_unmasked, pos_embed_masked, pos_embed_un...

FILE: models/modeling_discrete_vae.py
  function top_k (line 19) | def top_k(logits, thres = 0.5):
  function exists (line 28) | def exists(val):
  function default (line 32) | def default(val, d):
  function eval_decorator (line 36) | def eval_decorator(fn):
  class BasicVAE (line 46) | class BasicVAE(nn.Module):
    method get_codebook_indices (line 48) | def get_codebook_indices(self, images):
    method decode (line 51) | def decode(self, img_seq):
    method get_codebook_probs (line 54) | def get_codebook_probs(self, img_seq):
    method get_image_tokens_size (line 57) | def get_image_tokens_size(self):
    method get_image_size (line 60) | def get_image_size(self):
  class ResBlock (line 65) | class ResBlock(nn.Module):
    method __init__ (line 66) | def __init__(self, chan):
    method forward (line 76) | def forward(self, x):
  class DiscreteVAE (line 82) | class DiscreteVAE(BasicVAE):
    method __init__ (line 83) | def __init__(
    method get_image_size (line 144) | def get_image_size(self):
    method get_image_tokens_size (line 147) | def get_image_tokens_size(self):
    method get_codebook_indices (line 152) | def get_codebook_indices(self, images):
    method get_codebook_probs (line 159) | def get_codebook_probs(self, images, temp):
    method decode (line 163) | def decode(
    method forward (line 175) | def forward(
  class Dalle_VAE (line 220) | class Dalle_VAE(BasicVAE):
    method __init__ (line 221) | def __init__(self, image_size):
    method load_model (line 227) | def load_model(self, model_dir, device):
    method decode (line 231) | def decode(self, img_seq):
    method get_codebook_indices (line 237) | def get_codebook_indices(self, images):
    method get_codebook_probs (line 241) | def get_codebook_probs(self, images):
    method forward (line 245) | def forward(self, img_seq_prob, no_process=False):
  class VGGAN (line 254) | class VGGAN(BasicVAE):
    method __init__ (line 255) | def __init__(self, image_size):
    method load_model (line 261) | def load_model(self, weight_path, device):
    method get_codebook_indices (line 264) | def get_codebook_indices(self, images):

FILE: models/modeling_finetune.py
  function _cfg (line 12) | def _cfg(url='', **kwargs):
  class DropPath (line 22) | class DropPath(nn.Module):
    method __init__ (line 25) | def __init__(self, drop_prob=None):
    method forward (line 29) | def forward(self, x):
    method extra_repr (line 32) | def extra_repr(self) -> str:
  class Mlp (line 36) | class Mlp(nn.Module):
    method __init__ (line 37) | def __init__(self, in_features, hidden_features=None, out_features=Non...
    method forward (line 46) | def forward(self, x):
  class Attention (line 55) | class Attention(nn.Module):
    method __init__ (line 56) | def __init__(
    method forward (line 109) | def forward(self, x, rel_pos_bias=None):
  class CrossAttention (line 141) | class CrossAttention(nn.Module):
    method __init__ (line 142) | def __init__(
    method forward (line 169) | def forward(self, x, bool_masked_pos=None, k=None, v=None):
  class Block (line 201) | class Block(nn.Module):
    method __init__ (line 203) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method forward (line 223) | def forward(self, x, rel_pos_bias=None):
  class AttentiveBlock (line 232) | class AttentiveBlock(nn.Module):
    method __init__ (line 234) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method forward (line 249) | def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bi...
  class PatchEmbed (line 258) | class PatchEmbed(nn.Module):
    method __init__ (line 261) | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=...
    method forward (line 273) | def forward(self, x, **kwargs):
  class RelativePositionBias (line 282) | class RelativePositionBias(nn.Module):
    method __init__ (line 284) | def __init__(self, window_size, num_heads):
    method forward (line 313) | def forward(self):
  function get_sinusoid_encoding_table (line 320) | def get_sinusoid_encoding_table(n_position, d_hid, token=False):
  class VisionTransformer (line 335) | class VisionTransformer(nn.Module):
    method __init__ (line 338) | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classe...
    method build_2d_sincos_position_embedding (line 413) | def build_2d_sincos_position_embedding(self, embed_dim=768, temperatur...
    method fix_init_weight (line 434) | def fix_init_weight(self):
    method _init_weights (line 442) | def _init_weights(self, m):
    method get_num_layers (line 451) | def get_num_layers(self):
    method no_weight_decay (line 455) | def no_weight_decay(self):
    method get_classifier (line 458) | def get_classifier(self):
    method reset_classifier (line 461) | def reset_classifier(self, num_classes, global_pool=''):
    method forward_features (line 465) | def forward_features(self, x, is_train=True):
    method forward (line 501) | def forward(self, x, is_train=True):
  function cae_small_patch16_224 (line 508) | def cae_small_patch16_224(pretrained=False, **kwargs):
  function cae_base_patch16_224 (line 516) | def cae_base_patch16_224(pretrained=False, **kwargs):
  function cae_base_patch16_384 (line 525) | def cae_base_patch16_384(pretrained=False, **kwargs):
  function cae_large_patch16_224 (line 534) | def cae_large_patch16_224(pretrained=False, **kwargs):
  function cae_large_patch16_384 (line 543) | def cae_large_patch16_384(pretrained=False, **kwargs):
  function cae_large_patch16_512 (line 552) | def cae_large_patch16_512(pretrained=False, **kwargs):

FILE: tools/run_attentive.py
  function get_args (line 26) | def get_args():
  function main (line 216) | def main(args, ds_init):

FILE: tools/run_class_finetuning.py
  function get_args (line 27) | def get_args():
  function main (line 216) | def main(args, ds_init):

FILE: tools/run_linear.py
  function setup_for_distributed (line 31) | def setup_for_distributed(rank):
  function get_args_parser (line 44) | def get_args_parser():
  function main (line 144) | def main(args):

FILE: tools/run_pretraining.py
  function get_args (line 23) | def get_args():
  function get_model (line 167) | def get_model(args):
  function main (line 182) | def main(args):

Download .json

Condensed preview — 236 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,278K chars).

[
  {
    "path": ".gitignore",
    "chars": 10,
    "preview": ".DS_Store\n"
  },
  {
    "path": "README.md",
    "chars": 6298,
    "preview": "# CAE: Context AutoEncoder for Self-Supervised Representation Learning \n\n<p align=\"center\">\n  <img src='furnace/CAE.png'"
  },
  {
    "path": "dall_e/__init__.py",
    "chars": 595,
    "preview": "import io, requests\nimport torch\nimport torch.nn as nn\n\nfrom dall_e.encoder import Encoder\nfrom dall_e.decoder import De"
  },
  {
    "path": "dall_e/decoder.py",
    "chars": 3936,
    "preview": "import attr\nimport numpy as np\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom collections  im"
  },
  {
    "path": "dall_e/encoder.py",
    "chars": 3775,
    "preview": "import attr\nimport numpy as np\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom collections  im"
  },
  {
    "path": "dall_e/utils.py",
    "chars": 1771,
    "preview": "import attr\nimport math\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nlogit_laplace_eps: float = "
  },
  {
    "path": "downstream_tasks/detection/README.md",
    "chars": 3049,
    "preview": "\n# COCO Detection and Instance segmentation with CAE\n\n# Installation\n\nPlease install [PyTorch](https://pytorch.org/). Th"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/_base_/datasets/coco_instance.py",
    "chars": 1722,
    "preview": "dataset_type = 'CocoDataset'\ndata_root = '/path/to/coco/'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103.53], std=["
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/_base_/default_runtime.py",
    "chars": 367,
    "preview": "checkpoint_config = dict(interval=1)\n# yapf:disable\nlog_config = dict(\n    interval=50,\n    hooks=[\n        dict(type='T"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py",
    "chars": 6942,
    "preview": "ettings\nmodel = dict(\n    type='CascadeRCNN',\n    backbone=dict(\n        type='ResNet',\n        depth=50,\n        num_st"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/cascade_mask_rcnn_swin_fpn.py",
    "chars": 7188,
    "preview": "# model settings\nmodel = dict(\n    type='CascadeRCNN',\n    pretrained=None,\n    backbone=dict(\n        type='SwinTransfo"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/cascade_mask_rcnn_vit_fpn.py",
    "chars": 7083,
    "preview": "# model settings\nmodel = dict(\n    type='CascadeRCNN',\n    pretrained=None,\n    backbone=dict(\n        type='VisionTrans"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/mask_rcnn_r50_fpn.py",
    "chars": 4055,
    "preview": "# model settings\nmodel = dict(\n    type='MaskRCNN',\n    backbone=dict(\n        type='ResNet',\n        depth=50,\n        "
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/_base_/models/mask_rcnn_vit_fpn.py",
    "chars": 4185,
    "preview": "# model settings\nmodel = dict(\n    type='MaskRCNN',\n    pretrained=None,\n    backbone=dict(\n        type='VisionTransfor"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/_base_/schedules/schedule_1x.py",
    "chars": 318,
    "preview": "# optimizer\noptimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)\noptimizer_config = dict(grad_clip=N"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/mask_rcnn/vit_base_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00003.py",
    "chars": 4275,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/configs/mask_rcnn/vit_large_giou_4conv1f_coco_maskrcnn_1x_cae_sincos_init0.1_lr00002_lrdr0.85_dp0.2.py",
    "chars": 4345,
    "preview": "#Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the lic"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/mmcv_custom/__init__.py",
    "chars": 257,
    "preview": "# -*- coding: utf-8 -*-\n\nfrom .checkpoint import load_checkpoint\nfrom .layer_decay_optimizer_constructor import LayerDec"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/mmcv_custom/checkpoint.py",
    "chars": 29115,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/mmcv_custom/layer_decay_optimizer_constructor.py",
    "chars": 4020,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/mmcv_custom/prepare_rpe.py",
    "chars": 3089,
    "preview": "import torch\n\nimport numpy as np\nfrom scipy import interpolate\n\nfrom mmcv.runner import get_dist_info\nimport torch.nn as"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/mmcv_custom/register_backbone.py",
    "chars": 9507,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/mmcv_custom/runner/__init__.py",
    "chars": 202,
    "preview": "\n# Copyright (c) Open-MMLab. All rights reserved.\nfrom .checkpoint import save_checkpoint\nfrom .epoch_based_runner impor"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/mmcv_custom/runner/checkpoint.py",
    "chars": 3103,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/mmcv_custom/runner/epoch_based_runner.py",
    "chars": 4174,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/test.py",
    "chars": 8971,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/evaluation/object_detection/train.py",
    "chars": 7206,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/loader.py",
    "chars": 4767,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/models/__init__.py",
    "chars": 183,
    "preview": "from .vision_transformer import VisionTransformer, vit_tiny, vit_small, vit_base, vit_large\nfrom .swin_transformer impor"
  },
  {
    "path": "downstream_tasks/detection/models/head.py",
    "chars": 7543,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/models/swin_transformer.py",
    "chars": 34888,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/models/vision_transformer.py",
    "chars": 15306,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/detection/scripts/run_eval.sh",
    "chars": 286,
    "preview": "#!/usr/bin/env bash\n\necho \"EVAL MODEL:\"$MODEL\npython -m torch.distributed.launch --nproc_per_node=8 \\\n    evaluation/obj"
  },
  {
    "path": "downstream_tasks/detection/scripts/run_train_maskrcnn_vit_base.sh",
    "chars": 541,
    "preview": "#!/usr/bin/env bash\n\npython -m torch.distributed.launch --nproc_per_node=8 \\\n    --nnodes=$NNODES \\\n    --node_rank=$RAN"
  },
  {
    "path": "downstream_tasks/detection/scripts/run_train_maskrcnn_vit_large.sh",
    "chars": 554,
    "preview": "#!/usr/bin/env bash\n\npython -m torch.distributed.launch --nproc_per_node=8 \\\n    --nnodes=$NNODES \\\n    --node_rank=$RAN"
  },
  {
    "path": "downstream_tasks/detection/utils.py",
    "chars": 29583,
    "preview": "# Copyright (c) ByteDance, Inc. and its affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the li"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/README.md",
    "chars": 3093,
    "preview": "# ADE20k Semantic segmentation with CAE\n\n## Getting started \n\n1. Install the [mmsegmentation](https://github.com/open-mm"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/backbone/beit.py",
    "chars": 22062,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/backbone/beit_fapn.py",
    "chars": 22090,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/backbone/cae.py",
    "chars": 22061,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/backbone/fapn.py",
    "chars": 1866,
    "preview": "class FeatureSelectionModule(nn.Module):\n    def __init__(self, in_c, out_c, norm=\"GM\"):\n        super(FeatureSelectionM"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/backbone/mae.py",
    "chars": 21706,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/ade20k.py",
    "chars": 1844,
    "preview": "# dataset settings\ndataset_type = 'ADE20KDataset'\ndata_root = 'data/ade/ADEChallengeData2016'\nimg_norm_cfg = dict(\n    m"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/ade20k_640x640.py",
    "chars": 1844,
    "preview": "# dataset settings\ndataset_type = 'ADE20KDataset'\ndata_root = 'data/ade/ADEChallengeData2016'\nimg_norm_cfg = dict(\n    m"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/chase_db1.py",
    "chars": 1924,
    "preview": "# dataset settings\ndataset_type = 'ChaseDB1Dataset'\ndata_root = 'data/CHASE_DB1'\nimg_norm_cfg = dict(\n    mean=[123.675,"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/cityscapes.py",
    "chars": 1780,
    "preview": "# dataset settings\ndataset_type = 'CityscapesDataset'\ndata_root = 'data/cityscapes/'\nimg_norm_cfg = dict(\n    mean=[123."
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/cityscapes_769x769.py",
    "chars": 1281,
    "preview": "_base_ = './cityscapes.py'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb="
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/coco-stuff10k.py",
    "chars": 1925,
    "preview": "# dataset settings\ndataset_type = 'COCOStuffDataset'\ndata_root = 'data/coco_stuff10k'\nimg_norm_cfg = dict(\n    mean=[123"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/drive.py",
    "chars": 1915,
    "preview": "# dataset settings\ndataset_type = 'DRIVEDataset'\ndata_root = 'data/DRIVE'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/hrf.py",
    "chars": 1915,
    "preview": "# dataset settings\ndataset_type = 'HRFDataset'\ndata_root = 'data/HRF'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 10"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/pascal_context.py",
    "chars": 1998,
    "preview": "# dataset settings\ndataset_type = 'PascalContextDataset'\ndata_root = 'data/VOCdevkit/VOC2010/'\nimg_norm_cfg = dict(\n    "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/pascal_voc12.py",
    "chars": 1930,
    "preview": "# dataset settings\ndataset_type = 'PascalVOCDataset'\ndata_root = 'data/VOCdevkit/VOC2012'\nimg_norm_cfg = dict(\n    mean="
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/pascal_voc12_aug.py",
    "chars": 261,
    "preview": "_base_ = './pascal_voc12.py'\n# dataset settings\ndata = dict(\n    train=dict(\n        ann_dir=['SegmentationClass', 'Segm"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/datasets/stare.py",
    "chars": 1917,
    "preview": "# dataset settings\ndataset_type = 'STAREDataset'\ndata_root = 'data/STARE'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/default_runtime.py",
    "chars": 321,
    "preview": "# yapf:disable\nlog_config = dict(\n    interval=50,\n    hooks=[\n        dict(type='TextLoggerHook', by_epoch=False),\n    "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/ann_r50-d8.py",
    "chars": 1346,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/apcnet_r50-d8.py",
    "chars": 1302,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/ccnet_r50-d8.py",
    "chars": 1258,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/cgnet.py",
    "chars": 1110,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/danet_r50-d8.py",
    "chars": 1261,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/deeplabv3_r50-d8.py",
    "chars": 1273,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/deeplabv3_unet_s5-d16.py",
    "chars": 1499,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/deeplabv3plus_r50-d8.py",
    "chars": 1343,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/dmnet_r50-d8.py",
    "chars": 1302,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/dnl_r50-d8.py",
    "chars": 1316,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/emanet_r50-d8.py",
    "chars": 1329,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/encnet_r50-d8.py",
    "chars": 1435,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/fast_scnn.py",
    "chars": 1761,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)\nmodel = dict(\n    type='EncoderDecode"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/fcn_hr18.py",
    "chars": 1646,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/fcn_r50-d8.py",
    "chars": 1285,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/fcn_unet_s5-d16.py",
    "chars": 1512,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/fpn_r50.py",
    "chars": 1056,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/gcnet_r50-d8.py",
    "chars": 1326,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/lraspp_m-v3-d8.py",
    "chars": 766,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/nonlocal_r50-d8.py",
    "chars": 1315,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/ocrnet_hr18.py",
    "chars": 2196,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='CascadeEncoderDecoder',\n    "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/ocrnet_r50-d8.py",
    "chars": 1385,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='CascadeEncoderDecoder',\n    "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/pointrend_r50.py",
    "chars": 1704,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='CascadeEncoderDecoder',\n    "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/psanet_r50-d8.py",
    "chars": 1406,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/pspnet_r50-d8.py",
    "chars": 1271,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/pspnet_unet_s5-d16.py",
    "chars": 1497,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/upernet_cae.py",
    "chars": 1834,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/models/upernet_r50.py",
    "chars": 1301,
    "preview": "# model settings\nnorm_cfg = dict(type='SyncBN', requires_grad=True)\nmodel = dict(\n    type='EncoderDecoder',\n    pretrai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/schedules/schedule_160k.py",
    "chars": 382,
    "preview": "# optimizer\noptimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)\noptimizer_config = dict()\n# learnin"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/schedules/schedule_20k.py",
    "chars": 379,
    "preview": "# optimizer\noptimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)\noptimizer_config = dict()\n# learnin"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/schedules/schedule_320k.py",
    "chars": 382,
    "preview": "# optimizer\noptimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)\noptimizer_config = dict()\n# learnin"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/schedules/schedule_40k.py",
    "chars": 379,
    "preview": "# optimizer\noptimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)\noptimizer_config = dict()\n# learnin"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/_base_/schedules/schedule_80k.py",
    "chars": 379,
    "preview": "# optimizer\noptimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)\noptimizer_config = dict()\n# learnin"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/beit/upernet_beit_base_12_512_slide_160k_ade20k_pt_4e-4.py",
    "chars": 3715,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/cae/upernet/upernet_cae_base_12_512_slide_160k_ade20k_pt_1e-4.py",
    "chars": 3016,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/cae/upernet/upernet_cae_base_12_512_slide_160k_ade20k_pt_2e-4.py",
    "chars": 3016,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/cae/upernet/upernet_cae_base_12_512_slide_160k_ade20k_pt_3e-4.py",
    "chars": 3016,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/cae/upernet/upernet_cae_large_24_512_slide_160k_ade20k_pt_decay095_4e-5_dp015.py",
    "chars": 3690,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/configs_local/mae/upernet_mae_large_12_512_slide_160k_ade20k_pt_4e-4.py",
    "chars": 3687,
    "preview": "# --------------------------------------------------------\n# BEIT: BERT Pre-Training of Image Transformers (https://arxi"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/__init__.py",
    "chars": 397,
    "preview": "# -*- coding: utf-8 -*-\n\nfrom .checkpoint import load_checkpoint\nfrom .layer_decay_optimizer_constructor import LayerDec"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/apex_runner/__init__.py",
    "chars": 206,
    "preview": "# Copyright (c) Open-MMLab. All rights reserved.\nfrom .checkpoint import save_checkpoint\nfrom .apex_iter_based_runner im"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/apex_runner/apex_iter_based_runner.py",
    "chars": 3884,
    "preview": "# Copyright (c) Open-MMLab. All rights reserved.\nimport os.path as osp\nimport platform\nimport shutil\n\nimport torch\nfrom "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/apex_runner/checkpoint.py",
    "chars": 2883,
    "preview": "# Copyright (c) Open-MMLab. All rights reserved.\nimport os.path as osp\nimport time\nfrom tempfile import TemporaryDirecto"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/apex_runner/optimizer.py",
    "chars": 1158,
    "preview": "from mmcv.runner import OptimizerHook, HOOKS\ntry:\n    import apex\nexcept:\n    print('apex is not installed')\n\n\n@HOOKS.re"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/checkpoint.py",
    "chars": 26461,
    "preview": "# Copyright (c) Open-MMLab. All rights reserved.\nimport io\nimport os\nimport os.path as osp\nimport pkgutil\nimport time\nim"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/checkpoint_beit.py",
    "chars": 25720,
    "preview": "# Copyright (c) Open-MMLab. All rights reserved.\nimport io\nimport os\nimport os.path as osp\nimport pkgutil\nimport time\nim"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/layer_decay_optimizer_constructor.py",
    "chars": 3653,
    "preview": "import json\nfrom mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor\nfrom mmcv.runner import get_dist_inf"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/resize_transform.py",
    "chars": 9164,
    "preview": "import mmcv\nimport numpy as np\n\nfrom mmseg.datasets.builder import PIPELINES\n\n\n@PIPELINES.register_module()\nclass SETR_R"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmcv_custom/train_api.py",
    "chars": 4416,
    "preview": "import random\nimport warnings\n\nimport numpy as np\nimport torch\nfrom mmcv.parallel import MMDataParallel, MMDistributedDa"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/__init__.py",
    "chars": 850,
    "preview": "import mmcv\n\nfrom .version import __version__, version_info\n\nMMCV_MIN = '1.1.4'\nMMCV_MAX = '1.3.0'\n\n\ndef digit_version(v"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/apis/__init__.py",
    "chars": 381,
    "preview": "from .inference import inference_segmentor, init_segmentor, show_result_pyplot\nfrom .test import multi_gpu_test, single_"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/apis/inference.py",
    "chars": 4019,
    "preview": "import matplotlib.pyplot as plt\nimport mmcv\nimport torch\nfrom mmcv.parallel import collate, scatter\nfrom mmcv.runner imp"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/apis/test.py",
    "chars": 8039,
    "preview": "import os.path as osp\nimport pickle\nimport shutil\nimport tempfile\n\nimport mmcv\nimport numpy as np\nimport torch\nimport to"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/apis/train.py",
    "chars": 3919,
    "preview": "import random\nimport warnings\n\nimport numpy as np\nimport torch\nfrom mmcv.parallel import MMDataParallel, MMDistributedDa"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/__init__.py",
    "chars": 126,
    "preview": "from .evaluation import *  # noqa: F401, F403\nfrom .seg import *  # noqa: F401, F403\nfrom .utils import *  # noqa: F401,"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/evaluation/__init__.py",
    "chars": 273,
    "preview": "from .class_names import get_classes, get_palette\nfrom .eval_hooks import DistEvalHook, EvalHook\nfrom .metrics import ev"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/evaluation/class_names.py",
    "chars": 7277,
    "preview": "import mmcv\n\n\ndef cityscapes_classes():\n    \"\"\"Cityscapes class names for external use.\"\"\"\n    return [\n        'road', "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/evaluation/eval_hooks.py",
    "chars": 3858,
    "preview": "import os.path as osp\n\nfrom mmcv.runner import Hook\nfrom torch.utils.data import DataLoader\n\n\nclass EvalHook(Hook):\n    "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/evaluation/metrics.py",
    "chars": 9228,
    "preview": "import mmcv\nimport numpy as np\n\n\ndef intersect_and_union(pred_label,\n                        label,\n                    "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/seg/__init__.py",
    "chars": 172,
    "preview": "from .builder import build_pixel_sampler\nfrom .sampler import BasePixelSampler, OHEMPixelSampler\n\n__all__ = ['build_pixe"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/seg/builder.py",
    "chars": 253,
    "preview": "from mmcv.utils import Registry, build_from_cfg\n\nPIXEL_SAMPLERS = Registry('pixel sampler')\n\n\ndef build_pixel_sampler(cf"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/seg/sampler/__init__.py",
    "chars": 150,
    "preview": "from .base_pixel_sampler import BasePixelSampler\nfrom .ohem_pixel_sampler import OHEMPixelSampler\n\n__all__ = ['BasePixel"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/seg/sampler/base_pixel_sampler.py",
    "chars": 297,
    "preview": "from abc import ABCMeta, abstractmethod\n\n\nclass BasePixelSampler(metaclass=ABCMeta):\n    \"\"\"Base class of pixel sampler."
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/seg/sampler/ohem_pixel_sampler.py",
    "chars": 3155,
    "preview": "import torch\nimport torch.nn.functional as F\n\nfrom ..builder import PIXEL_SAMPLERS\nfrom .base_pixel_sampler import BaseP"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/utils/__init__.py",
    "chars": 55,
    "preview": "from .misc import add_prefix\n\n__all__ = ['add_prefix']\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/core/utils/misc.py",
    "chars": 371,
    "preview": "def add_prefix(inputs, prefix):\n    \"\"\"Add prefix for dict.\n\n    Args:\n        inputs (dict): The input dict with str ke"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/__init__.py",
    "chars": 806,
    "preview": "from .ade import ADE20KDataset\nfrom .builder import DATASETS, PIPELINES, build_dataloader, build_dataset\nfrom .chase_db1"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/ade.py",
    "chars": 5185,
    "preview": "from .builder import DATASETS\nfrom .custom import CustomDataset\n\n\n@DATASETS.register_module()\nclass ADE20KDataset(Custom"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/builder.py",
    "chars": 5871,
    "preview": "import copy\nimport platform\nimport random\nfrom functools import partial\n\nimport numpy as np\nfrom mmcv.parallel import co"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/chase_db1.py",
    "chars": 781,
    "preview": "import os.path as osp\n\nfrom .builder import DATASETS\nfrom .custom import CustomDataset\n\n\n@DATASETS.register_module()\ncla"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/cityscapes.py",
    "chars": 8446,
    "preview": "import os.path as osp\nimport tempfile\n\nimport mmcv\nimport numpy as np\nfrom mmcv.utils import print_log\nfrom PIL import I"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/coco_stuff.py",
    "chars": 6108,
    "preview": "from .builder import DATASETS\nfrom .custom import CustomDataset\n\n\n@DATASETS.register_module()\nclass COCOStuffDataset(Cus"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/custom.py",
    "chars": 14193,
    "preview": "import os\nimport os.path as osp\nfrom functools import reduce\n\nimport mmcv\nimport numpy as np\nfrom mmcv.utils import prin"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/dataset_wrappers.py",
    "chars": 1499,
    "preview": "from torch.utils.data.dataset import ConcatDataset as _ConcatDataset\n\nfrom .builder import DATASETS\n\n\n@DATASETS.register"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/drive.py",
    "chars": 771,
    "preview": "import os.path as osp\n\nfrom .builder import DATASETS\nfrom .custom import CustomDataset\n\n\n@DATASETS.register_module()\ncla"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/hrf.py",
    "chars": 747,
    "preview": "import os.path as osp\n\nfrom .builder import DATASETS\nfrom .custom import CustomDataset\n\n\n@DATASETS.register_module()\ncla"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/pascal_context.py",
    "chars": 2666,
    "preview": "import os.path as osp\n\nfrom .builder import DATASETS\nfrom .custom import CustomDataset\n\n\n@DATASETS.register_module()\ncla"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/__init__.py",
    "chars": 813,
    "preview": "from .compose import Compose\nfrom .formating import (Collect, ImageToTensor, ToDataContainer, ToTensor,\n                "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/compose.py",
    "chars": 1464,
    "preview": "import collections\n\nfrom mmcv.utils import build_from_cfg\n\nfrom ..builder import PIPELINES\n\n\n@PIPELINES.register_module("
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/formating.py",
    "chars": 9228,
    "preview": "from collections.abc import Sequence\n\nimport mmcv\nimport numpy as np\nimport torch\nfrom mmcv.parallel import DataContaine"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/loading.py",
    "chars": 5873,
    "preview": "import os.path as osp\n\nimport mmcv\nimport numpy as np\n\nfrom ..builder import PIPELINES\n\n\n@PIPELINES.register_module()\ncl"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/test_time_aug.py",
    "chars": 5173,
    "preview": "import warnings\n\nimport mmcv\n\nfrom ..builder import PIPELINES\nfrom .compose import Compose\n\n\n@PIPELINES.register_module("
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/pipelines/transforms.py",
    "chars": 30974,
    "preview": "import mmcv\nimport numpy as np\nfrom mmcv.utils import deprecated_api_warning, is_tuple_of\nfrom numpy import random\n\nfrom"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/stare.py",
    "chars": 761,
    "preview": "import os.path as osp\n\nfrom .builder import DATASETS\nfrom .custom import CustomDataset\n\n\n@DATASETS.register_module()\ncla"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/datasets/voc.py",
    "chars": 1130,
    "preview": "import os.path as osp\n\nfrom .builder import DATASETS\nfrom .custom import CustomDataset\n\n\n@DATASETS.register_module()\ncla"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/__init__.py",
    "chars": 489,
    "preview": "from .backbones import *  # noqa: F401,F403\nfrom .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone,\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/__init__.py",
    "chars": 436,
    "preview": "from .cgnet import CGNet\nfrom .fast_scnn import FastSCNN\nfrom .hrnet import HRNet\nfrom .mobilenet_v2 import MobileNetV2\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/cgnet.py",
    "chars": 13105,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.utils.checkpoint as cp\nfrom mmcv.cnn import (ConvModule, build_conv_laye"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/fast_scnn.py",
    "chars": 14376,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, constant_init,\n      "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/hrnet.py",
    "chars": 21107,
    "preview": "import torch.nn as nn\nfrom mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init,\n                      kai"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/mobilenet_v2.py",
    "chars": 6941,
    "preview": "import logging\n\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, constant_init, kaiming_init\nfrom mmcv.runner impo"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/mobilenet_v3.py",
    "chars": 10303,
    "preview": "import logging\n\nimport mmcv\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, constant_init, kaiming_init\nfrom mmcv"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/resnest.py",
    "chars": 10090,
    "preview": "import math\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.utils.checkpoint as cp\nfrom"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/resnet.py",
    "chars": 24210,
    "preview": "import torch.nn as nn\nimport torch.utils.checkpoint as cp\nfrom mmcv.cnn import (build_conv_layer, build_norm_layer, buil"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/resnext.py",
    "chars": 5121,
    "preview": "import math\n\nfrom mmcv.cnn import build_conv_layer, build_norm_layer\n\nfrom ..builder import BACKBONES\nfrom ..utils impor"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/backbones/unet.py",
    "chars": 18141,
    "preview": "import torch.nn as nn\nimport torch.utils.checkpoint as cp\nfrom mmcv.cnn import (UPSAMPLE_LAYERS, ConvModule, build_activ"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/builder.py",
    "chars": 1822,
    "preview": "import warnings\n\nfrom mmcv.utils import Registry, build_from_cfg\nfrom torch import nn\n\nBACKBONES = Registry('backbone')\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/__init__.py",
    "chars": 980,
    "preview": "from .ann_head import ANNHead\nfrom .apc_head import APCHead\nfrom .aspp_head import ASPPHead\nfrom .cc_head import CCHead\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/ann_head.py",
    "chars": 9174,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule\n\nfrom ..builder import HEADS\nfrom ..utils import Self"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/apc_head.py",
    "chars": 5531,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule\n\nfrom mmseg.ops impor"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/aspp_head.py",
    "chars": 3419,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule\n\nfrom mmseg.ops import resize\nfrom ..builder import H"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/cascade_decode_head.py",
    "chars": 2351,
    "preview": "from abc import ABCMeta, abstractmethod\n\nfrom .decode_head import BaseDecodeHead\n\n\nclass BaseCascadeDecodeHead(BaseDecod"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/cc_head.py",
    "chars": 1283,
    "preview": "import torch\n\nfrom ..builder import HEADS\nfrom .fcn_head import FCNHead\n\ntry:\n    from mmcv.ops import CrissCrossAttenti"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/da_head.py",
    "chars": 5545,
    "preview": "import torch\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule, Scale\nfrom torch import nn\n\nfrom mmseg.cor"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/decode_head.py",
    "chars": 9160,
    "preview": "from abc import ABCMeta, abstractmethod\n\nimport torch\nimport torch.nn as nn\nfrom mmcv.cnn import normal_init\nfrom mmcv.r"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/dm_head.py",
    "chars": 4978,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule, build_activation_lay"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/dnl_head.py",
    "chars": 4571,
    "preview": "import torch\nfrom mmcv.cnn import NonLocal2d\nfrom torch import nn\n\nfrom ..builder import HEADS\nfrom .fcn_head import FCN"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/ema_head.py",
    "chars": 5776,
    "preview": "import math\n\nimport torch\nimport torch.distributed as dist\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mm"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/enc_head.py",
    "chars": 6744,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule, build_norm_layer\n\nfr"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/fcn_head.py",
    "chars": 2525,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule\n\nfrom ..builder import HEADS\nfrom .decode_head import"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/fpn_head.py",
    "chars": 2382,
    "preview": "import numpy as np\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule\n\nfrom mmseg.ops import resize\nfrom ..builder im"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/gc_head.py",
    "chars": 1591,
    "preview": "import torch\nfrom mmcv.cnn import ContextBlock\n\nfrom ..builder import HEADS\nfrom .fcn_head import FCNHead\n\n\n@HEADS.regis"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/lraspp_head.py",
    "chars": 3038,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv import is_tuple_of\nfrom mmcv.cnn import ConvModule\n\nfrom mmseg.ops import r"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/nl_head.py",
    "chars": 1557,
    "preview": "import torch\nfrom mmcv.cnn import NonLocal2d\n\nfrom ..builder import HEADS\nfrom .fcn_head import FCNHead\n\n\n@HEADS.registe"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/ocr_head.py",
    "chars": 4279,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule\n\nfrom mmseg.ops impor"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/point_head.py",
    "chars": 14674,
    "preview": "# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py  "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/psa_head.py",
    "chars": 7484,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule\n\nfrom mmseg.ops impor"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/psp_head.py",
    "chars": 3312,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule\n\nfrom mmseg.ops import resize\nfrom ..builder import H"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/sep_aspp_head.py",
    "chars": 3487,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, DepthwiseSeparableConvModule\n\nfrom mmseg.ops import "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/sep_fcn_head.py",
    "chars": 2004,
    "preview": "from mmcv.cnn import DepthwiseSeparableConvModule\n\nfrom ..builder import HEADS\nfrom .fcn_head import FCNHead\n\n\n@HEADS.re"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/decode_heads/uper_head.py",
    "chars": 3972,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule\n\nfrom mmseg.ops import resize\nfrom ..builder import H"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/losses/__init__.py",
    "chars": 485,
    "preview": "from .accuracy import Accuracy, accuracy\nfrom .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy,\n      "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/losses/accuracy.py",
    "chars": 2967,
    "preview": "import torch.nn as nn\n\n\ndef accuracy(pred, target, topk=1, thresh=None):\n    \"\"\"Calculate accuracy according to the pred"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/losses/cross_entropy_loss.py",
    "chars": 7354,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ..builder import LOSSES\nfrom .utils import weig"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/losses/lovasz_loss.py",
    "chars": 11313,
    "preview": "\"\"\"Modified from https://github.com/bermanmaxim/LovaszSoftmax/blob/master/pytor\nch/lovasz_losses.py Lovasz-Softmax and J"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/losses/utils.py",
    "chars": 3147,
    "preview": "import functools\n\nimport torch.nn.functional as F\n\n\ndef reduce_loss(loss, reduction):\n    \"\"\"Reduce loss as specified.\n\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/necks/__init__.py",
    "chars": 40,
    "preview": "from .fpn import FPN\n\n__all__ = ['FPN']\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/necks/fpn.py",
    "chars": 9139,
    "preview": "import torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule, xavier_init\n\nfrom ..builder impor"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/segmentors/__init__.py",
    "chars": 158,
    "preview": "from .cascade_encoder_decoder import CascadeEncoderDecoder\nfrom .encoder_decoder import EncoderDecoder\n\n__all__ = ['Enco"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/segmentors/base.py",
    "chars": 10134,
    "preview": "import logging\nimport warnings\nfrom abc import ABCMeta, abstractmethod\nfrom collections import OrderedDict\n\nimport mmcv\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/segmentors/cascade_encoder_decoder.py",
    "chars": 3668,
    "preview": "from torch import nn\n\nfrom mmseg.core import add_prefix\nfrom mmseg.ops import resize\nfrom .. import builder\nfrom ..build"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/segmentors/encoder_decoder.py",
    "chars": 11129,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom mmseg.core import add_prefix\nfrom mmseg.ops imp"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/utils/__init__.py",
    "chars": 366,
    "preview": "from .inverted_residual import InvertedResidual, InvertedResidualV3\nfrom .make_divisible import make_divisible\nfrom .res"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/utils/inverted_residual.py",
    "chars": 7010,
    "preview": "from mmcv.cnn import ConvModule\nfrom torch import nn as nn\nfrom torch.utils import checkpoint as cp\n\nfrom .se_layer impo"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/utils/make_divisible.py",
    "chars": 1231,
    "preview": "def make_divisible(value, divisor, min_value=None, min_ratio=0.9):\n    \"\"\"Make divisible function.\n\n    This function ro"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/utils/res_layer.py",
    "chars": 3315,
    "preview": "from mmcv.cnn import build_conv_layer, build_norm_layer\nfrom torch import nn as nn\n\n\nclass ResLayer(nn.Sequential):\n    "
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/utils/se_layer.py",
    "chars": 2109,
    "preview": "import mmcv\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule\n\nfrom .make_divisible import make_divisible\n\n\nclass SE"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/utils/self_attention_block.py",
    "chars": 6125,
    "preview": "import torch\nfrom mmcv.cnn import ConvModule, constant_init\nfrom torch import nn as nn\nfrom torch.nn import functional a"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/models/utils/up_conv_block.py",
    "chars": 3967,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, build_upsample_layer\n\n\nclass UpConvBlock(nn.Module):"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/ops/__init__.py",
    "chars": 116,
    "preview": "from .encoding import Encoding\nfrom .wrappers import Upsample, resize\n\n__all__ = ['Upsample', 'resize', 'Encoding']\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/ops/encoding.py",
    "chars": 2790,
    "preview": "import torch\nfrom torch import nn as nn\nfrom torch.nn import functional as F\n\n\nclass Encoding(nn.Module):\n    \"\"\"Encodin"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/ops/wrappers.py",
    "chars": 1920,
    "preview": "import warnings\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\ndef resize(input,\n           size="
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/utils/__init__.py",
    "chars": 119,
    "preview": "from .collect_env import collect_env\nfrom .logger import get_root_logger\n\n__all__ = ['get_root_logger', 'collect_env']\n"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/utils/collect_env.py",
    "chars": 436,
    "preview": "from mmcv.utils import collect_env as collect_base_env\nfrom mmcv.utils import get_git_hash\n\nimport mmseg\n\n\ndef collect_e"
  },
  {
    "path": "downstream_tasks/semantic_segmentation/mmseg/utils/logger.py",
    "chars": 899,
    "preview": "import logging\n\nfrom mmcv.utils import get_logger\n\n\ndef get_root_logger(log_file=None, log_level=logging.INFO):\n    \"\"\"G"
  }
]

// ... and 36 more files (download for full content)

About this extraction

This page contains the full source code of the lxtGH/CAE GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 236 files (1.2 MB), approximately 299.9k tokens, and a symbol index with 1281 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo