Repository: lxtGH/Video-K-Net
Branch: main
Commit: a69340321f47
Files: 237
Total size: 1.8 MB

Directory structure:
gitextract_0ef7ckct/

├── .gitignore
├── DATASET.md
├── LICENSE
├── README.md
├── configs/
│   ├── det/
│   │   ├── _base_/
│   │   │   ├── datasets/
│   │   │   │   ├── cityscapes_panoptic.py
│   │   │   │   ├── cityscapes_step.py
│   │   │   │   ├── cityscapes_vps_clips.py
│   │   │   │   ├── cityscapes_vps_clips_trainval.py
│   │   │   │   ├── coco_instance.py
│   │   │   │   ├── coco_panoptic.py
│   │   │   │   ├── coco_panoptic_instance_annotations.py
│   │   │   │   ├── kitti_step_dvps.py
│   │   │   │   ├── kitti_step_vps.py
│   │   │   │   ├── kitti_step_vps_trainval.py
│   │   │   │   ├── mapillary_panoptic.py
│   │   │   │   └── vipseg_dvps.py
│   │   │   ├── default_runtime.py
│   │   │   ├── models/
│   │   │   │   ├── knet_citystep_s3_r50_fpn.py
│   │   │   │   ├── knet_kitti_step_s3_r50_fpn.py
│   │   │   │   ├── knet_s3_r50_deformable_fpn.py
│   │   │   │   ├── knet_s3_r50_fpn.py
│   │   │   │   ├── knet_s3_r50_fpn_panoptic.py
│   │   │   │   ├── knet_vipseg_s3_r50_fpn.py
│   │   │   │   └── video_knet_s3_r50_fpn_panoptic.py
│   │   │   └── schedules/
│   │   │       ├── schedule_10e.py
│   │   │       └── schedule_1x.py
│   │   ├── coco/
│   │   │   ├── knet_s3_r50_deformable_fpn_ms-3x_coco.py
│   │   │   ├── knet_s3_r50_fpn_ms-3x_coco-panoptic.py
│   │   │   ├── knet_s3_r50_fpn_ms-3x_coco.py
│   │   │   └── knet_s3_swin-b_deformable_fpn_ms-3x_coco.py
│   │   ├── common/
│   │   │   ├── lsj_coco_panoptic_50e.py
│   │   │   ├── mstrain_3x_coco_instance.py
│   │   │   ├── mstrain_3x_coco_panoptic_inst_anno.py
│   │   │   ├── mstrain_3x_coco_panoptic_inst_anno_detr_aug.py
│   │   │   └── mstrain_64e_city_panoptic.py
│   │   ├── knet_cityscapes_step/
│   │   │   ├── knet_s3_r50_fpn.py
│   │   │   ├── knet_s3_swin_b_fpn.py
│   │   │   └── knet_s3_swin_l_fpn.py
│   │   ├── video_knet_kitti_step/
│   │   │   ├── video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py
│   │   │   ├── video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train_8e.py
│   │   │   ├── video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
│   │   │   ├── video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
│   │   │   └── video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_update_conv_short_track_fc.py
│   │   └── video_knet_vipseg/
│   │       ├── video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py
│   │       └── video_knet_s3_swin_b_rpn_vipseg_mask_embed_link_ffn_joint_train_8e.py
│   └── video_knet_vis/
│       ├── _base_/
│       │   ├── datasets/
│       │   │   ├── coco_instance.py
│       │   │   └── youtubevis_2019.py
│       │   ├── default_runtime.py
│       │   ├── models/
│       │   │   ├── knet_track_r50.py
│       │   │   └── knet_track_r50_deformablefpn.py
│       │   └── schedules/
│       │       ├── schedule_0.75x.py
│       │       ├── schedule_1x.py
│       │       └── schedule_8e.py
│       ├── common/
│       │   └── mstrain_3x_coco_instance.py
│       └── video_knet_vis/
│           ├── knet_track_r50_1x_youtubevis.py
│           ├── knet_track_r50_deformable_fpn_1x_youtubevis.py
│           ├── knet_track_swinb_1x_youtubevis_8e.py
│           └── knet_track_swinb_deformable_1x_youtubevis.py
├── external/
│   ├── cityscape_panoptic.py
│   ├── cityscapes_step.py
│   ├── cityscapes_vps.py
│   ├── coco_panoptic.py
│   ├── dataset/
│   │   ├── dvps_pipelines/
│   │   │   ├── __init__.py
│   │   │   ├── loading.py
│   │   │   ├── transforms.py
│   │   │   └── tricks.py
│   │   ├── forecasting_pipelines/
│   │   │   ├── __init__.py
│   │   │   ├── loading.py
│   │   │   └── transforms.py
│   │   ├── mIoU.py
│   │   └── pipelines/
│   │       ├── __init__.py
│   │       ├── formatting.py
│   │       ├── loading.py
│   │       ├── test_time_aug.py
│   │       └── transforms.py
│   ├── evalhooks.py
│   ├── ext/
│   │   ├── mask.py
│   │   └── ytvos.py
│   ├── fcn_mask_head.py
│   ├── kitti_step_dvps.py
│   ├── panoptic_fpn.py
│   ├── panoptic_head.py
│   ├── semantic_seg_head.py
│   ├── semkitti_dvps.py
│   ├── test.py
│   ├── train.py
│   ├── utils.py
│   └── vipseg_dvps.py
├── knet/
│   ├── __init__.py
│   ├── cross_entropy_loss.py
│   ├── det/
│   │   ├── dice_loss.py
│   │   ├── kernel_head.py
│   │   ├── kernel_iter_head.py
│   │   ├── kernel_update_head.py
│   │   ├── knet.py
│   │   ├── mask_hungarian_assigner.py
│   │   ├── mask_pseudo_sampler.py
│   │   ├── msdeformattn_decoder.py
│   │   ├── semantic_fpn_wrapper.py
│   │   └── utils.py
│   ├── kernel_updator.py
│   └── video/
│       ├── __init__.py
│       ├── dice_loss.py
│       ├── kernel_head.py
│       ├── kernel_iter_head.py
│       ├── kernel_update_head.py
│       ├── knet.py
│       ├── knet_quansi_dense.py
│       ├── knet_quansi_dense_embed_fc.py
│       ├── knet_quansi_dense_embed_fc_joint_train.py
│       ├── knet_quansi_dense_embed_fc_toy_exp.py
│       ├── knet_quansi_dense_roi_gt_box.py
│       ├── knet_quansi_dense_roi_gt_box_joint_train.py
│       ├── knet_track_head.py
│       ├── knet_track_head_roi_align.py
│       ├── knet_uni_track.py
│       ├── mask_hungarian_assigner.py
│       ├── mask_pseudo_sampler.py
│       ├── qdtrack/
│       │   ├── builder.py
│       │   ├── losses/
│       │   │   ├── __init__.py
│       │   │   ├── l2_loss.py
│       │   │   └── multipos_cross_entropy_loss.py
│       │   ├── track/
│       │   │   ├── __init__.py
│       │   │   ├── similarity.py
│       │   │   └── transforms.py
│       │   └── trackers/
│       │       ├── __init__.py
│       │       ├── quasi_dense_embed_tracker.py
│       │       └── tao_tracker.py
│       ├── track_heads.py
│       ├── tracker.py
│       └── util.py
├── knet_vis/
│   ├── __init__.py
│   ├── det/
│   │   ├── __init__.py
│   │   ├── kernel_head.py
│   │   ├── kernel_iter_head.py
│   │   ├── kernel_update_head.py
│   │   ├── knet.py
│   │   ├── mask_hungarian_assigner.py
│   │   ├── mask_pseudo_sampler.py
│   │   ├── semantic_fpn_wrapper.py
│   │   └── utils.py
│   ├── kernel_updator.py
│   └── tracker/
│       ├── __init__.py
│       ├── kernel_frame_head.py
│       ├── kernel_frame_iter_head.py
│       ├── kernel_head.py
│       ├── kernel_iter_head.py
│       ├── kernel_update_head.py
│       ├── mask_hungarian_assigner.py
│       ├── positional_encoding.py
│       ├── semantic_fpn_wrapper3D.py
│       └── track.py
├── mmtrack/
│   ├── datasets/
│   │   ├── coco_video_dataset.py
│   │   ├── parsers/
│   │   │   ├── __init__.py
│   │   │   └── coco_video_parser.py
│   │   └── youtube_vis_dataset.py
│   ├── pipelines/
│   │   ├── __init__.py
│   │   ├── formatting.py
│   │   ├── loading.py
│   │   ├── test_time_aug.py
│   │   └── transforms.py
│   └── transform.py
├── scripts/
│   ├── kitti_step_prepare.py
│   └── visualizer.py
├── swin/
│   ├── DetectRS.py
│   ├── ckpt_convert.py
│   ├── mix_transformer.py
│   ├── swin_checkpoint.py
│   ├── swin_transformer.py
│   ├── swin_transformer_rfp.py
│   └── transformer.py
├── tools/
│   ├── dataset/
│   │   ├── cityscapes_instance_idmap.py
│   │   └── youtubevis2coco.py
│   ├── dist_step_test.sh
│   ├── dist_test.sh
│   ├── dist_train.sh
│   ├── dist_train_new.sh
│   ├── dist_vps_test.sh
│   ├── docker.sh
│   ├── eval_dstq.py
│   ├── eval_dstq_step.py
│   ├── eval_dstq_vipseg.py
│   ├── eval_dvpq_step.py
│   ├── eval_dvpq_vipseg.py
│   ├── flops_counter.py
│   ├── get_flops.py
│   ├── inference_kitti_step.sh
│   ├── slurm_test.sh
│   ├── slurm_test_dvps.sh
│   ├── slurm_test_step.sh
│   ├── slurm_test_vis.sh
│   ├── slurm_test_vps.sh
│   ├── slurm_train.sh
│   ├── test.py
│   ├── test_dvps.py
│   ├── test_step.py
│   ├── test_vps.py
│   ├── train.py
│   ├── utils/
│   │   ├── DSTQ.py
│   │   ├── STQ.py
│   │   └── cityscapesvps_eval.py
│   └── visualization.py
├── tools_vis/
│   ├── apis/
│   │   ├── __init__.py
│   │   └── test.py
│   ├── dist_test_whole_video.sh
│   ├── docker.sh
│   ├── slurm_test_vis.sh
│   ├── test.py
│   └── test_whole_video.py
└── unitrack/
    ├── __init__.py
    ├── basetrack.py
    ├── box.py
    ├── core/
    │   ├── __init__.py
    │   ├── association/
    │   │   ├── __init__.py
    │   │   └── matching.py
    │   ├── motion/
    │   │   └── kalman_filter.py
    │   └── propagation/
    │       ├── __init__.py
    │       ├── propagate_box.py
    │       ├── propagate_mask.py
    │       └── propagate_pose.py
    ├── mask.py
    ├── mask_with_train_embs.py
    ├── model/
    │   ├── __init__.py
    │   ├── functional.py
    │   ├── hrnet.py
    │   ├── model.py
    │   ├── random_feat_generator.py
    │   └── resnet.py
    ├── multitracker.py
    └── utils/
        ├── __init__.py
        ├── box.py
        ├── io.py
        ├── log.py
        ├── mask.py
        ├── meter.py
        ├── palette.py
        └── visualize.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
work_dir/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

data/
data
.vscode
.idea
.DS_Store

# custom
*.pkl
*.pkl.json
*.log.json

# Pytorch
*.pth
*.py~
*.sh~

debug/*
vis/
analysis/*
pretrain/*


================================================
FILE: DATASET.md
================================================
Please prepare the data structure according to the following instructions:

The final dataset folder should be like this. 
```
root 
├── data
│   ├──  kitti-step
│   ├──  coco
│   ├──  VIPSeg
│   ├──  youtube_vis_2019
│   ├──  cityscapes
```

### [VPS] KITTI-STEP

Download the KITTI-STEP from the official website. 

Then run the script `scripts/kitti_step_prepare.py`.
You will get the format shown below.
You can also get our pre-processed data at https://huggingface.co/LXT/VideoK-Net/tree/main

```
├── kitti-step
│   ├──  video_sequence
│   │   ├── train
            ├──00018_000331_leftImg8bit.png
            ├──000018_000331_panoptic.png
            ├──****
│   │   ├── val
│   │   ├── test 
```


### [VPS] VIPSeg

Download the original dataset from the official repo.\
Following the official repo, we use resized videos for training and evaluation (the short side of the input is set to 720 while the aspect ratio is kept).

```
├── VIPSeg
│   ├──  images
│   │   ├── 1241_qYvEuwrSiXc
        │      ├──*.jpg
│   ├──  panomasks 
│   │   ├── 1241_qYvEuwrSiXc
        │      ├──*.png
│   ├──  panomasksRGB 
```


### [VIS] Youtube-VIS-2019
We use pre-processed JSON files following the mmtracking codebase;
see "tools/dataset/youtubevis2coco.py".

```
├── youtube_vis_2019
│   ├── annotations
│   │   ├── train.json
│   │   ├── valid.json
│   │   ├── youtube_vis_2019_train.json
│   │   ├── youtube_vis_2019_valid.json
│   ├── train
│   │   ├──JPEGImages
│   │   │   ├──video folders
│   ├── valid
│   │   ├──JPEGImages
│   │   │   ├──video folders
```


### [VSS] VSPW

To do


### [VPS] Cityscapes 

For Cityscapes-VPS and Cityscapes-DVPS, we suggest that followers refer to the related work listed below.
The Video K-Net model will not be released due to patent issues and internal usage.

You can find our related works. ECCV-2022, PolyphonicFormer: A Unified Framework For Panoptic Segmentation + Depth Estimation (winner of ICCV-2021 BMTT workshop)
(https://github.com/HarborYuan/PolyphonicFormer)


## Image DataSet For Pretraining K-Net

### COCO dataset

COCO is one of the most common datasets. It contains 80 thing classes and 54 stuff classes.

The dataset format is the same as the original [Detectron2](https://github.com/facebookresearch/detectron2),
including the panoptic segmentation preparation [scripts](https://github.com/facebookresearch/detectron2/blob/master/datasets/prepare_panoptic_fpn.py).

Then the final folder is like this:
```
├── coco
│   ├── annotations
│   │   ├── panoptic_{train,val}2017.json
│   │   ├── instance_{train,val}2017.json
│   ├── train2017
│   ├── val2017
│   ├── panoptic_{train,val}2017/  # png annotations
```

### Cityscapes dataset

Cityscapes dataset is a high-resolution road-scene dataset which contains 19 classes. 
(8 thing classes and 11 stuff classes). 2975 images for training, 500 images for validation and 1525 images for testing.

Preparing the Cityscapes dataset takes three steps:

1, Convert the segmentation id maps (original label id maps) to trainId maps (id range: 0-18 for training) using
the official scripts [repo](https://github.com/mcordts/cityscapesScripts)

2, Then run `python dataset/prepare_cityscapes.py` to generate the COCO-like annotations.
These annotations can be used for Instance Segmentation training.

using csCreateTrainIdLabelImgs.py

and put the instancesonly_filtered_gtFine_train.json into annotations folder


3, For the Panoptic Segmentation dataset, generate the json file

using csCreatePanopticImgs.py 

or you can download our transformed .json and .png files via link: () and put the
json file into the annotations folder.

Then the final folder is like this:

```
├── cityscapes
│   ├── annotations
│   │   ├── instancesonly_filtered_gtFine_train.json # coco instance annotation file(COCO format)
│   │   ├── instancesonly_filtered_gtFine_val.json
│   │   ├── cityscapes_panoptic_train.json  # panoptic json file 
│   │   ├── cityscapes_panoptic_val.json  
│   ├── leftImg8bit
│   ├── gtFine
│   │   ├──cityscapes_panoptic_{train,val}/  # png annotations
│   │   
```


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2022 Xiangtai  Lee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# Video K-Net: A Simple, Strong, and Unified Baseline for Video Segmentation (CVPR-2022, oral) 
## [Paper](https://arxiv.org/abs/2204.04656), [Slides](./slides/Video-KNet-cvpr-slides-10-25-version.pptx), [Poster](./slides/cvpr22_poster_lxt_zww_pjm.pdf), [Video](https://www.youtube.com/watch?v=LIEyp_czu20&t=3s)

[Xiangtai Li](https://lxtgh.github.io/),
[Wenwei Zhang](https://zhangwenwei.cn/),
[Jiangmiao Pang](https://oceanpang.github.io/),
[Kai Chen](https://chenkai.site/), 
[Guangliang Cheng](https://scholar.google.com/citations?user=FToOC-wAAAAJ),
[Yunhai Tong](https://scholar.google.com/citations?user=T4gqdPkAAAAJ&hl=zh-CN),
[Chen Change Loy](https://www.mmlab-ntu.com/person/ccloy/).

We introduce Video K-Net, a simple, strong, and unified framework for fully end-to-end dense video segmentation. 

The method is built upon K-Net, a method of unifying image segmentation via a group of learnable kernels.

This project contains the training and testing code of Video K-Net for VPS (Video Panoptic Segmentation),
VSS (Video Semantic Segmentation), and VIS (Video Instance Segmentation).

To the best of our knowledge, our Video K-Net is the first open-sourced method that supports three different video segmentation tasks (VIS, VPS, VSS) for Video Scene Understanding.

## News! Video K-Net is acknowledged as a strong baseline for CVPR-2023 workshop ["The 2nd Pixel-level Video Understanding in the Wild"](https://www.vspwdataset.com/Workshop%202023.html). 
## News! Video K-Net also supports [VIP-Seg](https://github.com/VIPSeg-Dataset/VIPSeg-Dataset) dataset(CVPR-2022). It also achieves the new state-of-the-art result.


### Environment and DataSet Preparation 
Our codebase is based on MMDetection and MMSegmentation. Parts of the code are borrowed from MMTracking and UniTrack.

- MIM >= 0.1.1
- MMCV-full >= v1.3.8
- MMDetection == v2.18.0
- timm
- scipy
- panopticapi

See the [DATASET.md](https://github.com/lxtGH/Video-K-Net/blob/main/DATASET.md)

knet folder contains the Video K-Net for VPS.

knet_vis folder contains the Video K-Net for VIS.


### Pretrained CKPTs and Trained Models

We provide the pretrained models for VPS and VIS.

Baidu Yun Link: [here](https://pan.baidu.com/s/12dIinkAF3o60fcAoggVhjQ)  Code:i034

One Drive Link: [here](https://1drv.ms/u/s!Ai4mxaXd6lVBgSCTUS0QWNim2zGx?e=uceSee)

The pretrained models are provided to train the Video K-Net.

The trained models are also provided for play and test.


### [VPS] KITTI-STEP

1. First pretrain K-Net on the Cityscapes-STEP dataset. As shown in the original STEP paper (Appendix) and our own experimental results, this step is very important for improving the segmentation performance.
You can also use our trained model for verification.

Cityscape-STEP follows the format of STEP: 17 stuff classes and 2 thing classes. 

```bash
# train cityscapes step panoptic segmentation models
sh ./tools/slurm_train.sh $PARTITION knet_step configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py $WORK_DIR --no-validate
```

2. Then train the Video K-Net on KITTI-STEP. We have provided the pretrained models from Cityscapes of Video K-Net.

For slurm users:

```bash
# train Video K-Net on KITTI-step using R-50
GPUS=8 sh ./tools/slurm_train.sh $PARTITION video_knet_step configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py $WORK_DIR --no-validate --load-from /path_to_knet_step_city_r50
```

```bash
# train Video K-Net on KITTI-step using Swin-base
GPUS=16 GPUS_PER_NODE=8 sh ./tools/slurm_train.sh $PARTITION video_knet_step configs/det/video_knet_kitti_step/video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py $WORK_DIR --no-validate --load-from /path_to_knet_step_city_r50
```

Our models are trained with two V100 machines. 

For Local machine:

```bash
# train Video K-Net on KITTI-step with 8 GPUs
sh ./tools/dist_train.sh video_knet_step configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py 8 $WORK_DIR --no-validate
```


3. Testing and Demo.

We provide both VPQ and STQ metrics to evaluate VPS models. 

```bash
# test locally 
sh ./tools/dist_step_test.sh configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py $MODEL_DIR 
```

We also dump the colored images for debug.

```bash
# eval STEP STQ
python tools/eval_dstq_step.py result_path gt_path
```

```bash
# eval STEP VPQ
python tools/eval_dvpq_step.py result_path gt_path
```

#### Toy Video K-Net 

As shown in the paper, we also provide a toy Video K-Net in knet/video/knet_quansi_dense_embed_fc_toy_exp.py. 
You can use the K-Net pre-trained on image-level KITTI-STEP without tracking.


### [VIS] YouTube-VIS-2019

1. First download the pre-trained image K-Net instance segmentation models. All the models are pretrained on COCO, which is
a common practice. You can also pretrain them yourself. We also provide the config for pretraining.

For slurm users:

```bash
# train K-Net instance segmentation models on COCO using R-50
GPUS=8 sh ./tools/slurm_train.sh $PARTITION knet_instance configs/det/coco/knet_s3_r50_fpn_ms-3x_coco.py $WORK_DIR 
```

2. Then train the Video K-Net in a clip-wise manner. 

```bash
# train Video K-Net VIS models using R-50
GPUS=8 sh ./tools/slurm_train.sh $PARTITION video_knet_vis configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py $WORK_DIR --load-from /path_to_knet_instance_coco
```

3. To evaluate the results of Video K-Net on VIS, dump the prediction results for submission to the CodaLab server. 

```bash
# test Video K-Net VIS models using R-50
GPUS=8 sh tools_vis/dist_test_whole_video.sh $PARTITION video_knet_vis configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py $WORK_DIR --format-only
```
The result json is dumped into the root of this codebase. 

### [VPS] VIP-Seg

1. First Download the pre-trained Image K-Net panoptic segmentation models. All the models are pretrained on COCO which is
a common step following VIP-Seg. You can also pretrain it by yourself. We also provide the config for pretraining.
```bash
# train K-Net on COCO Panoptic Segmentation
GPUS=8 sh ./tools/slurm_train.sh $PARTITION knet_coco configs/det/coco/knet_s3_r50_fpn_ms-3x_coco-panoptic.py $WORK_DIR 
```

2. Train the Video K-Net on the VIP-Seg dataset. 
```bash
# train Video K-Net on VIP-Seg
GPUS=8 sh ./tools/slurm_train.sh $PARTITION video_knet_vis configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py $WORK_DIR --load-from /path/knet_coco_pretrained_r50
```

3. Test the Video K-Net on VIP-Seg val dataset.
```bash
# test locally on VIP-Seg
sh ./tools/dist_step_test.sh configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py $MODEL_DIR 
```

We also dump the colored images for debug.

```bash
# eval STEP STQ
python tools/eval_dstq_vipseg.py result_path gt_path
```

```bash
# eval STEP VPQ
python tools/eval_dvpq_vipseg.py result_path gt_path
```


## Visualization Results


### Results on KITTI-STEP DataSet


### Results on VIP-Seg DataSet


### Results on YouTube-VIS DataSet


### Short term segmentation and tracking results on Cityscapes VPS dataset.

images(left), Video K-Net(middle), Ground Truth 
![Alt Text](./figs/cityscapes_vps_video_1_20220318131729.gif)

![Alt Text](./figs/cityscapes_vps_video_2_20220318132943.gif)

### Long term segmentation and tracking results on STEP dataset.

![Alt Text](./figs/step_video_1_20220318133227.gif)

![Alt Text](./figs/step_video_2_20220318133423.gif)


## Related Project and Acknowledgement
## Citing Video K-Net :pray:

If you use our codebase in your research or for the CVPR-2023 pixel-level video workshop, please cite it using the following BibTeX entries.

NIPS-2021, K-Net: Unified Segmentation: Our Image baseline (https://github.com/ZwwWayne/K-Net)

ECCV-2022, PolyphonicFormer: A Unified Framework For Panoptic Segmentation + Depth Estimation (winner of ICCV-2021 BMTT workshop)
(https://github.com/HarborYuan/PolyphonicFormer)

```bibtex
@inproceedings{li2022videoknet,
  title={Video k-net: A simple, strong, and unified baseline for video segmentation},
  author={Li, Xiangtai and Zhang, Wenwei and Pang, Jiangmiao and Chen, Kai and Cheng, Guangliang and Tong, Yunhai and Loy, Chen Change},
  booktitle={CVPR},
  year={2022}
}

@article{zhang2021k,
  title={K-net: Towards unified image segmentation},
  author={Zhang, Wenwei and Pang, Jiangmiao and Chen, Kai and Loy, Chen Change},
  journal={NeurIPS},
  year={2021}
}
```


================================================
FILE: configs/det/_base_/datasets/cityscapes_panoptic.py
================================================
# Dataset settings for Cityscapes panoptic segmentation.
dataset_type = 'CityscapesPanopticDataset'
data_root = 'data/cityscapes/'

# Normalisation statistics; unpacked into every `Normalize` step below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training transforms: load image + bbox/mask/seg annotations, resize in
# 'range' mode between the two listed scales, random flip, normalise, pad.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'LoadAnnotations',
        'with_bbox': True,
        'with_mask': True,
        'with_seg': True,
    },
    {
        'type': 'Resize',
        'img_scale': [(2048, 800), (2048, 1024)],
        'multiscale_mode': 'range',
        'keep_ratio': True,
    },
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': [
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
        ],
    },
]


# Test transforms: a single (2048, 1024) scale with flipping disabled.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (2048, 1024),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# The train split is wrapped in `RepeatDataset` (times=8); `val` and `test`
# are configured with the same validation annotations and images.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 8,
        'dataset': {
            'type': dataset_type,
            'ann_file': {
                'ins_ann': data_root
                + 'annotations/instancesonly_filtered_gtFine_train.json',
                'panoptic_ann': data_root
                + 'annotations/cityscapes_panoptic_train.json',
            },
            'img_prefix': data_root + 'leftImg8bit/train/',
            'seg_prefix': data_root + 'gtFine/train',
            'pipeline': train_pipeline,
        },
    },
    'val': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': data_root
            + 'annotations/instancesonly_filtered_gtFine_val.json',
            'panoptic_ann': data_root
            + 'annotations/cityscapes_panoptic_val.json',
        },
        'img_prefix': data_root + 'leftImg8bit/val/',
        'seg_prefix': data_root + 'gtFine/cityscapes_panoptic_val',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': data_root
            + 'annotations/instancesonly_filtered_gtFine_val.json',
            'panoptic_ann': data_root
            + 'annotations/cityscapes_panoptic_val.json',
        },
        'img_prefix': data_root + 'leftImg8bit/val/',
        'seg_prefix': data_root + 'gtFine/cityscapes_panoptic_val',
        'pipeline': test_pipeline,
    },
}

evaluation = {'metric': ['panoptic']}


================================================
FILE: configs/det/_base_/datasets/cityscapes_step.py
================================================
# Dataset settings for Cityscapes-STEP.
dataset_type = 'CityscapesSTEP'
data_root = 'data/cityscapes'

# Normalisation statistics; unpacked into every `Normalize` step below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training transforms. The annotation loader and the K-Net adapter are both
# configured with the same `cherry` list ([11, 13]).
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotationsInstanceMasks', 'cherry': [11, 13]},
    {'type': 'KNetInsAdapterCherryPick', 'stuff_nums': 11, 'cherry': [11, 13]},
    {
        'type': 'Resize',
        'img_scale': (1024, 2048),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'RandomCrop', 'crop_size': (1024, 2048)},
    {'type': 'Normalize', **img_norm_cfg},
    {
        'type': 'PadFutureMMDet',
        'size_divisor': 32,
        # Separate pad values for the image, the masks and the seg map.
        'pad_val': {'img': 0, 'masks': 0, 'seg': 255},
    },
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': ['img', 'gt_masks', 'gt_labels', 'gt_semantic_seg'],
        'meta_keys': (
            'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip',
            'flip_direction', 'img_norm_cfg',
        ),
    },
]

# Test transforms: scale factor 1.0 only, flipping disabled.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img'],
                'meta_keys': [
                    'ori_shape', 'img_shape', 'pad_shape', 'scale_factor',
                    'flip', 'flip_direction', 'img_norm_cfg',
                ],
            },
        ],
    },
]

# The train split is wrapped in `RepeatDataset` (times=8); `val` and `test`
# both use the 'val' split in test mode.
data = {
    'samples_per_gpu': 4,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 8,
        'dataset': {
            'type': dataset_type,
            'data_root': data_root,
            'split': 'train',
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'test_mode': True,
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'test_mode': True,
        'pipeline': test_pipeline,
    },
}

# No evaluation options are set in this base config.
evaluation = {}


================================================
FILE: configs/det/_base_/datasets/cityscapes_vps_clips.py
================================================
# Dataset settings for Cityscapes-VPS clips.
# Training uses `dataset_type`; val/test use `dataset_type_test`.
dataset_type = 'CityscapesVPSDataset'
data_root = 'data/cityscapes_vps/'
dataset_type_test = 'CityscapesPanopticDataset'

# Normalisation statistics; unpacked into every normalise step below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training transforms built from the `Seq*` / video pipeline steps.
train_pipeline = [
    {'type': 'LoadMultiImagesFromFile'},
    {
        'type': 'SeqLoadAnnotations',
        'with_bbox': True,
        'with_mask': True,
        'with_seg': True,
    },
    {
        'type': 'SeqResize',
        'img_scale': [(512, 1024), (2048, 4096)],
        'multiscale_mode': 'range',
        'keep_ratio': True,
    },
    {'type': 'SeqRandomFlip', 'share_params': True, 'flip_ratio': 0.5},
    {'type': 'SeqRandomCrop', 'crop_size': (1024, 1024), 'share_params': True},
    {'type': 'SeqNormalize', **img_norm_cfg},
    {'type': 'SeqPad', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]


# Test transforms: one (2048, 1024) scale, flipping disabled; both `img`
# and `ref_img` are converted to tensors and collected.
test_pipeline = [
    {'type': 'LoadRefImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': [(2048, 1024)],
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img', 'ref_img']},
            {'type': 'Collect', 'keys': ['img', 'ref_img']},
        ],
    },
]

data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 8,
        'dataset': {
            'type': dataset_type,
            'ann_file': {
                'ins_ann': data_root + 'instances_train_city_vps_rle.json',
                'panoptic_ann': data_root + 'panoptic_im_train_city_vps.json',
            },
            'img_prefix': data_root + 'train/img/',
            'seg_prefix': data_root + 'train/labelmap/',
            'pipeline': train_pipeline,
            'offsets': [-1, +1],
        },
    },
    'val': {
        'type': dataset_type_test,
        'ann_file': {
            'ins_ann': data_root + 'instances_val_city_vps_rle.json',
            'panoptic_ann': data_root + 'panoptic_gt_val_city_vps.json',
            'vps': True,
        },
        'img_prefix': data_root + 'val/img/',
        'seg_prefix': data_root + 'val/panoptic_video/',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type_test,
        'ann_file': {
            'ins_ann': data_root + 'instances_val_city_vps_rle.json',
            'panoptic_ann': data_root + 'panoptic_gt_val_city_vps.json',
            'vps': True,
        },
        # img for validation
        'img_prefix': data_root + 'val/img_all/',
        # ref_images
        'ref_prefix': data_root + 'val/img_all/',
        'nframes_span_test': 30,
        'pipeline': test_pipeline,
    },
}

evaluation = {'metric': ['panoptic']}

================================================
FILE: configs/det/_base_/datasets/cityscapes_vps_clips_trainval.py
================================================
# Dataset settings that train on both the train and val annotation files
# under data/cityscapes_vps/.
dataset_type = 'CityscapesVPSDataset'
data_root = 'data/cityscapes_vps/'
# NOTE(review): dataset_type_test is not referenced anywhere below in this
# file; `val` is built with dataset_type -- confirm that this is intended.
dataset_type_test = "CityscapesPanopticDataset"

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training pipeline; flip and crop steps are configured with
# share_params=True.
train_pipeline = [
    {'type': 'LoadMultiImagesFromFile'},
    {
        'type': 'SeqLoadAnnotations',
        'with_bbox': True,
        'with_mask': True,
        'with_seg': True,
    },
    {
        'type': 'SeqResize',
        'img_scale': [(512, 1024), (2048, 4096)],
        'multiscale_mode': 'range',
        'keep_ratio': True,
    },
    {'type': 'SeqRandomFlip', 'share_params': True, 'flip_ratio': 0.5},
    {'type': 'SeqRandomCrop', 'crop_size': (1024, 2048), 'share_params': True},
    {'type': 'SeqNormalize', **img_norm_cfg},
    {'type': 'SeqPad', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Inference pipeline: one img_scale of (2048, 1024), flipping disabled.
test_pipeline = [
    {'type': 'LoadRefImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': [(2048, 1024)],
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img', 'ref_img']},
            {'type': 'Collect', 'keys': ['img', 'ref_img']},
        ],
    },
]

# No `test` entry is declared in this file.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 8,
        'dataset': {
            'type': 'ConcatDataset',
            'separate_eval': False,
            'datasets': [
                # train annotations
                {
                    'type': dataset_type,
                    'ann_file': {
                        'ins_ann': f'{data_root}instances_train_city_vps_rle.json',
                        'panoptic_ann': f'{data_root}panoptic_im_train_city_vps.json',
                    },
                    'img_prefix': f'{data_root}train/img/',
                    'seg_prefix': f'{data_root}train/labelmap/',
                    'pipeline': train_pipeline,
                    'offsets': [-1, 1],
                },
                # val annotations, also used for training
                {
                    'type': dataset_type,
                    'ann_file': {
                        'ins_ann': f'{data_root}instances_val_city_vps_rle.json',
                        'panoptic_ann': f'{data_root}panoptic_gt_val_city_vps.json',
                    },
                    'img_prefix': f'{data_root}val/img/',
                    'seg_prefix': f'{data_root}val/labelmap/',
                    'pipeline': train_pipeline,
                    'offsets': [-1, 1],
                },
            ],
        },
    },
    'val': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}instances_val_city_vps_rle.json',
            'panoptic_ann': f'{data_root}panoptic_gt_val_city_vps.json',
            'vps': True,
        },
        # images used for validation
        'img_prefix': f'{data_root}val/img_all/',
        # reference images
        'ref_prefix': f'{data_root}val/img_all/',
        'nframes_span_test': 30,
        'pipeline': test_pipeline,
    },
}

evaluation = {'metric': ['panoptic']}

================================================
FILE: configs/det/_base_/datasets/coco_instance.py
================================================
# COCO dataset settings with box and mask annotations loaded for training.
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True, 'with_mask': True},
    {'type': 'Resize', 'img_scale': (1333, 800), 'keep_ratio': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_bboxes', 'gt_labels', 'gt_masks']},
]

# Inference pipeline: one img_scale of (1333, 800), flipping disabled.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (1333, 800),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# `val` and `test` both point at the val2017 annotations and images.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'ann_file': f'{data_root}annotations/instances_train2017.json',
        'img_prefix': f'{data_root}train2017/',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': f'{data_root}annotations/instances_val2017.json',
        'img_prefix': f'{data_root}val2017/',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': f'{data_root}annotations/instances_val2017.json',
        'img_prefix': f'{data_root}val2017/',
        'pipeline': test_pipeline,
    },
}
# we do not evaluate bbox because K-Net does not predict bounding boxes
evaluation = {'metric': ['segm']}


================================================
FILE: configs/det/_base_/datasets/coco_panoptic.py
================================================
# COCO panoptic dataset settings: instance and panoptic annotation files are
# supplied together through a two-key `ann_file` dict.
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'LoadAnnotations',
        'with_bbox': True,
        'with_mask': True,
        'with_seg': True,
    },
    {'type': 'Resize', 'img_scale': (1333, 800), 'keep_ratio': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': ['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg'],
    },
]

# Inference pipeline: one img_scale of (1333, 800); MultiScaleFlipAug is
# configured with flip=False.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (1333, 800),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip', 'flip_ratio': 0.5},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# `val` and `test` share the same val2017 annotation files and prefixes.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}annotations/instances_train2017.json',
            'panoptic_ann': f'{data_root}annotations/panoptic_train2017.json',
        },
        'img_prefix': f'{data_root}train2017/',
        'seg_prefix': f'{data_root}panoptic_stuff_train2017/',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}annotations/instances_val2017.json',
            'panoptic_ann': f'{data_root}annotations/panoptic_val2017.json',
        },
        'seg_prefix': f'{data_root}panoptic_val2017/',
        'img_prefix': f'{data_root}val2017/',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}annotations/instances_val2017.json',
            'panoptic_ann': f'{data_root}annotations/panoptic_val2017.json',
        },
        'seg_prefix': f'{data_root}panoptic_val2017/',
        'img_prefix': f'{data_root}val2017/',
        'pipeline': test_pipeline,
    },
}

evaluation = {'metric': ['segm', 'panoptic']}


================================================
FILE: configs/det/_base_/datasets/coco_panoptic_instance_annotations.py
================================================
# COCO panoptic dataset settings; each split's `ann_file` carries both an
# instance annotation file and a panoptic annotation file.
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'LoadAnnotations',
        'with_bbox': True,
        'with_mask': True,
        'with_seg': True,
    },
    {'type': 'Resize', 'img_scale': (1333, 800), 'keep_ratio': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': ['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg'],
    },
]

# Inference pipeline: one img_scale of (1333, 800); MultiScaleFlipAug is
# configured with flip=False.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (1333, 800),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip', 'flip_ratio': 0.5},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# `val` and `test` are configured identically on the val2017 data.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}annotations/instances_train2017.json',
            'panoptic_ann': f'{data_root}annotations/panoptic_train2017.json',
        },
        'img_prefix': f'{data_root}train2017/',
        'seg_prefix': f'{data_root}panoptic_stuff_train2017/',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}annotations/instances_val2017.json',
            'panoptic_ann': f'{data_root}annotations/panoptic_val2017.json',
        },
        'seg_prefix': f'{data_root}panoptic_val2017/',
        'img_prefix': f'{data_root}val2017/',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}annotations/instances_val2017.json',
            'panoptic_ann': f'{data_root}annotations/panoptic_val2017.json',
        },
        'seg_prefix': f'{data_root}panoptic_val2017/',
        'img_prefix': f'{data_root}val2017/',
        'pipeline': test_pipeline,
    },
}

evaluation = {'metric': ['segm', 'panoptic']}


================================================
FILE: configs/det/_base_/datasets/kitti_step_dvps.py
================================================
# KITTI-STEP dataset settings with depth enabled (with_depth=True on every
# split and in the annotation loader).
dataset_type = 'KITTISTEPDVPSDataset'
data_root = 'data/kitti-step'

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# The kitti dataset contains 1226 x 370 and 1241 x 376
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': True,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    # Resize step is disabled in this config:
    # dict(type='SeqResizeWithDepth', img_scale=(370, 1226), ratio_range=[1.0, 2.0], keep_ratio=True),
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    # Random-crop step is disabled in this config:
    # dict(type='SeqRandomCropWithDepth', crop_size=(352, 1024), share_params=True),
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_depth',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Inference pipeline: scale_factor 1.0 only, flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                ],
            },
        ],
    },
]

# `val` and `test` both read the 'val' split in test mode.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 4,
        'dataset': {
            'type': dataset_type,
            'data_root': data_root,
            'split': 'train',
            'ref_seq_index': None,
            'test_mode': False,
            'pipeline': train_pipeline,
            'with_depth': True,
        },
    },
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'with_depth': True,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'with_depth': True,
    },
}

# No evaluation options are set here.
evaluation = {}


================================================
FILE: configs/det/_base_/datasets/kitti_step_vps.py
================================================
# KITTI-STEP dataset settings without depth (with_depth=False everywhere).
dataset_type = 'KITTISTEPDVPSDataset'
data_root = 'data/kitti-step'

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# The kitti dataset contains 1226 x 370 and 1241 x 376
# 384 x 1248 is the minimum size that is 32-divisible
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Inference pipeline: scale_factor 1.0 only, flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                    'filename',
                ],
            },
        ],
    },
]

# `val` and `test` both read the 'val' split in test mode.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 4,
        'dataset': {
            'type': dataset_type,
            'data_root': data_root,
            'split': 'train',
            'ref_seq_index': None,
            'test_mode': False,
            'pipeline': train_pipeline,
            'with_depth': False,
        },
    },
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'with_depth': False,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'with_depth': False,
    },
}

# No evaluation options are set here.
evaluation = {}


================================================
FILE: configs/det/_base_/datasets/kitti_step_vps_trainval.py
================================================
# KITTI-STEP dataset settings that train on the 'train' and 'val' splits
# concatenated; depth is disabled (with_depth=False everywhere).
dataset_type = 'KITTISTEPDVPSDataset'
data_root = 'data/kitti-step'

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# The kitti dataset contains 1226 x 370 and 1241 x 376
# 384 x 1248 is the minimum size that is 32-divisible
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Inference pipeline: scale_factor 1.0 only, flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                    'filename',
                ],
            },
        ],
    },
]

data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 4,
        'dataset': {
            'type': 'ConcatDataset',
            'separate_eval': False,
            'datasets': [
                # 'train' split
                {
                    'type': dataset_type,
                    'data_root': data_root,
                    'split': 'train',
                    'ref_seq_index': None,
                    'test_mode': False,
                    'pipeline': train_pipeline,
                    'with_depth': False,
                },
                # 'val' split, loaded with the training pipeline
                {
                    'type': dataset_type,
                    'data_root': data_root,
                    'split': 'val',
                    'ref_seq_index': None,
                    'test_mode': False,
                    'pipeline': train_pipeline,
                    'with_depth': False,
                },
            ],
        },
    },
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'with_depth': False,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'with_depth': False,
    },
}

# No evaluation options are set here.
evaluation = {}


================================================
FILE: configs/det/_base_/datasets/mapillary_panoptic.py
================================================
# Mapillary panoptic dataset settings rooted at data/mapillary/.
dataset_type = 'MapillaryPanopticDataset'
data_root = 'data/mapillary/'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training pipeline: range-mode multi-scale resize, then a (1024, 1024)
# random crop before flipping.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'LoadAnnotations',
        'with_bbox': True,
        'with_mask': True,
        'with_seg': True,
    },
    {
        'type': 'Resize',
        'img_scale': [(1024, 4096), (2048, 4096)],
        'multiscale_mode': 'range',
        'keep_ratio': True,
    },
    {'type': 'RandomCrop', 'crop_size': (1024, 1024)},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': ['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg'],
    },
]

# Inference pipeline: one img_scale of (2048, 4096); MultiScaleFlipAug is
# configured with flip=False.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (2048, 4096),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip', 'flip_ratio': 0.5},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# `val` and `test` both use the validation annotations and images.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}annotations/coco/training.json',
            'panoptic_ann': f'{data_root}annotations/panoptic_train.json',
        },
        'img_prefix': f'{data_root}training/images',
        'seg_prefix': f'{data_root}training/panoptic_stuff_train',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}annotations/coco/validation.json',
            'panoptic_ann': f'{data_root}annotations/panoptic_val.json',
        },
        'seg_prefix': f'{data_root}validation/panoptic',
        'img_prefix': f'{data_root}validation/images',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': f'{data_root}annotations/coco/validation.json',
            'panoptic_ann': f'{data_root}annotations/panoptic_val.json',
        },
        'seg_prefix': f'{data_root}validation/panoptic',
        'img_prefix': f'{data_root}validation/images',
        'pipeline': test_pipeline,
    },
}

evaluation = {'metric': ['segm', 'panoptic']}


================================================
FILE: configs/det/_base_/datasets/vipseg_dvps.py
================================================
# VIPSeg dataset settings rooted at data/VIPSeg; depth loading is disabled.
dataset_type = 'VIPSegDVPSDataset'
data_root = 'data/VIPSeg'

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Crop size used by the random-crop step of the training pipeline.
crop_size = (736, 736)

# Note the step order in this pipeline: padding runs before normalization.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {'type': 'LoadMultiAnnotationsDirect', 'with_depth': False, 'vipseg': True},
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (720, 100000),
        'ratio_range': [1., 2.],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': crop_size,
        'share_params': True,
    },
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {'type': 'SeqNormalize', **img_norm_cfg},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]


# Inference pipeline: scale_factor 1.0 only, flipping disabled; 'img_id' and
# 'seq_id' are collected alongside the image.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                    'filename',
                ],
            },
        ],
    },
]

# `val` and `test` both read the 'val' split in test mode.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 1,
        'dataset': {
            'type': dataset_type,
            'data_root': data_root,
            'test_mode': False,
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'is_instance_only': True,
            'pipeline': train_pipeline,
        },
    },
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'split': 'val',
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
    },
}

# No evaluation options are set here.
evaluation = {}


================================================
FILE: configs/det/_base_/default_runtime.py
================================================
# Shared runtime defaults: checkpoint and logging intervals, distributed
# backend, and the workflow list.
checkpoint_config = {'interval': 1}
# yapf:disable
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
    ],
}
# yapf:enable

dist_params = {'backend': 'nccl'}
log_level = 'INFO'
# No checkpoint is loaded or resumed by default.
load_from = None
resume_from = None
workflow = [('train', 1)]


================================================
FILE: configs/det/_base_/models/knet_citystep_s3_r50_fpn.py
================================================
# K-Net model settings: ResNet-50 backbone, FPN neck, a ConvKernelHead and
# `num_stages` KernelUpdateHead stages, with 2 thing and 17 stuff classes.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

model = {
    'type': 'KNet',
    'cityscapes': False,
    'kitti_step': True,
    'num_thing_classes': 2,
    'num_stuff_classes': 17,
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    'neck': {
        'type': 'FPN',
        'in_channels': [256, 512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_input',
        'num_outs': 4,
    },
    'rpn_head': {
        'type': 'ConvKernelHead',
        'num_classes': 19,
        'num_thing_classes': 2,
        'num_stuff_classes': 17,
        'cat_stuff_mask': True,
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {'type': 'GN', 'num_groups': 32, 'requires_grad': True},
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'feat_transform_cfg': None,
        'loss_rank': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 0.1,
        },
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    'roi_head': {
        'type': 'KernelIterHead',
        'num_thing_classes': 2,
        'num_stuff_classes': 17,
        'do_panoptic': True,
        'num_stages': num_stages,
        'stage_loss_weights': [1] * num_stages,
        'proposal_feature_channel': 256,
        # One identically configured head per stage; each is a separate dict.
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': 19,
                'num_thing_classes': 2,
                'num_stuff_classes': 17,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_rank': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': False,
                    'loss_weight': 0.1,
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # training and testing settings
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {'type': 'DiceCost', 'weight': 4.0, 'pred_act': True},
                'mask_cost': {'type': 'MaskCost', 'weight': 1.0, 'pred_act': True},
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        # One assigner/sampler entry per stage, all configured alike.
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {'type': 'DiceCost', 'weight': 4.0, 'pred_act': True},
                    'mask_cost': {'type': 'MaskCost', 'weight': 1.0, 'pred_act': True},
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _ in range(num_stages)
        ],
    },
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.25,
            },
        },
    },
}

# Modules listed for import by the config loader; failed imports are not
# allowed.
custom_imports = {
    'imports': [
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        'external.cityscapes_step',
        'external.dataset.pipelines.transforms',
        'external.dataset.pipelines.loading',
    ],
    'allow_failed_imports': False,
}


================================================
FILE: configs/det/_base_/models/knet_kitti_step_s3_r50_fpn.py
================================================
# Shared scalars referenced by the `model` dict below.
# num_stages is passed to roi_head (num_stages, stage_loss_weights).
num_stages = 3
# num_proposals is passed to rpn_head.
num_proposals = 100
# conv_kernel_size is passed to rpn_head.
conv_kernel_size = 1

model = dict(
    type='KNet',
    cityscapes=False,
    kitti_step=True,
    num_thing_classes=2,
    num_stuff_classes=17,
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4
    ),
    rpn_head=dict(
        type='ConvKernelHead',
        num_classes=19,
        num_thing_classes=2,
        num_stuff_classes=17,
        cat_stuff_mask=True,
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128, normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        feat_transform_cfg=None,
        loss_rank=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=0.1),
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    roi_head=dict(
        type='KernelIterHead',
        num_thing_classes=2,
        num_stuff_classes=17,
        do_panoptic=True,
        num_stages=num_stages,
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=19,
                num_thing_classes=2,
                num_stuff_classes=17,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'),
                    act_cfg=None
                ),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_rank=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=0.1
                ),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0
                ),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0
                ),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0))
            for _ in range(num_stages)
        ]
    ),
    # training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1),
        rcnn=[
            dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(type='MaskCost', weight=1.0,
                                   pred_act=True)),
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1)

            for _ in range(num_stages)
        ]),
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            stuff_score_thr=0.05,
            merge_stuff_thing=dict(
                overlap_thr=0.6,
                iou_thr=0.5,
                stuff_max_area=4096,
                instance_score_thr=0.25
            )
        )
    )
)

# Modules to import when this config is loaded. With
# allow_failed_imports=False a module that cannot be imported is presumably
# treated as an error (mmcv semantics — confirm against the installed version).
#
# Entries that were previously listed but are currently disabled:
#   knet.video.knet_track, knet.video.knet_track_head,
#   knet.video.knet_quansi_dense_roi,
#   knet.video.knet_quansi_dense_embed_fc_with_appearance,
#   knet.video.knet_quansi_dense_embed_fc_toy_exp
custom_imports = {
    'imports': [
        # knet.det.* and knet.* modules
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        # swin.* modules
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        # external.* dataset and pipeline modules
        'external.cityscapes_step',
        'external.kitti_step_dvps',
        'external.dataset.dvps_pipelines.transforms',
        'external.dataset.dvps_pipelines.loading',
        'external.dataset.dvps_pipelines.tricks',
        'external.dataset.pipelines.formatting',
        # knet.video.* modules
        'knet.video.track_heads',
        'knet.video.kernel_head',
        'knet.video.kernel_iter_head',
        'knet.video.kernel_update_head',
        'knet.video.knet_uni_track',
        'knet.video.knet_quansi_dense',
        'knet.video.knet_quansi_dense_roi_gt_box',
        'knet.video.knet_quansi_dense_embed_fc',
        'knet.video.knet_quansi_dense_embed_fc_joint_train',
        'knet.video.knet_quansi_dense_roi_gt_box_joint_train',
        # knet.video.qdtrack.* losses and tracker
        'knet.video.qdtrack.losses.l2_loss',
        'knet.video.qdtrack.losses.multipos_cross_entropy_loss',
        'knet.video.qdtrack.trackers.quasi_dense_embed_tracker',
    ],
    'allow_failed_imports': False,
}


================================================
FILE: configs/det/_base_/models/knet_s3_r50_deformable_fpn.py
================================================
# Values reused by several entries of ``model`` below.
num_stages = 3  # length of roi_head.mask_head and of train_cfg.rcnn
num_proposals = 100  # rpn_head.num_proposals and test_cfg.rcnn.max_per_img
conv_kernel_size = 1  # passed to rpn_head and to every mask head

# KNet with a ResNet-50 backbone and an MSDeformAttnPixelDecoder neck;
# both heads are configured for 80 classes.
model = {
    'type': 'KNet',
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    'neck': {
        'type': 'MSDeformAttnPixelDecoder',
        'num_outs': 3,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
        'act_cfg': {'type': 'ReLU'},
        'return_one_list': True,
        'encoder': {
            'type': 'DetrTransformerEncoder',
            'num_layers': 6,
            'transformerlayers': {
                'type': 'BaseTransformerLayer',
                'attn_cfgs': {
                    'type': 'MultiScaleDeformableAttention',
                    'embed_dims': 256,
                    'num_heads': 8,
                    'num_levels': 3,
                    'num_points': 4,
                    'im2col_step': 64,
                    'dropout': 0.0,
                    'batch_first': False,
                    'norm_cfg': None,
                    'init_cfg': None,
                },
                'ffn_cfgs': {
                    'type': 'FFN',
                    'embed_dims': 256,
                    'feedforward_channels': 1024,
                    'num_fcs': 2,
                    'ffn_drop': 0.0,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                },
                'operation_order': ('self_attn', 'norm', 'ffn', 'norm'),
            },
            'init_cfg': None,
        },
        'positional_encoding': {
            'type': 'SinePositionalEncoding',
            'num_feats': 128,
            'normalize': True,
        },
        'init_cfg': None,
    },
    'rpn_head': {
        'type': 'ConvKernelHead',
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'num_classes': 80,
        'feat_transform_cfg': None,
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    'roi_head': {
        'type': 'KernelIterHead',
        'num_stages': num_stages,
        # one weight of 1 per stage
        'stage_loss_weights': [1 for _stage in range(num_stages)],
        'proposal_feature_channel': 256,
        # one identically configured KernelUpdateHead per stage
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': 80,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _stage in range(num_stages)
        ],
    },
    # training settings: the rpn entry and each rcnn stage use the same
    # assigner costs and sampler
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _stage in range(num_stages)
        ],
    },
    # testing settings
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'merge_stuff_thing': {
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Modules to import when this config is loaded. With
# allow_failed_imports=False a module that cannot be imported is presumably
# treated as an error (mmcv semantics — confirm against the installed version).
custom_imports = {
    'imports': [
        # knet.* modules, including the deformable-attention decoder module
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.kernel_updator',
        'knet.det.msdeformattn_decoder',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        # external.* and swin.* modules
        'external.coco_panoptic',
        'swin.swin_transformer',
    ],
    'allow_failed_imports': False,
}


================================================
FILE: configs/det/_base_/models/knet_s3_r50_fpn.py
================================================
# Values reused by several entries of ``model`` below.
num_stages = 3  # length of roi_head.mask_head and of train_cfg.rcnn
num_proposals = 100  # rpn_head.num_proposals and test_cfg.rcnn.max_per_img
conv_kernel_size = 1  # passed to rpn_head and to every mask head

# KNet with a ResNet-50 backbone and an FPN neck; both heads are configured
# for 80 classes.
model = {
    'type': 'KNet',
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    'neck': {
        'type': 'FPN',
        'in_channels': [256, 512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_input',
        'num_outs': 4,
    },
    'rpn_head': {
        'type': 'ConvKernelHead',
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'num_classes': 80,
        'feat_transform_cfg': None,
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    'roi_head': {
        'type': 'KernelIterHead',
        'num_stages': num_stages,
        # one weight of 1 per stage
        'stage_loss_weights': [1 for _stage in range(num_stages)],
        'proposal_feature_channel': 256,
        # one identically configured KernelUpdateHead per stage
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': 80,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _stage in range(num_stages)
        ],
    },
    # training settings: the rpn entry and each rcnn stage use the same
    # assigner costs and sampler
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _stage in range(num_stages)
        ],
    },
    # testing settings
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'merge_stuff_thing': {
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Modules to import when this config is loaded. With
# allow_failed_imports=False a module that cannot be imported is presumably
# treated as an error (mmcv semantics — confirm against the installed version).
# NOTE(review): the other base model configs in this directory import
# `external.coco_panoptic`; confirm that `panoptic_fpn.coco_panoptic` is the
# module intended here.
custom_imports = {
    'imports': [
        # knet.* modules
        'knet.det.knet',
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.kernel_updator',
        'knet.det.msdeformattn_decoder',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        # panoptic_fpn.* module
        'panoptic_fpn.coco_panoptic',
    ],
    'allow_failed_imports': False,
}


================================================
FILE: configs/det/_base_/models/knet_s3_r50_fpn_panoptic.py
================================================
# Values reused by several entries of ``model`` below.
num_stages = 3  # length of roi_head.mask_head and of train_cfg.rcnn
num_proposals = 100  # rpn_head.num_proposals and test_cfg.rcnn.max_per_img
conv_kernel_size = 1  # passed to rpn_head and to every mask head

# KNet with a ResNet-50 backbone and an FPN neck, set up for panoptic output
# (roi_head.do_panoptic=True) with 133 classes in both heads.
model = {
    'type': 'KNet',
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    'neck': {
        'type': 'FPN',
        'in_channels': [256, 512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_input',
        'num_outs': 4,
    },
    'rpn_head': {
        'type': 'ConvKernelHead',
        # the next two entries are the panoptic-specific modifications
        'num_classes': 133,
        'cat_stuff_mask': True,
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'feat_transform_cfg': None,
        'loss_rank': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 0.1,
        },
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    'roi_head': {
        'type': 'KernelIterHead',
        'do_panoptic': True,
        'num_stages': num_stages,
        # one weight of 1 per stage
        'stage_loss_weights': [1 for _stage in range(num_stages)],
        'proposal_feature_channel': 256,
        # one identically configured KernelUpdateHead per stage
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': 133,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_rank': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': False,
                    'loss_weight': 0.1,
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _stage in range(num_stages)
        ],
    },
    # training settings: the rpn entry and each rcnn stage use the same
    # assigner costs and sampler
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _stage in range(num_stages)
        ],
    },
    # testing settings
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Modules to import when this config is loaded. With
# allow_failed_imports=False a module that cannot be imported is presumably
# treated as an error (mmcv semantics — confirm against the installed version).
# Each module is listed exactly once, in the order of its first appearance.
custom_imports = dict(
    imports=[
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'swin.swin_transformer',
        'external.mot_step',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        'external.coco_panoptic',
        'external.mapillary_panoptic',
        'external.cityscape_panoptic',
        'external.kitti_step_dvps',
        'external.dataset.dvps_pipelines.transforms',
        'external.dataset.dvps_pipelines.loading',
        'external.dataset.dvps_pipelines.tricks',
        'external.dataset.pipelines.formatting',
    ],
    allow_failed_imports=False)


================================================
FILE: configs/det/_base_/models/knet_vipseg_s3_r50_fpn.py
================================================
# Values reused by several entries of ``model`` below.
num_stages = 3  # length of roi_head.mask_head and of train_cfg.rcnn
num_proposals = 100  # rpn_head.num_proposals and test_cfg.rcnn.max_per_img
conv_kernel_size = 1  # passed to rpn_head and to every mask head

# Class counts; the heads' num_classes is the sum of stuff and thing classes.
num_thing_classes = 58
num_stuff_classes = 66
num_classes = num_stuff_classes + num_thing_classes

# KNet with a ResNet-50 backbone and an FPN neck, using the class counts
# above. Note that the kitti_step flag is set to True in this config.
model = {
    'type': 'KNet',
    'cityscapes': False,
    'kitti_step': True,
    'num_thing_classes': num_thing_classes,
    'num_stuff_classes': num_stuff_classes,
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    'neck': {
        'type': 'FPN',
        'in_channels': [256, 512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_input',
        'num_outs': 4,
    },
    'rpn_head': {
        'type': 'ConvKernelHead',
        'num_classes': num_classes,
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        'cat_stuff_mask': True,
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'feat_transform_cfg': None,
        'loss_rank': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 0.1,
        },
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    'roi_head': {
        'type': 'KernelIterHead',
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        'do_panoptic': True,
        'num_stages': num_stages,
        # one weight of 1 per stage
        'stage_loss_weights': [1 for _stage in range(num_stages)],
        'proposal_feature_channel': 256,
        # one identically configured KernelUpdateHead per stage
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': num_classes,
                'num_thing_classes': num_thing_classes,
                'num_stuff_classes': num_stuff_classes,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_rank': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': False,
                    'loss_weight': 0.1,
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _stage in range(num_stages)
        ],
    },
    # training settings: the rpn entry and each rcnn stage use the same
    # assigner costs and sampler
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _stage in range(num_stages)
        ],
    },
    # testing settings
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.25,
            },
        },
    },
}

# Modules to import when this config is loaded. With
# allow_failed_imports=False a module that cannot be imported is presumably
# treated as an error (mmcv semantics — confirm against the installed version).
# Each module is listed exactly once, in the order of its first appearance.
custom_imports = dict(
    imports=[
        'knet.det.knet',
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        'external.cityscapes_step',
        'external.kitti_step_dvps',
        'external.vipseg_dvps',
        'external.dataset.dvps_pipelines.transforms',
        'external.dataset.dvps_pipelines.loading',
        'external.dataset.dvps_pipelines.tricks',
        'external.dataset.pipelines.formatting',
        'external.dataset.pipelines.transforms',
        'knet.video.knet',
        'knet.video.knet_quansi_dense',
        'knet.video.knet_quansi_dense_roi_gt_box',
        # 'knet.video.knet_track',
        # 'knet.video.knet_track_head',
        'knet.video.track_heads',
        'knet.video.kernel_head',
        'knet.video.kernel_iter_head',
        'knet.video.kernel_update_head',
        'knet.video.knet_uni_track',
        'knet.video.knet_quansi_dense_embed_fc',
        'knet.video.knet_quansi_dense_embed_fc_joint_train',
        'knet.video.qdtrack.losses.l2_loss',
        'knet.video.qdtrack.losses.multipos_cross_entropy_loss',
        'knet.video.qdtrack.trackers.quasi_dense_embed_tracker',
    ],
    allow_failed_imports=False
)


================================================
FILE: configs/det/_base_/models/video_knet_s3_r50_fpn_panoptic.py
================================================
# Hyper-parameters shared by the head definitions below.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

model = dict(
    type='VideoKNet',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(
            type='Pretrained',
            checkpoint='torchvision://resnet50',
        ),
    ),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4,
    ),
    rpn_head=dict(
        type='VideoConvKernelHead',
        # the next two values are the panoptic-specific settings
        num_classes=133,
        cat_stuff_mask=True,
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=128,
                normalize=True,
            ),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
        ),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        feat_transform_cfg=None,
        loss_rank=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1),
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0,
        ),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0),
    ),
    roi_head=dict(
        type='VideoKernelIterHead',
        do_panoptic=True,
        num_stages=num_stages,
        stage_loss_weights=[1 for _ in range(num_stages)],
        proposal_feature_channel=256,
        # one identically configured update head per stage
        mask_head=[
            dict(
                type='VideoKernelUpdateHead',
                num_classes=133,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'),
                    act_cfg=None,
                ),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN'),
                ),
                loss_rank=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=0.1,
                ),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0,
                ),
                loss_dice=dict(type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0,
                ),
            )
            for _ in range(num_stages)
        ],
    ),
    # training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True),
            ),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1,
        ),
        # one assigner/sampler configuration per stage
        rcnn=[
            dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(
                        type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(
                        type='MaskCost', weight=1.0, pred_act=True),
                ),
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1,
            )
            for _ in range(num_stages)
        ],
    ),
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            stuff_score_thr=0.05,
            merge_stuff_thing=dict(
                overlap_thr=0.6,
                iou_thr=0.5,
                stuff_max_area=4096,
                instance_score_thr=0.3,
            ),
        ),
    ),
)

# Modules that mmcv imports when this config is loaded. Every module is
# listed exactly once, in the order of its first appearance; the original
# list repeated 'external.dataset.pipelines.formatting'.
custom_imports = dict(
    imports=[
        # knet.det and top-level knet modules
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.cross_entropy_loss',
        'knet.kernel_updator',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        # external dataset modules
        'external.coco_panoptic',
        'external.youtubevis_clips',
        'external.cityscapes_vps',
        'external.cityscape_panoptic',
        'external.cityscapes_dvps',
        # swin package
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        # knet.video package
        # 'knet.video.knet_track',  (currently disabled)
        # 'knet.video.knet_track_head',  (currently disabled)
        'knet.video.track_heads',
        'knet.video.kernel_head',
        'knet.video.kernel_iter_head',
        'knet.video.kernel_update_head',
        'knet.video.knet_uni_track',
        'knet.video.knet_quansi_dense',
        'knet.video.knet_quansi_dense_conv_mask',
        'knet.video.knet_quansi_dense_roi_gt_box',
        'knet.video.knet_quansi_dense_embed_fc',
        # 'knet.video.knet_quansi_dense_embed_fc_joint_train',  (disabled)
        'knet.video.knet_quansi_dense_roi_gt_box_joint_train',
        'knet.video.qdtrack.losses.l2_loss',
        'knet.video.qdtrack.losses.multipos_cross_entropy_loss',
        'knet.video.qdtrack.trackers.quasi_dense_embed_tracker',

        'knet.video.knet_quansi_dense_embed_fc_toy_exp',
        'external.ext.ytvos',
        'external.ext.mask',

        # external.dataset pipelines
        'external.dataset.pipelines.transforms',
        'external.dataset.pipelines.loading',
        'external.dataset.pipelines.formatting',

        'external.dataset.dvps_pipelines.transforms',
        'external.dataset.dvps_pipelines.loading',
        'external.dataset.dvps_pipelines.tricks',
    ],
    # a module that cannot be imported is treated as an error
    allow_failed_imports=False,
)


================================================
FILE: configs/det/_base_/schedules/schedule_10e.py
================================================
# Optimizer: AdamW (the original note contrasts this with the SGD-based
# 1x schedule); the `backbone` entry uses lr_mult=0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys=dict(backbone=dict(lr_mult=0.25))),
)
# Clip gradients to an L2 norm of 1.
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# Learning-rate policy: linear warmup over 1000 iterations, then a single
# step at epoch 8 of a 10-epoch run.
lr_config = dict(
    policy='step',
    step=[8],
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
)
runner = dict(type='EpochBasedRunner', max_epochs=10)


================================================
FILE: configs/det/_base_/schedules/schedule_1x.py
================================================
# Optimizer: AdamW (the original note contrasts this with the SGD-based
# 1x schedule); the `backbone` entry uses lr_mult=0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys=dict(backbone=dict(lr_mult=0.25))),
)
# Clip gradients to an L2 norm of 1.
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# Learning-rate policy: linear warmup over 1000 iterations, then steps at
# epochs 8 and 11 of a 12-epoch run.
lr_config = dict(
    policy='step',
    step=[8, 11],
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
)
runner = dict(type='EpochBasedRunner', max_epochs=12)


================================================
FILE: configs/det/coco/knet_s3_r50_deformable_fpn_ms-3x_coco.py
================================================
# Inherit the deformable-FPN K-Net model and the multi-scale 3x COCO
# instance-segmentation dataset/schedule settings.
_base_ = [
    '../_base_/models/knet_s3_r50_deformable_fpn.py',
    '../common/mstrain_3x_coco_instance.py',
]

# Backbone override: same ResNet-50 layout, but with SyncBN normalisation.
model = dict(
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_eval=True,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
    ),
)

================================================
FILE: configs/det/coco/knet_s3_r50_fpn_ms-3x_coco-panoptic.py
================================================
# Base configs: the panoptic K-Net model plus the multi-scale 3x COCO
# panoptic dataset/schedule settings. The common directory has no
# 'mstrain_3x_coco_panoptic.py'; the panoptic settings live in
# 'mstrain_3x_coco_panoptic_inst_anno.py'.
_base_ = [
    '../_base_/models/knet_s3_r50_fpn_panoptic.py',
    '../common/mstrain_3x_coco_panoptic_inst_anno.py'
]
# Hyper-parameters shared by the head definitions below.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

model = dict(
    type='KNet',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(
            type='Pretrained',
            checkpoint='torchvision://resnet50',
        ),
    ),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4,
    ),
    rpn_head=dict(
        type='ConvKernelHead',
        # the next two values are the panoptic-specific settings
        num_classes=133,
        cat_stuff_mask=True,
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=128,
                normalize=True,
            ),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
        ),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        feat_transform_cfg=None,
        loss_rank=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.1),
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0,
        ),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0),
    ),
    roi_head=dict(
        type='KernelIterHead',
        do_panoptic=True,
        merge_joint=True,
        num_stages=num_stages,
        stage_loss_weights=[1 for _ in range(num_stages)],
        proposal_feature_channel=256,
        # one identically configured update head per stage
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=133,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'),
                    act_cfg=None,
                ),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN'),
                ),
                loss_rank=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=0.1,
                ),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0,
                ),
                loss_dice=dict(type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0,
                ),
            )
            for _ in range(num_stages)
        ],
    ),
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            stuff_score_thr=0.05,
            merge_stuff_thing=dict(
                overlap_thr=0.6,
                iou_thr=0.5,
                stuff_max_area=4096,
                instance_score_thr=0.3,
            ),
        ),
    ),
)

================================================
FILE: configs/det/coco/knet_s3_r50_fpn_ms-3x_coco.py
================================================
# This config only composes two bases: the K-Net R-50 FPN model and the
# multi-scale 3x COCO instance-segmentation dataset/schedule settings.
_base_ = [
    '../_base_/models/knet_s3_r50_fpn.py',
    '../common/mstrain_3x_coco_instance.py',
]


================================================
FILE: configs/det/coco/knet_s3_swin-b_deformable_fpn_ms-3x_coco.py
================================================
# Inherit the deformable-FPN K-Net model and the multi-scale 3x COCO
# instance-segmentation dataset/schedule settings.
_base_ = [
    '../_base_/models/knet_s3_r50_deformable_fpn.py',
    '../common/mstrain_3x_coco_instance.py',
]

model = dict(
    # NOTE(review): absolute path to a locally stored checkpoint; it has
    # to be adjusted for any other environment.
    pretrained='/mnt/lustre/lixiangtai/pretrained/swin/swin_base_patch4_window7_224_22k.pth',
    # `_delete_=True` discards the inherited backbone settings so that only
    # the keys given here are used.
    backbone=dict(
        _delete_=True,
        type='SwinTransformerDIY',
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.3,
        use_abs_pos_embed=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False,
    ),
    # neck input channels overridden alongside the backbone
    neck=dict(in_channels=[128, 256, 512, 1024]),
)


================================================
FILE: configs/det/common/lsj_coco_panoptic_50e.py
================================================
_base_ = '../_base_/default_runtime.py'

# ---------------------------------------------------------------- dataset
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)
image_size = (1024, 1024)

# Training: resize relative to `image_size` with a ratio drawn from
# (0.1, 2.0), then crop with `image_size` as the crop size.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(
        type='Resize',
        img_scale=image_size,
        ratio_range=(0.1, 2.0),
        multiscale_mode='range',
        keep_ratio=True,
    ),
    dict(
        type='RandomCrop',
        crop_type='absolute_range',
        crop_size=image_size,
        recompute_bbox=True,
        allow_negative_crop=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
        ],
    ),
]

# Testing: a single (1333, 800) scale with flipping disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# The train split is wrapped in RepeatDataset with times=1; val and test
# both read the val2017 annotations.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=1,
        dataset=dict(
            type=dataset_type,
            ann_file=dict(
                ins_ann=f'{data_root}annotations/panoptic_train2017_thing_only_coco.json',
                panoptic_ann=f'{data_root}annotations/panoptic_train2017.json',
            ),
            img_prefix=f'{data_root}train2017/',
            seg_prefix=f'{data_root}panoptic_stuff_train2017/',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        img_prefix=f'{data_root}val2017/',
        seg_prefix=f'{data_root}panoptic_val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        img_prefix=f'{data_root}val2017/',
        seg_prefix=f'{data_root}panoptic_val2017/',
        pipeline=test_pipeline,
    ),
)

# Evaluate and checkpoint every 5 epochs.
evaluation = dict(metric=['segm', 'panoptic'], interval=5)

checkpoint_config = dict(interval=5)

# Optimizer: AdamW (the original note contrasts this with the SGD-based
# 1x schedule); the `backbone` entry uses lr_mult=0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys=dict(backbone=dict(lr_mult=0.25))),
)
# Clip gradients to an L2 norm of 1.
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# Learning-rate policy: linear warmup over 1000 iterations, then steps at
# epochs 42 and 48 of a 50-epoch run.
lr_config = dict(
    policy='step',
    step=[42, 48],
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
)
runner = dict(type='EpochBasedRunner', max_epochs=50)


================================================
FILE: configs/det/common/mstrain_3x_coco_instance.py
================================================
_base_ = '../_base_/default_runtime.py'

# ---------------------------------------------------------------- dataset
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Multi-scale training: the resize scale is sampled in 'range' mode
# between (1333, 640) and (1333, 800).
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(
        type='Resize',
        img_scale=[(1333, 640), (1333, 800)],
        multiscale_mode='range',
        keep_ratio=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'],
    ),
]

# Testing: a single (1333, 800) scale with flipping disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# The train split is repeated 3 times per epoch via RepeatDataset; val and
# test both read the val2017 annotations.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=3,
        dataset=dict(
            type=dataset_type,
            ann_file=f'{data_root}annotations/instances_train2017.json',
            img_prefix=f'{data_root}train2017/',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=f'{data_root}annotations/instances_val2017.json',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=f'{data_root}annotations/instances_val2017.json',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
)
evaluation = dict(interval=1, metric=['segm'])

# Optimizer: AdamW (the original note contrasts this with the SGD-based
# 1x schedule); the `backbone` entry uses lr_mult=0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys=dict(backbone=dict(lr_mult=0.25))),
)
# Clip gradients to an L2 norm of 1.
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# Learning-rate policy: linear warmup over 1000 iterations, then steps at
# epochs 9 and 11 (the original note reports step=[9, 11] performing
# better in experiments) of a 12-epoch run.
lr_config = dict(
    policy='step',
    step=[9, 11],
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
)
runner = dict(type='EpochBasedRunner', max_epochs=12)


================================================
FILE: configs/det/common/mstrain_3x_coco_panoptic_inst_anno.py
================================================
_base_ = '../_base_/default_runtime.py'

# ---------------------------------------------------------------- dataset
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Multi-scale training: the resize scale is sampled in 'range' mode
# between (1333, 640) and (1333, 800).
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(
        type='Resize',
        img_scale=[(1333, 640), (1333, 800)],
        multiscale_mode='range',
        keep_ratio=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
        ],
    ),
]

# Testing: a single (1333, 800) scale with flipping disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# The train split is repeated 3 times per epoch via RepeatDataset; val and
# test both read the val2017 annotations.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=3,
        dataset=dict(
            type=dataset_type,
            ann_file=dict(
                ins_ann=f'{data_root}annotations/panoptic_train2017_thing_only_coco.json',
                panoptic_ann=f'{data_root}annotations/panoptic_train2017.json',
            ),
            img_prefix=f'{data_root}train2017/',
            seg_prefix=f'{data_root}panoptic_stuff_train2017/',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        img_prefix=f'{data_root}val2017/',
        seg_prefix=f'{data_root}panoptic_val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        img_prefix=f'{data_root}val2017/',
        seg_prefix=f'{data_root}panoptic_val2017/',
        pipeline=test_pipeline,
    ),
)
evaluation = dict(metric=['segm', 'panoptic'])

# Optimizer: AdamW (the original note contrasts this with the SGD-based
# 1x schedule); the `backbone` entry uses lr_mult=0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys=dict(backbone=dict(lr_mult=0.25))),
)
# Clip gradients to an L2 norm of 1.
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# Learning-rate policy: linear warmup over 1000 iterations, then steps at
# epochs 9 and 11 (the original note reports step=[9, 11] performing
# better in experiments) of a 12-epoch run.
lr_config = dict(
    policy='step',
    step=[9, 11],
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
)
runner = dict(type='EpochBasedRunner', max_epochs=12)


================================================
FILE: configs/det/common/mstrain_3x_coco_panoptic_inst_anno_detr_aug.py
================================================
_base_ = '../_base_/default_runtime.py'

# ---------------------------------------------------------------- dataset
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Training: flip first, then AutoAugment with two policies --
#   1. a single multi-scale resize (11 candidate scales, 'value' mode);
#   2. resize -> relative (0.7, 0.7) random crop -> the same multi-scale
#      resize, applied with override=True.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[
                        (480, 1333), (512, 1333), (544, 1333),
                        (576, 1333), (608, 1333), (640, 1333),
                        (672, 1333), (704, 1333), (736, 1333),
                        (768, 1333), (800, 1333),
                    ],
                    multiscale_mode='value',
                    keep_ratio=True,
                ),
            ],
            [
                dict(
                    type='Resize',
                    img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True,
                ),
                dict(
                    type='RandomCrop',
                    crop_type='relative',
                    crop_size=(0.7, 0.7),
                    allow_negative_crop=True,
                ),
                dict(
                    type='Resize',
                    img_scale=[
                        (480, 1333), (512, 1333), (544, 1333),
                        (576, 1333), (608, 1333), (640, 1333),
                        (672, 1333), (704, 1333), (736, 1333),
                        (768, 1333), (800, 1333),
                    ],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True,
                ),
            ],
        ],
    ),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
        ],
    ),
]

# Testing: a single (1333, 800) scale with flipping disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# The train split is repeated 3 times per epoch via RepeatDataset; val and
# test both read the val2017 annotations.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=3,
        dataset=dict(
            type=dataset_type,
            ann_file=dict(
                ins_ann=f'{data_root}annotations/panoptic_train2017_thing_only_coco.json',
                panoptic_ann=f'{data_root}annotations/panoptic_train2017.json',
            ),
            img_prefix=f'{data_root}train2017/',
            seg_prefix=f'{data_root}panoptic_stuff_train2017/',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        img_prefix=f'{data_root}val2017/',
        seg_prefix=f'{data_root}panoptic_val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        img_prefix=f'{data_root}val2017/',
        seg_prefix=f'{data_root}panoptic_val2017/',
        pipeline=test_pipeline,
    ),
)
evaluation = dict(metric=['segm', 'panoptic'])

# Optimizer: AdamW (the original note contrasts this with the SGD-based
# 1x schedule); the `backbone` entry uses lr_mult=0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys=dict(backbone=dict(lr_mult=0.25))),
)
# Clip gradients to an L2 norm of 1.
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# Learning-rate policy: linear warmup over 1000 iterations, then steps at
# epochs 9 and 11 (the original note reports step=[9, 11] performing
# better in experiments) of a 12-epoch run.
lr_config = dict(
    policy='step',
    step=[9, 11],
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
)
runner = dict(type='EpochBasedRunner', max_epochs=12)


================================================
FILE: configs/det/common/mstrain_64e_city_panoptic.py
================================================
_base_ = '../_base_/default_runtime.py'

# ---------------------------------------------------------------- dataset
dataset_type = 'CityscapesPanopticDataset'
data_root = 'data/cityscapes/'

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Training: multi-scale resize sampled in 'range' mode between (512, 1024)
# and (2048, 4096), followed by a (1024, 2048) random crop.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(
        type='Resize',
        img_scale=[(512, 1024), (2048, 4096)],
        multiscale_mode='range',
        keep_ratio=True,
    ),
    dict(type='RandomCrop', crop_size=(1024, 2048)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
        ],
    ),
]

# Testing: a single (2048, 1024) scale with flipping disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 1024),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# The train split is repeated 8 times per epoch via RepeatDataset; val and
# test both read the val annotations.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=8,
        dataset=dict(
            type=dataset_type,
            ann_file=dict(
                ins_ann=f'{data_root}annotations/instancesonly_filtered_gtFine_train.json',
                panoptic_ann=f'{data_root}annotations/cityscapes_panoptic_train.json',
            ),
            img_prefix=f'{data_root}leftImg8bit/train/',
            seg_prefix=f'{data_root}gtFine/train',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instancesonly_filtered_gtFine_val.json',
            panoptic_ann=f'{data_root}annotations/cityscapes_panoptic_val.json',
        ),
        img_prefix=f'{data_root}leftImg8bit/val/',
        seg_prefix=f'{data_root}gtFine/cityscapes_panoptic_val',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instancesonly_filtered_gtFine_val.json',
            panoptic_ann=f'{data_root}annotations/cityscapes_panoptic_val.json',
        ),
        img_prefix=f'{data_root}leftImg8bit/val/',
        seg_prefix=f'{data_root}gtFine/cityscapes_panoptic_val',
        pipeline=test_pipeline,
    ),
)

evaluation = dict(metric=['panoptic'])

# Optimizer: AdamW, unlike the original 1x schedule, which uses SGD.
# Parameters matched by the 'backbone' key get a 0.25 learning-rate multiplier.
optimizer = {
    'type': 'AdamW',
    'lr': 0.0001,
    'weight_decay': 0.05,
    'paramwise_cfg': {'custom_keys': {'backbone': {'lr_mult': 0.25}}},
}
optimizer_config = {'grad_clip': {'max_norm': 1, 'norm_type': 2}}

# Learning policy: a single step decay after a 500-iteration linear warmup.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.001,
    # [7] yields higher performance than [6]
    'step': [7],
}
# actual epoch = 8 * 8 = 64
runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}


================================================
FILE: configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_citystep_s3_r50_fpn.py',
    '../_base_/datasets/cityscapes_step.py',
]


# Used below as test_cfg.rcnn.max_per_img.
num_proposals = 100
# load_from = "/mnt/lustre/lixiangtai/pretrained/video_knet_vis/knet_r50_city.pth"
load_from = None

work_dir = 'logger/blackhole'

runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

# K-Net with a ResNet-50 backbone initialised from the torchvision checkpoint.
model = {
    'type': 'KNet',
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'SyncBN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    'roi_head': {
        'type': 'KernelIterHead',
        'merge_joint': True,
    },
    # Test-time thresholds, including those for merging stuff and thing masks.
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Step decay at epoch 7 after a 1000-iteration linear warmup.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}
# Per-GPU batch size and dataloader workers.
data = {'samples_per_gpu': 2, 'workers_per_gpu': 2}


================================================
FILE: configs/det/knet_cityscapes_step/knet_s3_swin_b_fpn.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_citystep_s3_r50_fpn.py',
    '../_base_/datasets/cityscapes_step.py',
]


# Used below as test_cfg.rcnn.max_per_img.
num_proposals = 100
# load_from = "/mnt/lustre/lixiangtai/pretrained/video_knet_vis/knet_swin_b_city.pth"
load_from = None

work_dir = 'logger/blackhole'

runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

# K-Net with a Swin backbone (embed_dims=128). The backbone entry carries
# _delete_=True so the backbone settings from the base model are discarded
# rather than merged (mmcv config inheritance).
model = {
    'type': 'KNet',
    'backbone': {
        '_delete_': True,
        'type': 'SwinTransformerDIY',
        'embed_dims': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 7,
        'mlp_ratio': 4.,
        'qkv_bias': True,
        'qk_scale': None,
        'drop_rate': 0.,
        'attn_drop_rate': 0.,
        'drop_path_rate': 0.3,
        'use_abs_pos_embed': False,
        'patch_norm': True,
        'out_indices': (0, 1, 2, 3),
        'with_cp': False,
    },
    # Neck input widths: embed_dims doubled at each of the four stages.
    'neck': {'in_channels': [128, 256, 512, 1024]},
    'roi_head': {
        'type': 'KernelIterHead',
        'merge_joint': True,
    },
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Step decay at epoch 7 after a 1000-iteration linear warmup.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}
# One sample per GPU for this Swin backbone variant.
data = {'samples_per_gpu': 1, 'workers_per_gpu': 2}


================================================
FILE: configs/det/knet_cityscapes_step/knet_s3_swin_l_fpn.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_citystep_s3_r50_fpn.py',
    '../_base_/datasets/cityscapes_step.py',
]


# Used below as test_cfg.rcnn.max_per_img.
num_proposals = 100
# load_from = "/mnt/lustre/lixiangtai/pretrained/video_knet_vis/knet_swin_l_city.pth"
load_from = None

work_dir = 'logger/blackhole'

runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

# K-Net with a Swin backbone (embed_dims=192). The backbone entry carries
# _delete_=True so the backbone settings from the base model are discarded
# rather than merged (mmcv config inheritance).
model = {
    'type': 'KNet',
    'backbone': {
        '_delete_': True,
        'type': 'SwinTransformerDIY',
        'embed_dims': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 7,
        'mlp_ratio': 4.,
        'qkv_bias': True,
        'qk_scale': None,
        'drop_rate': 0.,
        'attn_drop_rate': 0.,
        'drop_path_rate': 0.2,
        'use_abs_pos_embed': False,
        'patch_norm': True,
        'out_indices': (0, 1, 2, 3),
        'with_cp': False,
    },
    # Neck input widths: embed_dims doubled at each of the four stages.
    'neck': {'in_channels': [192, 384, 768, 1536]},
    'roi_head': {
        'type': 'KernelIterHead',
        'merge_joint': True,
    },
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Step decay at epoch 7 after a 1000-iteration linear warmup.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}
# One sample per GPU for this Swin backbone variant.
data = {'samples_per_gpu': 1, 'workers_per_gpu': 2}


================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_kitti_step_s3_r50_fpn.py',
    '../_base_/datasets/kitti_step_vps.py',
]

load_from = None

# Shared hyper-parameters. Every class count in the model below is taken from
# these names so the thing/stuff split is defined in exactly one place.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 2
num_stuff_classes = 17
num_classes = num_thing_classes + num_stuff_classes


# Video K-Net on KITTI-STEP with a ResNet-50 backbone, a quasi-dense mask
# embedding track head and a three-stage kernel-update head.
model = dict(
    type="VideoKNetQuansiEmbedFCJointTrain",
    cityscapes=False,
    kitti_step=True,
    link_previous=True,
    mask_assign_stride=2,
    num_thing_classes=num_thing_classes,
    num_stuff_classes=num_stuff_classes,
    ignore_label=255,
    backbone=dict(
        type='ResNet',
        depth=50,
        # ResNet stage count; unrelated to the module-level num_stages.
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
    ),
    rpn_head=dict(
        # _delete_=True replaces the inherited loss_seg settings instead of
        # merging into them (mmcv config inheritance).
        loss_seg=dict(
            _delete_=True,
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0),
        feat_downsample_stride=4,
    ),
    # add video_knet_vis roi head
    track_head=dict(
        type='QuasiDenseMaskEmbedHeadGTMask',
        num_convs=0,
        num_fcs=2,
        roi_feat_size=1,
        in_channels=256,
        fc_out_channels=256,
        embed_channels=256,
        norm_cfg=dict(type='GN', num_groups=32),
        loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
        loss_track_aux=dict(
            type='L2Loss',
            neg_pos_ub=3,
            pos_margin=0,
            neg_margin=0.1,
            hard_mining=True,
            loss_weight=1.0),
    ),
    # add tracker config
    tracker=dict(
        type='QuasiDenseEmbedTracker',
        init_score_thr=0.35,
        obj_score_thr=0.3,
        match_score_thr=0.5,
        memo_tracklet_frames=5,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=True,
        match_metric='bisoftmax'
    ),
    # roi head
    roi_head=dict(
        type='VideoKernelIterHead',
        num_stages=num_stages,
        num_thing_classes=num_thing_classes,
        num_stuff_classes=num_stuff_classes,
        with_track=True,
        merge_joint=True,
        # One independent head config per stage (a fresh dict each iteration).
        mask_head=[
            dict(
                type='VideoKernelUpdateHead',
                num_classes=num_classes,
                previous='placeholder',
                previous_type="ffn",
                num_thing_classes=num_thing_classes,
                num_stuff_classes=num_stuff_classes,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=4,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0),
            ) for _ in range(num_stages)
        ]
    ),
    # Assignment costs use the same weights as the mask-head losses above
    # (cls 2.0, dice 4.0, mask 1.0).
    track_train_cfg=dict(
        assigner=dict(
            type='MaskHungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
            mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
        sampler=dict(type='MaskPseudoSampler'),),
    bbox_roi_extractor=None
)

# Normalisation statistics shared by both pipelines (to_rgb is False here).
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training pipeline for clips: sequence-level resize, flip, crop, normalise and
# pad, then collect the targets and concatenate the reference frames.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test pipeline: single scale factor 1.0, flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape', 'img_shape', 'pad_shape', 'scale_factor',
                    'flip', 'flip_direction', 'img_norm_cfg', 'ori_filename',
                ],
            },
        ],
    },
]

# Schedule: 12 epochs with step decay at epochs 9 and 11.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}

# Training split repeated twice with reference-frame offsets
# [-2, -1, 1, 2]; testing runs on the 'val' split without reference frames.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

find_unused_parameters = True

================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train_8e.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_kitti_step_s3_r50_fpn.py',
    '../_base_/datasets/kitti_step_vps.py',
]

load_from = None

# Shared hyper-parameters. Every class count in the model below is taken from
# these names so the thing/stuff split is defined in exactly one place.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 2
num_stuff_classes = 17
num_classes = num_thing_classes + num_stuff_classes


# Video K-Net on KITTI-STEP with a ResNet-50 backbone, a quasi-dense mask
# embedding track head and a three-stage kernel-update head.
model = dict(
    type="VideoKNetQuansiEmbedFCJointTrain",
    cityscapes=False,
    kitti_step=True,
    link_previous=True,
    mask_assign_stride=2,
    num_thing_classes=num_thing_classes,
    num_stuff_classes=num_stuff_classes,
    ignore_label=255,
    backbone=dict(
        type='ResNet',
        depth=50,
        # ResNet stage count; unrelated to the module-level num_stages.
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
    ),
    rpn_head=dict(
        # _delete_=True replaces the inherited loss_seg settings instead of
        # merging into them (mmcv config inheritance).
        loss_seg=dict(
            _delete_=True,
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0),
        feat_downsample_stride=4,
    ),
    # add video_knet_vis roi head
    track_head=dict(
        type='QuasiDenseMaskEmbedHeadGTMask',
        num_convs=0,
        num_fcs=2,
        roi_feat_size=1,
        in_channels=256,
        fc_out_channels=256,
        embed_channels=256,
        norm_cfg=dict(type='GN', num_groups=32),
        loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
        loss_track_aux=dict(
            type='L2Loss',
            neg_pos_ub=3,
            pos_margin=0,
            neg_margin=0.1,
            hard_mining=True,
            loss_weight=1.0),
    ),
    # add tracker config
    tracker=dict(
        type='QuasiDenseEmbedTracker',
        init_score_thr=0.35,
        obj_score_thr=0.3,
        match_score_thr=0.5,
        memo_tracklet_frames=5,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=True,
        match_metric='bisoftmax'
    ),
    # roi head
    roi_head=dict(
        type='VideoKernelIterHead',
        num_stages=num_stages,
        num_thing_classes=num_thing_classes,
        num_stuff_classes=num_stuff_classes,
        with_track=True,
        merge_joint=True,
        # One independent head config per stage (a fresh dict each iteration).
        mask_head=[
            dict(
                type='VideoKernelUpdateHead',
                num_classes=num_classes,
                previous='placeholder',
                previous_type="ffn",
                num_thing_classes=num_thing_classes,
                num_stuff_classes=num_stuff_classes,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=4,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0),
            ) for _ in range(num_stages)
        ]
    ),
    # Assignment costs use the same weights as the mask-head losses above
    # (cls 2.0, dice 4.0, mask 1.0).
    track_train_cfg=dict(
        assigner=dict(
            type='MaskHungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
            mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
        sampler=dict(type='MaskPseudoSampler'),),
    bbox_roi_extractor=None
)

# Normalisation statistics shared by both pipelines (to_rgb is False here).
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training pipeline for clips: sequence-level resize, flip, crop, normalise and
# pad, then collect the targets and concatenate the reference frames.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test pipeline: single scale factor 1.0, flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape', 'img_shape', 'pad_shape', 'scale_factor',
                    'flip', 'flip_direction', 'img_norm_cfg', 'ori_filename',
                ],
            },
        ],
    },
]

# Schedule: 8 epochs with a single step decay at epoch 7.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}

# Training split repeated twice with reference-frame offsets
# [-2, -1, 1, 2]; testing runs on the 'val' split without reference frames.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

find_unused_parameters = True

================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_kitti_step_s3_r50_fpn.py',
    '../_base_/datasets/kitti_step_vps.py',
]

load_from = None

# Shared hyper-parameters. Every class count in the model below is taken from
# these names so the thing/stuff split is defined in exactly one place.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 2
num_stuff_classes = 17
num_classes = num_thing_classes + num_stuff_classes

# Video K-Net on KITTI-STEP with a Swin backbone (embed_dims=128) and the
# "update" variant of the previous-frame kernel link.
model = dict(
    type="VideoKNetQuansiEmbedFCJointTrain",
    cityscapes=False,
    kitti_step=True,
    link_previous=True,
    mask_assign_stride=2,
    num_thing_classes=num_thing_classes,
    num_stuff_classes=num_stuff_classes,
    ignore_label=255,
    # _delete_=True discards the backbone settings inherited from the base
    # model instead of merging into them (mmcv config inheritance).
    backbone=dict(
        _delete_=True,
        type='SwinTransformerDIY',
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        use_abs_pos_embed=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False
    ),
    # Neck input widths: embed_dims doubled at each of the four stages.
    neck=dict(in_channels=[128, 256, 512, 1024]),
    rpn_head=dict(
        loss_seg=dict(
            _delete_=True,
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0),
        feat_downsample_stride=4,
    ),
    # add video_knet_vis roi head
    track_head=dict(
        type='QuasiDenseMaskEmbedHeadGTMask',
        num_convs=0,
        num_fcs=2,
        roi_feat_size=1,
        in_channels=256,
        fc_out_channels=256,
        embed_channels=256,
        norm_cfg=dict(type='GN', num_groups=32),
        loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
        loss_track_aux=dict(
            type='L2Loss',
            neg_pos_ub=3,
            pos_margin=0,
            neg_margin=0.1,
            hard_mining=True,
            loss_weight=1.0),
    ),
    # add tracker config
    tracker=dict(
        type='QuasiDenseEmbedTracker',
        init_score_thr=0.35,
        obj_score_thr=0.3,
        match_score_thr=0.5,
        memo_tracklet_frames=5,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=True,
        match_metric='bisoftmax'
    ),
    # roi head
    roi_head=dict(
        type='VideoKernelIterHead',
        num_stages=num_stages,
        num_thing_classes=num_thing_classes,
        num_stuff_classes=num_stuff_classes,
        with_track=True,
        merge_joint=True,
        # One independent head config per stage (a fresh dict each iteration).
        mask_head=[
            dict(
                type='VideoKernelUpdateHead',
                num_classes=num_classes,
                previous='placeholder',
                previous_link="update_dynamic_cov",
                previous_type="update",
                num_thing_classes=num_thing_classes,
                num_stuff_classes=num_stuff_classes,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=4,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0),
            ) for _ in range(num_stages)
        ]
    ),
    # Assignment costs use the same weights as the mask-head losses above
    # (cls 2.0, dice 4.0, mask 1.0).
    track_train_cfg=dict(
        assigner=dict(
            type='MaskHungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
            mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
        sampler=dict(type='MaskPseudoSampler'),),
    bbox_roi_extractor=None
)


# Normalisation statistics shared by both pipelines (to_rgb is False here).
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training pipeline for clips: sequence-level resize, flip, crop, normalise and
# pad, then collect the targets and concatenate the reference frames.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test pipeline: single scale factor 1.0, flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape', 'img_shape', 'pad_shape', 'scale_factor',
                    'flip', 'flip_direction', 'img_norm_cfg', 'ori_filename',
                ],
            },
        ],
    },
]

# Schedule: 12 epochs with step decay at epochs 9 and 11.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}

# Training split repeated twice with reference-frame offsets
# [-2, -1, 1, 2]; testing runs on the 'val' split without reference frames.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

find_unused_parameters = True

================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
================================================
# Base configs live in configs/det/_base_/, one level above this directory,
# matching the paths used by the sibling configs in this folder.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_kitti_step_s3_r50_fpn.py',
    '../_base_/datasets/kitti_step_vps.py',
]

load_from = None

# Shared hyper-parameters. Every class count in the model below is taken from
# these names so the thing/stuff split is defined in exactly one place.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 2
num_stuff_classes = 17
num_classes = num_thing_classes + num_stuff_classes


# Video K-Net on KITTI-STEP with a Swin backbone (embed_dims=192) and the
# "update" variant of the previous-frame kernel link.
model = dict(
    type="VideoKNetQuansiEmbedFCJointTrain",
    cityscapes=False,
    kitti_step=True,
    link_previous=True,
    mask_assign_stride=2,
    num_thing_classes=num_thing_classes,
    num_stuff_classes=num_stuff_classes,
    ignore_label=255,
    # _delete_=True discards the backbone settings inherited from the base
    # model instead of merging into them (mmcv config inheritance).
    backbone=dict(
        _delete_=True,
        type='SwinTransformerDIY',
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=7,
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        use_abs_pos_embed=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False),
    # Neck input widths: embed_dims doubled at each of the four stages.
    neck=dict(in_channels=[192, 384, 768, 1536]),
    rpn_head=dict(
        loss_seg=dict(
            _delete_=True,
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0),
        feat_downsample_stride=4,
    ),
    # add video_knet_vis roi head
    track_head=dict(
        type='QuasiDenseMaskEmbedHeadGTMask',
        num_convs=0,
        num_fcs=2,
        roi_feat_size=1,
        in_channels=256,
        fc_out_channels=256,
        embed_channels=256,
        norm_cfg=dict(type='GN', num_groups=32),
        loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
        loss_track_aux=dict(
            type='L2Loss',
            neg_pos_ub=3,
            pos_margin=0,
            neg_margin=0.1,
            hard_mining=True,
            loss_weight=1.0),
    ),
    # add tracker config
    tracker=dict(
        type='QuasiDenseEmbedTracker',
        init_score_thr=0.35,
        obj_score_thr=0.3,
        match_score_thr=0.5,
        memo_tracklet_frames=5,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=True,
        match_metric='bisoftmax'
    ),
    # roi head
    roi_head=dict(
        type='VideoKernelIterHead',
        num_stages=num_stages,
        num_thing_classes=num_thing_classes,
        num_stuff_classes=num_stuff_classes,
        with_track=True,
        merge_joint=True,
        # One independent head config per stage (a fresh dict each iteration).
        mask_head=[
            dict(
                type='VideoKernelUpdateHead',
                num_classes=num_classes,
                previous='placeholder',
                previous_link="update_dynamic_cov",
                previous_type="update",
                num_thing_classes=num_thing_classes,
                num_stuff_classes=num_stuff_classes,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=4,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0),
            ) for _ in range(num_stages)
        ]
    ),
    # Assignment costs use the same weights as the mask-head losses above
    # (cls 2.0, dice 4.0, mask 1.0).
    track_train_cfg=dict(
        assigner=dict(
            type='MaskHungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
            mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
        sampler=dict(type='MaskPseudoSampler'),),
    bbox_roi_extractor=None
)

# Run directory used for this experiment.
work_dir = 'logger/ks_wodepth_4x8_step_stride2_nocrop_2_17'

# Normalisation statistics shared by the train and test pipelines.
# `to_rgb` is disabled here.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training pipeline for multi-frame samples: load -> resize (ratio range
# 0.5-2.0) -> flip -> crop to 384x1248 -> normalise -> pad -> collect ->
# concatenate the reference frames -> format.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test pipeline: a single scale factor of 1.0 and flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                ],
            },
        ],
    },
]

# Epoch-based training for 12 epochs.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

# Step learning-rate policy (steps at 9 and 11) with a linear warm-up over
# 1000 iterations at ratio 0.001.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}

# Data loading overrides. Only the keys listed here are set; the remaining
# dataset fields are presumably supplied by the inherited dataset config.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    # The training split is wrapped in a RepeatDataset with times=2.
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    # Testing uses the 'val' split with no reference-frame indices.
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

# NOTE(review): presumably read by the training script when wrapping the
# model for distributed training - confirm.
find_unused_parameters = True

================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_update_conv_short_track_fc.py
================================================
# Base configs this file inherits from: schedule, runtime, KITTI-STEP K-Net
# model and KITTI-STEP VPS dataset. The assignments below override them.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_kitti_step_s3_r50_fpn.py',
    '../_base_/datasets/kitti_step_vps.py',
]

# No checkpoint is loaded by default. The authors left this path commented out:
# load_from = "/mnt/lustre/lixiangtai/project/Knet/work_dirs/city_step/swin_l_joint_8e/latest.pth"
load_from = None

# Values reused inside the model definition below.
num_stages = 3
conv_kernel_size = 1

model = {
    'type': 'VideoKNetQuansiEmbedFCJointTrain',
    'cityscapes': False,
    'kitti_step': True,
    'link_previous': True,
    'mask_assign_stride': 2,
    'num_thing_classes': 2,
    'num_stuff_classes': 17,
    'ignore_label': 255,
    # `_delete_` makes this dict replace the inherited backbone settings
    # instead of being merged into them.
    'backbone': {
        '_delete_': True,
        'type': 'SwinTransformerDIY',
        'embed_dims': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 7,
        'mlp_ratio': 4.0,
        'qkv_bias': True,
        'qk_scale': None,
        'drop_rate': 0.0,
        'attn_drop_rate': 0.0,
        'drop_path_rate': 0.2,
        'use_abs_pos_embed': False,
        'patch_norm': True,
        'out_indices': (0, 1, 2, 3),
        'with_cp': False,
    },
    # Neck input widths: embed_dims (192) times 1, 2, 4 and 8.
    'neck': {'in_channels': [192, 384, 768, 1536]},
    'rpn_head': {
        # The inherited segmentation loss is replaced, not merged.
        'loss_seg': {
            '_delete_': True,
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
        'feat_downsample_stride': 4,
    },
    # Tracking embedding head.
    'track_head': {
        'type': 'QuasiDenseMaskEmbedHeadGTMask',
        'num_convs': 0,
        'num_fcs': 1,
        'roi_feat_size': 1,
        'in_channels': 256,
        'fc_out_channels': 256,
        'embed_channels': 256,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
        'loss_track': {
            'type': 'MultiPosCrossEntropyLoss',
            'loss_weight': 0.25,
        },
        'loss_track_aux': {
            'type': 'L2Loss',
            'neg_pos_ub': 3,
            'pos_margin': 0,
            'neg_margin': 0.1,
            'hard_mining': True,
            'loss_weight': 1.0,
        },
    },
    # Tracker settings.
    'tracker': {
        'type': 'QuasiDenseEmbedTracker',
        'init_score_thr': 0.35,
        'obj_score_thr': 0.3,
        'match_score_thr': 0.5,
        'memo_tracklet_frames': 5,
        'memo_backdrop_frames': 1,
        'memo_momentum': 0.8,
        'nms_conf_thr': 0.5,
        'nms_backdrop_iou_thr': 0.3,
        'nms_class_iou_thr': 0.7,
        'with_cats': True,
        'match_metric': 'bisoftmax',
    },
    # Iterative kernel-update head.
    'roi_head': {
        'type': 'VideoKernelIterHead',
        'num_stages': num_stages,
        'num_thing_classes': 2,
        'num_stuff_classes': 17,
        'with_track': True,
        'merge_joint': True,
        # One freshly built (non-shared) head config per stage.
        'mask_head': [
            {
                'type': 'VideoKernelUpdateHead',
                'num_classes': 19,
                'previous': 'placeholder',
                'previous_link': 'update_dynamic_cov',
                'previous_type': 'ffn',
                'num_thing_classes': 2,
                'num_stuff_classes': 17,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 4,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _stage in range(num_stages)
        ],
    },
    # Assigner and sampler used when training the tracking branch.
    'track_train_cfg': {
        'assigner': {
            'type': 'MaskHungarianAssigner',
            'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
            'dice_cost': {'type': 'DiceCost', 'weight': 4.0, 'pred_act': True},
            'mask_cost': {'type': 'MaskCost', 'weight': 1.0, 'pred_act': True},
        },
        'sampler': {'type': 'MaskPseudoSampler'},
    },
    'bbox_roi_extractor': None,
}


# Normalisation statistics shared by both pipelines; `to_rgb` is disabled.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training pipeline for multi-frame samples, listed in execution order.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test pipeline: one scale factor (1.0), flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                ],
            },
        ],
    },
]

# Epoch-based training for 12 epochs.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

# Step learning-rate policy (steps at 9 and 11) with a linear warm-up over
# 1000 iterations at ratio 0.001.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}

# Data loading overrides. Only the keys listed here are set; the remaining
# dataset fields are presumably supplied by the inherited dataset config.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    # The training split is wrapped in a RepeatDataset with times=2.
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    # Testing uses the 'val' split with no reference-frame indices.
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

# NOTE(review): presumably read by the training script when wrapping the
# model for distributed training - confirm.
find_unused_parameters = True

================================================
FILE: configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py
================================================
# Base configs this file inherits from: schedule, runtime, VIPSeg K-Net model
# and VIPSeg dataset. The assignments below override them.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_vipseg_s3_r50_fpn.py',
    '../_base_/datasets/vipseg_dvps.py',
]

# Values reused inside the model definition below.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 58
num_stuff_classes = 66
num_classes = num_thing_classes + num_stuff_classes  # 124 in total

model = {
    'type': 'VideoKNetQuansiEmbedFCJointTrain',
    # Authors' note: Cityscapes-style label layout, thing classes first and
    # stuff classes second.
    'cityscapes': False,
    'vipseg': True,
    'kitti_step': False,
    'link_previous': True,
    'mask_assign_stride': 2,
    'num_thing_classes': num_thing_classes,
    'num_stuff_classes': num_stuff_classes,
    'ignore_label': 255,
    # No `_delete_` here, so these keys are merged into the inherited backbone.
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
    },
    'rpn_head': {
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        # The inherited segmentation loss is replaced, not merged.
        'loss_seg': {
            '_delete_': True,
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
        'feat_downsample_stride': 4,
    },
    # Tracking embedding head.
    'track_head': {
        'type': 'QuasiDenseMaskEmbedHeadGTMask',
        'num_convs': 0,
        'num_fcs': 2,
        'roi_feat_size': 1,
        'in_channels': 256,
        'fc_out_channels': 256,
        'embed_channels': 256,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
        'loss_track': {
            'type': 'MultiPosCrossEntropyLoss',
            'loss_weight': 0.25,
        },
        'loss_track_aux': {
            'type': 'L2Loss',
            'neg_pos_ub': 3,
            'pos_margin': 0,
            'neg_margin': 0.1,
            'hard_mining': True,
            'loss_weight': 1.0,
        },
    },
    # Tracker settings.
    'tracker': {
        'type': 'QuasiDenseEmbedTracker',
        'init_score_thr': 0.35,
        'obj_score_thr': 0.3,
        'match_score_thr': 0.5,
        'memo_tracklet_frames': 5,
        'memo_backdrop_frames': 1,
        'memo_momentum': 0.8,
        'nms_conf_thr': 0.5,
        'nms_backdrop_iou_thr': 0.3,
        'nms_class_iou_thr': 0.7,
        'with_cats': True,
        'match_metric': 'bisoftmax',
    },
    # Iterative kernel-update head.
    'roi_head': {
        'type': 'VideoKernelIterHead',
        'num_stages': num_stages,
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        'with_track': True,
        'merge_joint': True,
        # One freshly built (non-shared) head config per stage.
        'mask_head': [
            {
                'type': 'VideoKernelUpdateHead',
                'num_classes': num_classes,
                'previous': 'placeholder',
                'previous_type': 'ffn',
                'num_thing_classes': num_thing_classes,
                'num_stuff_classes': num_stuff_classes,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 4,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _stage in range(num_stages)
        ],
    },
    # Assigner and sampler used when training the tracking branch.
    'track_train_cfg': {
        'assigner': {
            'type': 'MaskHungarianAssigner',
            'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
            'dice_cost': {'type': 'DiceCost', 'weight': 4.0, 'pred_act': True},
            'mask_cost': {'type': 'MaskCost', 'weight': 1.0, 'pred_act': True},
        },
        'sampler': {'type': 'MaskPseudoSampler'},
    },
    'bbox_roi_extractor': None,
}


# Epoch-based training for 12 epochs.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

# Step learning-rate policy (steps at 9 and 11) with a linear warm-up over
# 1000 iterations at ratio 0.001.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}


# NOTE(review): presumably read by the training script when wrapping the
# model for distributed training - confirm.
find_unused_parameters = True

================================================
FILE: configs/det/video_knet_vipseg/video_knet_s3_swin_b_rpn_vipseg_mask_embed_link_ffn_joint_train_8e.py
================================================
# Base configs this file inherits from: schedule, runtime, VIPSeg K-Net model
# and VIPSeg dataset. The assignments below override them.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_vipseg_s3_r50_fpn.py',
    '../_base_/datasets/vipseg_dvps.py',
]


# Values reused inside the model definition below.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 58
num_stuff_classes = 66
num_classes = num_thing_classes + num_stuff_classes  # 124 in total

model = {
    'type': 'VideoKNetQuansiEmbedFCJointTrain',
    # Authors' note: Cityscapes-style label layout, thing classes first and
    # stuff classes second.
    'cityscapes': False,
    'vipseg': True,
    'kitti_step': False,
    'link_previous': True,
    'mask_assign_stride': 2,
    'num_thing_classes': num_thing_classes,
    'num_stuff_classes': num_stuff_classes,
    'ignore_label': 255,
    # `_delete_` makes this dict replace the inherited backbone settings
    # instead of being merged into them.
    'backbone': {
        '_delete_': True,
        'type': 'SwinTransformerDIY',
        'embed_dims': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 7,
        'mlp_ratio': 4.0,
        'qkv_bias': True,
        'qk_scale': None,
        'drop_rate': 0.0,
        'attn_drop_rate': 0.0,
        'drop_path_rate': 0.3,
        'use_abs_pos_embed': False,
        'patch_norm': True,
        'out_indices': (0, 1, 2, 3),
        'with_cp': False,
    },
    # Neck input widths: embed_dims (128) times 1, 2, 4 and 8.
    'neck': {
        'in_channels': [128, 256, 512, 1024],
    },
    'rpn_head': {
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        # The inherited segmentation loss is replaced, not merged.
        'loss_seg': {
            '_delete_': True,
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
        'feat_downsample_stride': 4,
    },
    # Tracking embedding head.
    'track_head': {
        'type': 'QuasiDenseMaskEmbedHeadGTMask',
        'num_convs': 0,
        'num_fcs': 2,
        'roi_feat_size': 1,
        'in_channels': 256,
        'fc_out_channels': 256,
        'embed_channels': 256,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
        'loss_track': {
            'type': 'MultiPosCrossEntropyLoss',
            'loss_weight': 0.25,
        },
        'loss_track_aux': {
            'type': 'L2Loss',
            'neg_pos_ub': 3,
            'pos_margin': 0,
            'neg_margin': 0.1,
            'hard_mining': True,
            'loss_weight': 1.0,
        },
    },
    # Tracker settings.
    'tracker': {
        'type': 'QuasiDenseEmbedTracker',
        'init_score_thr': 0.35,
        'obj_score_thr': 0.3,
        'match_score_thr': 0.5,
        'memo_tracklet_frames': 5,
        'memo_backdrop_frames': 1,
        'memo_momentum': 0.8,
        'nms_conf_thr': 0.5,
        'nms_backdrop_iou_thr': 0.3,
        'nms_class_iou_thr': 0.7,
        'with_cats': True,
        'match_metric': 'bisoftmax',
    },
    # Iterative kernel-update head.
    'roi_head': {
        'type': 'VideoKernelIterHead',
        'num_stages': num_stages,
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        'with_track': True,
        'merge_joint': True,
        # One freshly built (non-shared) head config per stage.
        'mask_head': [
            {
                'type': 'VideoKernelUpdateHead',
                'num_classes': num_classes,
                'previous': 'placeholder',
                'previous_type': 'ffn',
                'num_thing_classes': num_thing_classes,
                'num_stuff_classes': num_stuff_classes,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 4,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _stage in range(num_stages)
        ],
    },
    # Assigner and sampler used when training the tracking branch.
    'track_train_cfg': {
        'assigner': {
            'type': 'MaskHungarianAssigner',
            'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
            'dice_cost': {'type': 'DiceCost', 'weight': 4.0, 'pred_act': True},
            'mask_cost': {'type': 'MaskCost', 'weight': 1.0, 'pred_act': True},
        },
        'sampler': {'type': 'MaskPseudoSampler'},
    },
    'bbox_roi_extractor': None,
}


# Epoch-based training for 8 epochs.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

# Step learning-rate policy (single step at 7) with a linear warm-up over
# 1000 iterations at ratio 0.001.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}


# NOTE(review): presumably read by the training script when wrapping the
# model for distributed training - confirm.
find_unused_parameters = True

================================================
FILE: configs/video_knet_vis/_base_/datasets/coco_instance.py
================================================
# COCO instance-segmentation dataset settings.
dataset_type = 'CocoDataset'
data_root = 'data/coco/'

# Normalisation statistics shared by both pipelines.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training pipeline: boxes and masks are loaded, images are resized to
# (1333, 800) keeping the aspect ratio, then flipped with ratio 0.5.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True, 'with_mask': True},
    {'type': 'Resize', 'img_scale': (1333, 800), 'keep_ratio': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': ['img', 'gt_bboxes', 'gt_labels', 'gt_masks'],
    },
]

# Test pipeline: single scale (1333, 800), flipping disabled.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (1333, 800),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# `val` and `test` both point at the val2017 split.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'ann_file': data_root + 'annotations/instances_train2017.json',
        'img_prefix': data_root + 'train2017/',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': data_root + 'annotations/instances_val2017.json',
        'img_prefix': data_root + 'val2017/',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': data_root + 'annotations/instances_val2017.json',
        'img_prefix': data_root + 'val2017/',
        'pipeline': test_pipeline,
    },
}

# Only mask ('segm') metrics are evaluated: K-Net does not predict bounding
# boxes, so bbox evaluation is skipped.
evaluation = {'metric': ['segm']}


================================================
FILE: configs/video_knet_vis/_base_/datasets/youtubevis_2019.py
================================================
# Dataset settings: normalisation statistics and the train/test pipelines.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

train_pipeline = [
    {'type': 'LoadMultiImagesFromFile', 'to_float32': True},
    {
        'type': 'SeqLoadAnnotations',
        'with_bbox': True,
        'with_mask': True,
        'with_track': True,
    },
    {
        'type': 'SeqResize',
        'multiscale_mode': 'value',
        'share_params': True,
        # Eight candidate scales, each paired with 1e6.
        # NOTE(review): the 1e6 entry presumably leaves the other image edge
        # unconstrained - confirm against SeqResize.
        'img_scale': [
            (size, 1e6)
            for size in (288, 320, 352, 392, 416, 448, 480, 512)
        ],
        'keep_ratio': True,
    },
    {'type': 'SeqRandomFlip', 'share_params': True, 'flip_ratio': 0.5},
    {'type': 'SeqNormalize', **img_norm_cfg},
    {'type': 'SeqPad', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_instance_ids',
        ],
        'reject_empty': True,
        'num_ref_imgs': 5,
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

test_pipeline = [
    {'type': 'LoadMultiImagesFromFile', 'to_float32': True},
    {
        'type': 'MultiScaleFlipAugVideo',
        'img_scale': (640, 360),
        'flip': False,
        'transforms': [
            {'type': 'SeqResize'},
            {'type': 'SeqNormalize', **img_norm_cfg},
            {'type': 'SeqPad', 'size_divisor': 32},
            {
                'type': 'VideoCollect',
                'keys': ['img'],
                'reject_empty': False,
                'num_ref_imgs': 0,  # 0 means do not apply check
            },
            {'type': 'ConcatVideoReferences'},
            {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
        ],
    },
]

# YouTube-VIS 2019 dataset locations and loader settings.
dataset_type = 'YouTubeVISDataset'
data_root = 'data/youtube_vis_2019/'
dataset_version = '2019'

data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    # Training samples reference frames (5 images, frame range [-2, 2]).
    'train': {
        'type': dataset_type,
        'dataset_version': dataset_version,
        'ann_file': data_root + 'annotations/youtube_vis_2019_train.json',
        'img_prefix': data_root + 'train/JPEGImages',
        'ref_img_sampler': {
            'num_ref_imgs': 5,
            'frame_range': [-2, 2],
            'filter_key_img': False,
            'method': 'uniform',
        },
        'pipeline': train_pipeline,
    },
    # `val` and `test` both use the 'valid' annotations and images, with no
    # reference sampler and `load_all_frames` enabled.
    'val': {
        'type': dataset_type,
        'dataset_version': dataset_version,
        'ann_file': data_root + 'annotations/youtube_vis_2019_valid.json',
        'img_prefix': data_root + 'valid/JPEGImages',
        'ref_img_sampler': None,
        'load_all_frames': True,
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'dataset_version': dataset_version,
        'ann_file': data_root + 'annotations/youtube_vis_2019_valid.json',
        'img_prefix': data_root + 'valid/JPEGImages',
        'ref_img_sampler': None,
        'load_all_frames': True,
        'pipeline': test_pipeline,
    },
}


================================================
FILE: configs/video_knet_vis/_base_/default_runtime.py
================================================
# Default runtime settings shared by the video K-Net VIS configs.

# Checkpoint interval of 1.
checkpoint_config = {'interval': 1}

# Text logging at an interval of 50.
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
    ],
}
# Left disabled by the authors:
# custom_hooks = [dict(type='NumClassCheckHook')]

dist_params = {'backend': 'nccl'}
log_level = 'INFO'
# Neither a pretrained checkpoint nor a resume checkpoint is set by default.
load_from = None
resume_from = None
workflow = [('train', 1)]

# Default run directory.
work_dir = 'logger/blackhole'


================================================
FILE: configs/video_knet_vis/_base_/models/knet_track_r50.py
================================================
# Values reused throughout the model definition below.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

model = {
    'type': 'KNetTrack',
    # ResNet-50 backbone initialised from the torchvision checkpoint.
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    'neck': {
        'type': 'FPN',
        'in_channels': [256, 512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_input',
        'num_outs': 4,
    },
    # Kernel head configured with `num_proposals` proposals.
    'rpn_head': {
        'type': 'ConvKernelHeadVideo',
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'num_classes': 40,
        'feat_transform_cfg': None,
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    # Iterative kernel-update head: `num_stages` stages, each with loss
    # weight 1.
    'roi_head': {
        'type': 'KernelIterHeadVideo',
        'num_stages': num_stages,
        'stage_loss_weights': [1 for _stage in range(num_stages)],
        'proposal_feature_channel': 256,
        'num_thing_classes': 40,
        'num_stuff_classes': 0,
        # One freshly built (non-shared) head config per stage.
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': 40,
                'num_thing_classes': 40,
                'num_stuff_classes': 0,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _stage in range(num_stages)
        ],
    },
    # Tracking head. Its stage count is written as a literal 3 here rather
    # than taken from `num_stages`, and it uses a single mask-head config.
    'tracker': {
        'type': 'KernelFrameIterHeadVideo',
        'num_proposals': num_proposals,
        'num_stages': 3,
        'assign_stages': 2,
        'proposal_feature_channel': 256,
        'stage_loss_weights': (1.0, 1.0, 1.0),
        'num_thing_classes': 40,
        'num_stuff_classes': 0,
        'mask_head': {
            'type': 'KernelUpdateHeadVideo',
            'num_proposals': num_proposals,
            'num_classes': 40,
            'num_thing_classes': 40,
            'num_stuff_classes': 0,
            'num_ffn_fcs': 2,
            'num_heads': 8,
            'num_cls_fcs': 1,
            'num_mask_fcs': 1,
            'feedforward_channels': 2048,
            'in_channels': 256,
            'out_channels': 256,
            'dropout': 0.0,
            'mask_thr': 0.5,
            'conv_kernel_size': conv_kernel_size,
            'mask_upsample_stride': 2,
            'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
            'with_ffn': True,
            'feat_transform_cfg': {
                'conv_cfg': {'type': 'Conv2d'},
                'act_cfg': None,
            },
            'kernel_updator_cfg': {
                'type': 'KernelUpdator',
                'in_channels': 256,
                'feat_channels': 256,
                'out_channels': 256,
                'input_feat_shape': 3,
                'act_cfg': {'type': 'ReLU', 'inplace': True},
                'norm_cfg': {'type': 'LN'},
            },
            'loss_mask': {
                'type': 'CrossEntropyLoss',
                'use_sigmoid': True,
                'loss_weight': 1.0,
            },
            'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
            'loss_cls': {
                'type': 'FocalLoss',
                'use_sigmoid': True,
                'gamma': 2.0,
                'alpha': 0.25,
                'loss_weight': 2.0,
            },
        },
    },
    # training and testing settings
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        # One assigner/sampler config per kernel-update stage.
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _stage in range(num_stages)
        ],
        'tracker': {
            'assigner': {
                'type': 'MaskHungarianAssignerVideo',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
    },
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': 10,
            'mask_thr': 0.5,
            'merge_stuff_thing': {
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
        'tracker': {
            'max_per_img': 10,
            'mask_thr': 0.5,
            'merge_stuff_thing': {
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Modules to import when this config is loaded (mmcv `custom_imports`
# convention) -- presumably so that the custom detector, tracker, backbone,
# dataset and pipeline classes referenced by name in the config are registered.
custom_imports = dict(
    imports=[
        # image-level K-Net components
        'knet_vis.det.knet',
        'knet_vis.det.kernel_head',
        'knet_vis.det.kernel_iter_head',
        'knet_vis.det.kernel_update_head',
        'knet_vis.det.semantic_fpn_wrapper',
        'knet_vis.kernel_updator',
        'knet_vis.det.mask_hungarian_assigner',
        'knet_vis.det.mask_pseudo_sampler',
        # video / tracking components
        'knet_vis.tracker.track',
        'knet_vis.tracker.kernel_head',
        'knet_vis.tracker.kernel_iter_head',
        'knet_vis.tracker.kernel_frame_iter_head',
        'knet_vis.tracker.mask_hungarian_assigner',
        'knet_vis.tracker.kernel_update_head',
        # backbone, dataset and data pipelines
        'swin.swin_transformer',
        'mmtrack.datasets.youtube_vis_dataset',
        'mmtrack.pipelines',
    ],
    # A module that cannot be imported is treated as an error, not skipped.
    allow_failed_imports=False
)


================================================
FILE: configs/video_knet_vis/_base_/models/knet_track_r50_deformablefpn.py
================================================
# Model definition for the K-Net tracking model with a ResNet-50 backbone and
# a multi-scale deformable-attention pixel decoder as the neck.
#
# Shared hyper-parameters reused throughout the config below.
num_stages = 3  # number of KernelUpdateHead stages in the roi_head
num_proposals = 100  # number of kernels (proposals) per image
conv_kernel_size = 1  # spatial size of each kernel
model = dict(
    type='KNetTrack',
    # ResNet-50 initialised from the torchvision checkpoint; all four stage
    # outputs are exposed (out_indices) and stage 1 is frozen.
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    # Pixel decoder built from a 6-layer deformable-attention encoder.
    neck=dict(
        type='MSDeformAttnPixelDecoder',
        num_outs=3,
        norm_cfg=dict(type='GN', num_groups=32),
        act_cfg=dict(type='ReLU'),
        return_one_list=True,
        encoder=dict(
            type='DetrTransformerEncoder',
            num_layers=6,
            transformerlayers=dict(
                type='BaseTransformerLayer',
                attn_cfgs=dict(
                    type='MultiScaleDeformableAttention',
                    embed_dims=256,
                    num_heads=8,
                    num_levels=3,
                    num_points=4,
                    im2col_step=64,
                    dropout=0.0,
                    batch_first=False,
                    norm_cfg=None,
                    init_cfg=None),
                ffn_cfgs=dict(
                    type='FFN',
                    embed_dims=256,
                    feedforward_channels=1024,
                    num_fcs=2,
                    ffn_drop=0.0,
                    act_cfg=dict(type='ReLU', inplace=True)),
                operation_order=('self_attn', 'norm', 'ffn', 'norm')),
            init_cfg=None),
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True),
        init_cfg=None),
    # Kernel head that produces the initial kernels / mask proposals.
    rpn_head=dict(
        type='ConvKernelHeadVideo',
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128, normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)
        ),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        # 40 thing classes and no stuff classes throughout this config
        # (presumably the YouTube-VIS 2019 category count -- confirm against
        # the dataset config).
        num_classes=40,
        feat_transform_cfg=None,
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    # Per-frame iterative kernel refinement: `num_stages` identical
    # KernelUpdateHead configs, each stage loss weighted by 1.
    roi_head=dict(
        type='KernelIterHeadVideo',
        num_stages=num_stages,
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        num_thing_classes=40,
        num_stuff_classes=0,
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=40,
                num_thing_classes=40,
                num_stuff_classes=0,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0)
            ) for _ in range(num_stages)
        ]),
    # Video-level head. NOTE: its stage count and loss weights are written
    # out literally (3 and three 1.0 weights) rather than derived from the
    # `num_stages` variable above; a single mask_head config is given here
    # instead of a per-stage list.
    tracker=dict(
        type="KernelFrameIterHeadVideo",
        num_proposals=num_proposals,
        num_stages=3,
        assign_stages=2,
        proposal_feature_channel=256,
        stage_loss_weights=(1., 1., 1.),
        num_thing_classes=40,
        num_stuff_classes=0,
        mask_head=dict(
            type='KernelUpdateHeadVideo',
            num_proposals=num_proposals,
            num_classes=40,
            num_thing_classes=40,
            num_stuff_classes=0,
            num_ffn_fcs=2,
            num_heads=8,
            num_cls_fcs=1,
            num_mask_fcs=1,
            feedforward_channels=2048,
            in_channels=256,
            out_channels=256,
            dropout=0.0,
            mask_thr=0.5,
            conv_kernel_size=conv_kernel_size,
            mask_upsample_stride=2,
            ffn_act_cfg=dict(type='ReLU', inplace=True),
            with_ffn=True,
            feat_transform_cfg=dict(
                conv_cfg=dict(type='Conv2d'), act_cfg=None),
            kernel_updator_cfg=dict(
                type='KernelUpdator',
                in_channels=256,
                feat_channels=256,
                out_channels=256,
                input_feat_shape=3,
                act_cfg=dict(type='ReLU', inplace=True),
                norm_cfg=dict(type='LN')),
            loss_mask=dict(
                type='CrossEntropyLoss',
                use_sigmoid=True,
                loss_weight=1.0),
            loss_dice=dict(
                type='DiceLoss', loss_weight=4.0),
            loss_cls=dict(
                type='FocalLoss',
                use_sigmoid=True,
                gamma=2.0,
                alpha=0.25,
                loss_weight=2.0)
        ),

    ),
    # training and testing settings
    # The assigner cost weights (cls 2.0, dice 4.0, mask 1.0) match the
    # loss_cls / loss_dice / loss_mask weights used by the heads above.
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)
            ),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1),
        # one assigner/sampler config per roi_head stage
        rcnn=[
            dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(type='MaskCost', weight=1.0,
                                   pred_act=True)
                ),
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1) for _ in range(num_stages)
        ],
        tracker=dict(
            assigner=dict(
                type='MaskHungarianAssignerVideo',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0,
                               pred_act=True)
            ),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1)
    ),
    # rcnn and tracker share the same inference thresholds.
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=10,
            mask_thr=0.5,
            merge_stuff_thing=dict(
                iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3
            )
        ),
        tracker=dict(
            max_per_img=10,
            mask_thr=0.5,
            merge_stuff_thing=dict(
                iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3
            ),
        ),
    )
)

# Modules to import when this config is loaded (mmcv `custom_imports`
# convention) -- presumably so that the custom classes referenced by name in
# the model config are registered.
custom_imports = dict(
    imports=[
        # image-level K-Net components
        'knet_vis.det.knet',
        'knet_vis.det.kernel_head',
        'knet_vis.det.kernel_iter_head',
        'knet_vis.det.kernel_update_head',
        'knet_vis.det.semantic_fpn_wrapper',
        'knet_vis.kernel_updator',
        # NOTE: this module is taken from the `knet` package, unlike the
        # other K-Net modules in this list, which come from `knet_vis`.
        'knet.det.msdeformattn_decoder',
        'knet_vis.det.mask_hungarian_assigner',
        'knet_vis.det.mask_pseudo_sampler',
        # video / tracking components
        'knet_vis.tracker.track',
        'knet_vis.tracker.kernel_head',
        'knet_vis.tracker.kernel_iter_head',
        'knet_vis.tracker.kernel_frame_iter_head',
        'knet_vis.tracker.mask_hungarian_assigner',
        'knet_vis.tracker.kernel_update_head',
        # backbone, dataset and data pipelines
        'swin.swin_transformer',
        'mmtrack.datasets.youtube_vis_dataset',
        'mmtrack.pipelines',
    ],
    # A module that cannot be imported is treated as an error, not skipped.
    allow_failed_imports=False
)


================================================
FILE: configs/video_knet_vis/_base_/schedules/schedule_0.75x.py
================================================
# Optimizer: AdamW, with every parameter whose name matches 'backbone'
# trained at a quarter of the base learning rate.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)}))
# Gradient clipping: L2 norm capped at 1.
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# Learning-rate policy: 1000 iterations of linear warmup, then step decay
# at epochs 5 and 7 of an 8-epoch run.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[5, 7])
runner = dict(type='EpochBasedRunner', max_epochs=8)


================================================
FILE: configs/video_knet_vis/_base_/schedules/schedule_1x.py
================================================
# Optimizer: AdamW, with every parameter whose name matches 'backbone'
# trained at a quarter of the base learning rate.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)}))
# Gradient clipping: L2 norm capped at 1.
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# Learning-rate policy: 1000 iterations of linear warmup, then step decay
# at epochs 8 and 11 of a 12-epoch run.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[8, 11])
runner = dict(type='EpochBasedRunner', max_epochs=12)


================================================
FILE: configs/video_knet_vis/_base_/schedules/schedule_8e.py
================================================
# Optimizer: AdamW, with every parameter whose name matches 'backbone'
# trained at a quarter of the base learning rate.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.25)}))
# Gradient clipping: L2 norm capped at 1.
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# Learning-rate policy: 1000 iterations of linear warmup, then a single
# step decay at epoch 7 of an 8-epoch run.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[7])
runner = dict(type='EpochBasedRunner', max_epochs=8)


================================================
FILE: configs/video_knet_vis/common/mstrain_3x_coco_instance.py
================================================
_base_ = '../_base_/default_runtime.py'

# ---------------------------------------------------------------- dataset
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
# Normalization statistics shared by the train and test pipelines.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Multi-scale training: the resize scale is drawn from the range between
# (1333, 640) and (1333, 800) (multiscale_mode='range').
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='Resize',
         img_scale=[(1333, 640), (1333, 800)],
         multiscale_mode='range',
         keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect',
         keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
# Single-scale testing at (1333, 800) without flipping.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='MultiScaleFlipAug',
         img_scale=(1333, 800),
         flip=False,
         transforms=[
             dict(type='Resize', keep_ratio=True),
             dict(type='RandomFlip'),
             dict(type='Normalize', **img_norm_cfg),
             dict(type='Pad', size_divisor=32),
             dict(type='ImageToTensor', keys=['img']),
             dict(type='Collect', keys=['img']),
         ]),
]

# The training set is wrapped in RepeatDataset (times=3) to speed up
# training; val and test both read the val2017 split.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=3,
        dataset=dict(
            type=dataset_type,
            ann_file=f'{data_root}annotations/instances_train2017.json',
            img_prefix=f'{data_root}train2017/',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=f'{data_root}annotations/instances_val2017.json',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=f'{data_root}annotations/instances_val2017.json',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
)
evaluation = dict(interval=1, metric=['segm'])

# -------------------------------------------------------------- optimizer
# AdamW is used here, unlike the original 1x schedule, which uses SGD.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(
        custom_keys={
            'backbone': dict(lr_mult=0.25),
        },
    ),
)
optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))

# --------------------------------------------------------- learning policy
# Experiments show that using step=[9, 11] has higher performance.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[9, 11],
)
runner = dict(type='EpochBasedRunner', max_epochs=12)


================================================
FILE: configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py
================================================
# K-Net tracking model (R50 base model file) on YouTube-VIS 2019 with the 1x
# schedule. Everything is taken from the base files below; this file adds
# no overrides of its own.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_track_r50.py',
    '../_base_/datasets/youtubevis_2019.py',
]

================================================
FILE: configs/video_knet_vis/video_knet_vis/knet_track_r50_deformable_fpn_1x_youtubevis.py
================================================
# K-Net tracking model with the deformable-FPN base model file on
# YouTube-VIS 2019, 1x schedule.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_track_r50_deformablefpn.py',
    '../_base_/datasets/youtubevis_2019.py',
]

# Dataloader settings given by this file: one sample per GPU, two workers.
data = dict(samples_per_gpu=1, workers_per_gpu=2)


================================================
FILE: configs/video_knet_vis/video_knet_vis/knet_track_swinb_1x_youtubevis_8e.py
================================================
# K-Net tracking model with a Swin backbone on YouTube-VIS 2019, trained
# with the 8-epoch schedule. Starts from the R50 model file and swaps out
# the backbone below.
_base_ = [
    '../_base_/schedules/schedule_8e.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_track_r50.py',
    '../_base_/datasets/youtubevis_2019.py',
]

model = dict(
    backbone=dict(
        # `_delete_` asks the config loader to drop the inherited backbone
        # settings instead of merging them with the keys given here.
        _delete_=True,
        type='SwinTransformerDIY',
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.3,
        use_abs_pos_embed=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False,
    ),
    # Neck input widths for the four backbone outputs.
    neck=dict(in_channels=[128, 256, 512, 1024]),
)

# Dataloader settings given by this file: one sample per GPU, two workers.
data = dict(samples_per_gpu=1, workers_per_gpu=2)

================================================
FILE: configs/video_knet_vis/video_knet_vis/knet_track_swinb_deformable_1x_youtubevis.py
================================================
# K-Net tracking model with a Swin backbone and the deformable-FPN base
# model file on YouTube-VIS 2019, 1x schedule. This file also defines its
# own training pipeline and training dataset.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_track_r50_deformablefpn.py',
    '../_base_/datasets/youtubevis_2019.py',
]

model = dict(
    backbone=dict(
        # `_delete_` asks the config loader to drop the inherited backbone
        # settings instead of merging them with the keys given here.
        _delete_=True,
        type='SwinTransformerDIY',
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.3,
        use_abs_pos_embed=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=True,
    ),
    # Neck input widths for the four backbone outputs.
    neck=dict(in_channels=[128, 256, 512, 1024]),
)

dataset_type = 'YouTubeVISDataset'
data_root = 'data/youtube_vis_2019/'
dataset_version = '2019'

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Clip-level training pipeline: every Seq* transform handles all frames of
# a sample; resize and flip use share_params=True.
train_pipeline = [
    dict(type='LoadMultiImagesFromFile', to_float32=True),
    dict(type='SeqLoadAnnotations',
         with_bbox=True,
         with_mask=True,
         with_track=True),
    dict(type='SeqResize',
         multiscale_mode='value',
         share_params=True,
         # one of these scales is used per sample (multiscale_mode='value')
         img_scale=[
             (288, 1e6),
             (320, 1e6),
             (352, 1e6),
             (392, 1e6),
             (416, 1e6),
             (448, 1e6),
             (480, 1e6),
             (512, 1e6),
         ],
         keep_ratio=True),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(type='SeqPad', size_divisor=32),
    dict(type='VideoCollect',
         keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks',
               'gt_instance_ids'],
         reject_empty=True,
         num_ref_imgs=5),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]

data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=1,
        dataset=dict(
            type=dataset_type,
            dataset_version=dataset_version,
            ann_file=f'{data_root}annotations/youtube_vis_2019_train.json',
            img_prefix=f'{data_root}train/JPEGImages',
            # five reference images sampled from frame offsets -2..2
            ref_img_sampler=dict(
                num_ref_imgs=5,
                frame_range=[-2, 2],
                filter_key_img=False,
                method='uniform',
            ),
            pipeline=train_pipeline,
        ),
    ),
)

================================================
FILE: external/cityscape_panoptic.py
================================================
import contextlib
import io
import itertools
import os
import glob
import tempfile
import logging
import os.path as osp
from collections import OrderedDict

import pycocotools.mask as maskUtils

import mmcv
import numpy as np
from mmcv.utils import print_log
from mmdet.datasets.builder import DATASETS
from mmdet.datasets.coco import CocoDataset
from mmdet.datasets.api_wrappers import COCO, COCOeval
from terminaltables import AsciiTable
from external.coco_panoptic import parse_pq_results, _print_panoptic_results


@DATASETS.register_module()
class CityscapesPanopticDataset(CocoDataset):

    CLASSES = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
               'bicycle')

    def load_annotations(self, ann_file):
        """Load the instance and panoptic annotations of the dataset.

        Besides returning the image records, this fills in ``self.coco``,
        ``self.cat_ids``, ``self.cat2label``, ``self.img_ids``,
        ``self.panoptic_anns``, ``self.stuff_ids``, ``self.thing_ids``,
        ``self.seg2stuff_ids`` and ``self.ins2thing_ids``.

        Args:
            ann_file (dict): Mapping that is indexed with ``'ins_ann'`` (the
                COCO style instance annotation file) and ``'panoptic_ann'``
                (the panoptic annotation file).

        Returns:
            list[dict]: Image info from COCO api, each entry extended with a
                ``filename`` key that copies ``file_name``.
        """
        # Instance annotations, handled through the COCO api.
        self.coco = COCO(ann_file['ins_ann'])
        self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES)
        self.cat2label = dict(
            (cat_id, label) for label, cat_id in enumerate(self.cat_ids))
        self.img_ids = sorted(self.coco.get_img_ids())

        # Panoptic annotations: split their categories into stuff / thing.
        self.panoptic_anns = mmcv.load(ann_file['panoptic_ann'])
        categories = self.panoptic_anns['categories']
        self.stuff_ids = [cat['id'] for cat in categories
                          if cat['isthing'] == 0]
        self.thing_ids = [cat['id'] for cat in categories
                          if cat['isthing'] == 1]

        # Both annotation files must list the thing classes identically.
        assert self.thing_ids == self.cat_ids

        # Label k (k >= 1) maps to the k-th stuff id; label 0 maps to 0.
        self.seg2stuff_ids = dict(enumerate(self.stuff_ids, start=1))
        self.seg2stuff_ids.update({0: 0})

        # Label k maps to the k-th thing id.
        self.ins2thing_ids = dict(enumerate(self.thing_ids))

        data_infos = []
        all_ann_ids = []
        for img_id in self.img_ids:
            img_info = self.coco.load_imgs([img_id])[0]
            img_info['filename'] = img_info['file_name']
            data_infos.append(img_info)
            all_ann_ids.extend(self.coco.get_ann_ids(img_ids=[img_id]))
        assert len(set(all_ann_ids)) == len(
            all_ann_ids), f"Annotation ids in '{ann_file}' are not unique!"
        return data_infos

    def _filter_imgs(self, min_size=32):
        """Filter images too small or without ground truths."""
        valid_inds = []
        # obtain images that contain annotation
        ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values())
        # obtain images that contain annotations of the required categories
        ids_in_cat = set()
        for i, class_id in enumerate(self.cat_ids):
            ids_in_cat |= set(self.coco.cat_img_map[class_id])
        # merge the image id sets of the two conditions and use the merged set
        # to filter out images if self.filter_empty_gt=True
        ids_in_cat &= ids_with_ann

        valid_img_ids = []
        for i, img_info in enumerate(self.data_infos):
            img_id = img_info['id']
            ann_ids = self.coco.getAnnIds(imgIds=[img_id])
            ann_info = self.coco.loadAnns(ann_ids)
            all_iscrowd = all([_['iscrowd'] for _ in ann_info])
            if self.filter_empty_gt and (self.img_ids[i] not in ids_in_cat
                                         or all_iscrowd):
                continue
            if min(img_info['width'], img_info['height']) >= min_size:
                valid_inds.append(i)
                valid_img_ids.append(img_id)
        self.img_ids = valid_img_ids
        return valid_inds

    def _parse_ann_info(self, img_info, ann_info):
        """Parse bbox and mask annotation.

        Args:
            img_info (dict): Image info of an image.
            ann_info (list[dict]): Annotation info of an image.

        Returns:
            dict: A dict containing the following keys: bboxes, \
                bboxes_ignore, labels, masks, seg_map. \
                "masks" are already decoded into binary masks.
        """
        gt_bboxes = []
        gt_labels = []
        gt_bboxes_ignore = []
        gt_masks_ann = []

        for i, ann in enumerate(ann_info):
            if ann.get('ignore', False):
                continue
            x1, y1, w, h = ann['bbox']
            if ann['area'] <= 0 or w < 1 or h < 1:
                continue
            if ann['category_id'] not in self.cat_ids:
                continue
            bbox = [x1, y1, x1 + w, y1 + h]
            if ann.get('iscrowd', False):
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_labels.append(self.cat2label[ann['category_id']])
                gt_masks_ann.append(ann['segmentation'])

        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
        ann = dict(
            bboxes=gt_bboxes,
            labels=gt_labels,
            bboxes_ignore=gt_bboxes_ignore,
            masks=gt_masks_ann,
            seg_map=img_info['segm_file'])

        return ann

    def _panoptic2json(self, results, outfile_prefix):
        """Save the panoptic PNGs and build the panoptic json records.

        Args:
            results (list[tuple]): One ``(png_string, segments_info)`` pair
                per image. The ``segments_info`` dicts are modified in
                place: ``isthing`` is removed and ``category_id`` is
                replaced via ``self.ins2thing_ids`` / ``self.seg2stuff_ids``.
            outfile_prefix (str): Directory that receives the PNG files; it
                is created if needed.

        Returns:
            list[dict]: One record per image with the keys ``image_id``,
                ``file_name`` and ``segments_info``.
        """
        mmcv.mkdir_or_exist(outfile_prefix)
        records = []
        for idx in range(len(self)):
            img_id = self.img_ids[idx]
            png_string, segments_info = results[idx]
            # hack
            # To match the corresponding ids for panoptic segmentation
            # prediction for both cityscape vps and cityscapes.
            # NOTE(review): `self.vps` is not assigned anywhere in the code
            # visible here -- confirm where it is set.
            vps_naming = self.vps is not None
            file_name = self.data_infos[idx]['file_name']
            if vps_naming:
                stem_parts = file_name.split(".")[0].split("_")
                image_id = "_".join(stem_parts[:5])
            else:
                stem = file_name.split("/")[-1].split(".")[0]
                image_id = stem[:-12]

            # Translate the predicted label of every segment into the
            # category id of the annotation file.
            for segment in segments_info:
                if segment.pop('isthing') is True:
                    id_map = self.ins2thing_ids
                else:
                    id_map = self.seg2stuff_ids
                segment['category_id'] = id_map[segment['category_id']]

            # hack: to save all the images into one folder
            png_name = self.data_infos[idx]['file_name'].replace(
                '.jpg', '.png').split("/")[-1]
            with open(osp.join(outfile_prefix, png_name), 'wb') as png_file:
                png_file.write(png_string)

            records.append(
                dict(
                    image_id=image_id,
                    file_name=png_name,
                    segments_info=segments_info))
        return records

    def results2json(self, results, outfile_prefix):
        """Dump the results to COCO style json files.

        The type of ``results[0]`` selects what is written: a list is
        treated as bbox predictions, a tuple as mask predictions (a
        3-tuple additionally carries a panoptic result that is dumped as
        well), and an ndarray as proposals.

        Args:
            results (list[list | tuple | ndarray]): Testing results of the
                dataset.
            outfile_prefix (str): The filename prefix of the json files. If
                the prefix is "somepath/xxx", the json files will be named
                "somepath/xxx.bbox.json", "somepath/xxx.segm.json",
                "somepath/xxx.proposal.json", "somepath/xxx.panoptic.json".
                For panoptic results it is also passed on as the directory
                of the panoptic PNG files.

        Returns:
            dict[str: str]: Possible keys are "bbox", "segm", "proposal", \
                "panoptic", and values are corresponding filenames.

        Raises:
            TypeError: If ``results[0]`` is not a list, tuple or ndarray.
        """
        first = results[0]
        result_files = dict()

        if isinstance(first, list):
            det_json = self._det2json(results)
            bbox_file = f'{outfile_prefix}.bbox.json'
            result_files['bbox'] = bbox_file
            result_files['proposal'] = bbox_file
            mmcv.dump(det_json, bbox_file)
            return result_files

        if isinstance(first, tuple):
            if len(first) == 3:  # dump the panoptic
                instance_segm_results = []
                panoptic_results = []
                for idx in range(len(self)):
                    det, seg, panoptic = results[idx]
                    instance_segm_results.append([det, seg])
                    panoptic_results.append(panoptic)
                panoptic_json = dict(
                    annotations=self._panoptic2json(panoptic_results,
                                                    outfile_prefix))
                result_files['panoptic'] = f'{outfile_prefix}.panoptic.json'
                mmcv.dump(panoptic_json, result_files['panoptic'])
            else:
                instance_segm_results = results
            segm_json = self._segm2json(instance_segm_results)
            result_files['bbox'] = f'{outfile_prefix}.bbox.json'
            result_files['proposal'] = f'{outfile_prefix}.bbox.json'
            result_files['segm'] = f'{outfile_prefix}.segm.json'
            mmcv.dump(segm_json[0], result_files['bbox'])
            mmcv.dump(segm_json[1], result_files['segm'])
            return result_files

        if isinstance(first, np.ndarray):
            proposal_json = self._proposal2json(results)
            result_files['proposal'] = f'{outfile_prefix}.proposal.json'
            mmcv.dump(proposal_json, result_files['proposal'])
            return result_files

        raise TypeError('invalid type of results')

    def results2txt(self, results, outfile_prefix):
        """Dump the instance segmentation results to Cityscapes txt files.

        For every image a ``<basename>_pred.txt`` file is written into
        ``outfile_prefix``. Each of its lines names one binary mask PNG
        (written into the same directory) followed by the Cityscapes class
        id and the score of that instance.

        Args:
            results (list[list | tuple]): Testing results of the dataset,
                one ``(bbox_result, segm_result)`` pair per image.
            outfile_prefix (str): Directory that receives the txt and png
                files. It is created if it does not exist. If the prefix
                is "somepath/xxx", the txt files will be named
                "somepath/xxx/<basename>_pred.txt".

        Returns:
            list[str]: Result txt files which contains corresponding \
                instance segmentation images.

        Raises:
            ImportError: If ``cityscapesscripts`` is not installed.
        """
        try:
            import cityscapesscripts.helpers.labels as CSLabels
        except ImportError as err:
            # The hint must spell the package name correctly, otherwise the
            # suggested install command itself fails.
            raise ImportError('Please run "pip install cityscapesscripts" to '
                              'install cityscapesscripts first.') from err
        result_files = []
        os.makedirs(outfile_prefix, exist_ok=True)
        prog_bar = mmcv.ProgressBar(len(self))
        for idx in range(len(self)):
            result = results[idx]
            filename = self.data_infos[idx]['filename']
            basename = osp.splitext(osp.basename(filename))[0]
            pred_txt = osp.join(outfile_prefix, basename + '_pred.txt')

            bbox_result, segm_result = result
            bboxes = np.vstack(bbox_result)
            # segm results
            if isinstance(segm_result, tuple):
                # Some detectors use different scores for bbox and mask,
                # like Mask Scoring R-CNN. Score of segm will be used instead
                # of bbox score.
                segms = mmcv.concat_list(segm_result[0])
                mask_score = segm_result[1]
            else:
                # use bbox score for mask score
                segms = mmcv.concat_list(segm_result)
                mask_score = [bbox[-1] for bbox in bboxes]
            # bbox_result holds one array per class, so the position of an
            # array in it is the class label of all of its rows.
            labels = [
                np.full(bbox.shape[0], i, dtype=np.int32)
                for i, bbox in enumerate(bbox_result)
            ]
            labels = np.concatenate(labels)

            assert len(bboxes) == len(segms) == len(labels)
            num_instances = len(bboxes)
            prog_bar.update()
            with open(pred_txt, 'w') as fout:
                for i in range(num_instances):
                    pred_class = labels[i]
                    classes = self.CLASSES[pred_class]
                    class_id = CSLabels.name2label[classes].id
                    score = mask_score[i]
                    mask = maskUtils.decode(segms[i]).astype(np.uint8)
                    png_filename = osp.join(outfile_prefix,
                                            basename + f'_{i}_{classes}.png')
                    mmcv.imwrite(mask, png_filename)
                    fout.write(f'{osp.basename(png_filename)} {class_id} '
                               f'{score}\n')
            result_files.append(pred_txt)

        return result_files

    def format_results(self, results, jsonfile_prefix="./test", **kwargs):
        """Dump the results to json files via ``self.results2json``.

        Args:
            results (list[tuple | numpy.ndarray]): Testing results of the
                dataset. Must contain exactly one entry per dataset sample.
            jsonfile_prefix (str | None): The prefix of json files. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                If ``None`` is passed, a temporary directory is created and
                "<tmp>/results" is used as the prefix. Default: "./test".

        Returns:
            tuple: (result_files, tmp_dir), result_files is whatever \
                ``self.results2json`` returns, tmp_dir is the temporary \
                directory created when jsonfile_prefix is None (otherwise \
                None). The caller is responsible for cleaning it up.
        """
        assert isinstance(results, list), 'results must be a list'
        num_results, num_samples = len(results), len(self)
        assert num_results == num_samples, (
            'The length of results is not equal to the dataset len: '
            f'{num_results} != {num_samples}')

        tmp_dir = None
        if jsonfile_prefix is None:
            # No prefix given: write into a throw-away directory instead.
            tmp_dir = tempfile.TemporaryDirectory()
            jsonfile_prefix = osp.join(tmp_dir.name, 'results')

        return self.results2json(results, jsonfile_prefix), tmp_dir

    def evaluate(self,
                 results,
                 metric='bbox',
                 logger=None,
                 outfile_prefix=None,
                 classwise=False,
                 proposal_nums=(100, 300, 1000),
                 iou_thrs=np.arange(0.5, 0.96, 0.05),
                 metric_items=None):
        """Evaluation in Cityscapes/COCO protocol.

        Args:
            results (list[list | tuple]): Testing results of the dataset.
            metric (str | list[str]): Metrics to be evaluated. Options are
                'bbox', 'segm', 'cityscapes', 'panoptic'.
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.
            outfile_prefix (str | None): The prefix of output file. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                It is forwarded to ``self.format_results`` (and to
                ``self._evaluate_cityscapes`` for the 'cityscapes' metric).
                If not specified, a temp directory will be created.
                Default: None.
            classwise (bool): Whether to evaluating the AP for each class.
            proposal_nums (Sequence[int]): Used as ``maxDets`` of the COCO
                evaluator. Default: (100, 300, 1000).
            iou_thrs (Sequence[float] | None): IoU thresholds handed to the
                COCO evaluator. If None, 0.50:0.05:0.95 is used.
                Default: np.arange(0.5, 0.96, 0.05).
            metric_items (list[str] | str | None): Names of the COCO stats
                to report. If None, the six mAP items are reported.
                Default: None.

        Returns:
            dict[str, float]: COCO style evaluation metric and/or cityscapes \
                mAP and AP@50.

        Raises:
            KeyError: If a requested metric or metric item is not supported,
                or a metric has no corresponding result file.
        """
        # A single container for every protocol: previously the dict was
        # re-created after the cityscapes evaluation, which silently threw
        # the cityscapes metrics away.
        eval_results = OrderedDict()

        metrics = metric.copy() if isinstance(metric, list) else [metric]
        allowed_metrics = ['bbox', 'segm', 'cityscapes', 'panoptic']
        for name in metrics:
            if name not in allowed_metrics:
                raise KeyError(f'metric {name} is not supported')

        if 'cityscapes' in metrics:
            eval_results.update(
                self._evaluate_cityscapes(results, outfile_prefix, logger))
            metrics.remove('cityscapes')

        if iou_thrs is None:
            iou_thrs = np.linspace(
                .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        if metric_items is not None and not isinstance(metric_items, list):
            metric_items = [metric_items]

        result_files, tmp_dir = self.format_results(results, outfile_prefix)

        # mapping of cocoEval.stats
        coco_metric_names = {
            'mAP': 0,
            'mAP_50': 1,
            'mAP_75': 2,
            'mAP_s': 3,
            'mAP_m': 4,
            'mAP_l': 5,
            'AR@100': 6,
            'AR@300': 7,
            'AR@1000': 8,
            'AR_s@1000': 9,
            'AR_m@1000': 10,
            'AR_l@1000': 11
        }

        try:
            cocoGt = self.coco
            for name in metrics:
                msg = f'Evaluating {name}...'
                if logger is None:
                    msg = '\n' + msg
                print_log(msg, logger=logger)

                # NOTE: 'proposal_fast' and 'proposal' are rejected by the
                # ``allowed_metrics`` check above, so the two branches
                # handling them are currently unreachable. They are kept so
                # that re-enabling those metrics only needs a whitelist edit.
                if name == 'proposal_fast':
                    ar = self.fast_eval_recall(
                        results, proposal_nums, iou_thrs, logger='silent')
                    log_msg = []
                    for i, num in enumerate(proposal_nums):
                        eval_results[f'AR@{num}'] = ar[i]
                        log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
                    log_msg = ''.join(log_msg)
                    print_log(log_msg, logger=logger)
                    continue

                if name == 'panoptic':
                    from panopticapi.evaluation import pq_compute

                    # pq_compute is chatty; silence its stdout output.
                    with contextlib.redirect_stdout(io.StringIO()):
                        pq_res = pq_compute(
                            self.ann_file['panoptic_ann'],
                            result_files['panoptic'],
                            gt_folder=self.seg_prefix,
                            pred_folder=result_files['panoptic'].split('.')[0])
                    # Use a dedicated name so the ``results`` argument is
                    # not clobbered for metrics evaluated afterwards.
                    pq_results = parse_pq_results(pq_res)
                    for k, v in pq_results.items():
                        eval_results[f'{name}_{k}'] = f'{float(v):0.3f}'
                    print_log(
                        'Panoptic Evaluation Results:\n' +
                        _print_panoptic_results(pq_res),
                        logger=logger)
                    continue

                iou_type = 'bbox' if name == 'proposal' else name
                if name not in result_files:
                    raise KeyError(f'{name} is not in results')
                try:
                    predictions = mmcv.load(result_files[name])
                    if iou_type == 'segm':
                        # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331  # noqa
                        # When evaluating mask AP, if the results contain
                        # bbox, cocoapi will use the box area instead of the
                        # mask area for calculating the instance area. Though
                        # the overall AP is not affected, this leads to
                        # different small, medium, and large mask AP results.
                        for x in predictions:
                            x.pop('bbox')
                    cocoDt = cocoGt.loadRes(predictions)
                except IndexError:
                    print_log(
                        'The testing results of the whole dataset is empty.',
                        logger=logger,
                        level=logging.ERROR)
                    break

                cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
                cocoEval.params.catIds = self.cat_ids
                cocoEval.params.imgIds = self.img_ids
                cocoEval.params.maxDets = list(proposal_nums)
                cocoEval.params.iouThrs = iou_thrs

                if metric_items is not None:
                    for metric_item in metric_items:
                        if metric_item not in coco_metric_names:
                            raise KeyError(
                                f'metric item {metric_item} is not supported')

                if name == 'proposal':
                    cocoEval.params.useCats = 0
                    cocoEval.evaluate()
                    cocoEval.accumulate()
                    cocoEval.summarize()
                    # Per-metric defaults are kept local so that they do not
                    # leak into the metrics evaluated afterwards.
                    items = metric_items
                    if items is None:
                        items = [
                            'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
                            'AR_m@1000', 'AR_l@1000'
                        ]

                    for item in items:
                        val = float(
                            f'{cocoEval.stats[coco_metric_names[item]]:.3f}')
                        eval_results[item] = val
                else:
                    cocoEval.evaluate()
                    cocoEval.accumulate()
                    cocoEval.summarize()
                    if classwise:  # Compute per-category AP
                        # from https://github.com/facebookresearch/detectron2/
                        precisions = cocoEval.eval['precision']
                        # precision: (iou, recall, cls, area range, max dets)
                        assert len(self.cat_ids) == precisions.shape[2]

                        results_per_category = []
                        for idx, catId in enumerate(self.cat_ids):
                            # area range index 0: all area ranges
                            # max dets index -1: typically 100 per image
                            nm = self.coco.loadCats(catId)[0]
                            precision = precisions[:, :, idx, 0, -1]
                            precision = precision[precision > -1]
                            if precision.size:
                                ap = np.mean(precision)
                            else:
                                ap = float('nan')
                            results_per_category.append(
                                (f'{nm["name"]}', f'{float(ap):0.3f}'))

                        num_columns = min(6, len(results_per_category) * 2)
                        results_flatten = list(
                            itertools.chain(*results_per_category))
                        headers = ['category', 'AP'] * (num_columns // 2)
                        results_2d = itertools.zip_longest(*[
                            results_flatten[i::num_columns]
                            for i in range(num_columns)
                        ])
                        table_data = [headers]
                        table_data += [result for result in results_2d]
                        table = AsciiTable(table_data)
                        print_log('\n' + table.table, logger=logger)

                    items = metric_items
                    if items is None:
                        items = [
                            'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m',
                            'mAP_l'
                        ]

                    for metric_item in items:
                        key = f'{name}_{metric_item}'
                        val = float(
                            f'{cocoEval.stats[coco_metric_names[metric_item]]:.3f}'
                        )
                        eval_results[key] = val
                    ap = cocoEval.stats[:6]
                    eval_results[f'{name}_mAP_copypaste'] = (
                        f'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
                        f'{ap[4]:.3f} {ap[5]:.3f}')
        finally:
            # Remove the temporary json files even if an evaluator raised.
            if tmp_dir is not None:
                tmp_dir.cleanup()
        return eval_results

    def _evaluate_cityscapes(self, results, txtfile_prefix, logger):
        """Evaluation in Cityscapes protocol.

        Args:
            results (list): Testing results of the dataset.
            txtfile_prefix (str | None): The prefix of output txt file
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.

        Returns:
            dict[str: float]: Cityscapes evaluation results, contains 'mAP' \
                and 'AP@50'.
        """

        try:
            import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval  # noqa
        except ImportError:
            raise ImportError('Please run "pip install citscapesscripts" to '
                              'install cityscapesscripts first.')
        msg = 'Evaluating in Cityscapes style'
        if logger is None:
            msg = '\n' + msg
        print_log(msg, logger=logger)

        result_files, tmp_dir = self.format_results(results, txtfile_prefix)

        if tmp_dir is None:
            result_dir = osp.join(txtfile_prefix, 'results')
        else:
            result_dir = osp.join(tmp_dir.name, 'results')

        eval_results = OrderedDict()
        print_log(f'Evaluating results under {result_dir} ...', logger=logger)

        # set global states in cityscapes evaluation API
        CSEval.args.cityscapesPath = os.path.join(self.img_prefix, '../..')
        CSEval.args.predictionPath = os.path.abspath(result_dir)
        CSEval.args.predictionWalk = None
        CSEval.args.JSONOutput = False
        CSEval.args.colorized = False
        CSEval.args.gtInstancesFile = os.path.join(result_dir,
                                                   'gtInstances.json')
        CSEval.args.groundTruthSearch = os.path.join(
            self.img_prefix.replace('leftImg8bit', 'gtFine'),
            '*/*_gtFine_instanceIds.png')

        groundTruthImgList = glob.glob(CSEval.args.groundTruthSearch)
        assert len(groundTruthImgList), 'Cannot find ground truth images' \
            f' in {CSEval.args.groundTruthSearch}.'
        predictionImgList = []
        for gt in groundTruthImgList:
            predictionImgList.append(CSEval.getPrediction(gt, CSEval.args))
        CSEval_results = CSEval.evaluateImgLists(predictionImgList,
                                                 groundTruthImgList,
                                                 CSEval.args)['averages']

        eval_results['mAP'] = CSEval_results['allAp']
        eval_results['AP@50'] = CSEval_results['allAp50%']
        if tmp_dir is not None:
            tmp_dir.cleanup()
        return eval_results

================================================
FILE: external/cityscapes_step.py
================================================
import os

import numpy as np

from mmdet.datasets.builder import DATASETS
from mmdet.datasets.pipelines.compose import Compose

from external.dataset.mIoU import eval_miou


@DATASETS.register_module()
class CityscapesSTEP:
    """Cityscapes image dataset with semantic and instance label maps.

    Samples are discovered by listing ``<data_root>/leftImg8bit/<split>`` and
    are identified by a ``(location, seq_id, img_id)`` tuple parsed from the
    file names. Annotations are read from ``<data_root>/gtFine/<split>``.

    Args:
        pipeline (list[dict] | None): Config of the processing pipeline.
            ``None`` is treated as an empty pipeline.
        data_root (str): Root folder of cityscapes. It must contain
            ``license.txt``, ``leftImg8bit`` and ``gtFine``.
        test_mode (bool): If True, ``__getitem__`` only loads the image.
        split (str): Sub-folder (split) to use. Default: 'train'.
    """
    CLASSES = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole',
               'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky',
               'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
               'bicycle')

    def __init__(
            self,
            pipeline=None,
            data_root=None,
            test_mode=False,
            split='train',
    ):
        # Let's figure out where is the cityscapes first
        assert os.path.exists(os.path.join(data_root, 'license.txt')), \
            "It seems that '{}' is not the root folder of cityscapes".format(data_root)
        assert os.path.exists(os.path.join(data_root, 'leftImg8bit')), \
            "leftImg8bit cannot be found."
        assert os.path.exists(os.path.join(data_root, 'gtFine')), \
            "gtFine cannot be found."

        if pipeline is None:
            pipeline = []

        image_main_dir = os.path.join(data_root, 'leftImg8bit', split)
        gt_dir = os.path.join(data_root, 'gtFine', split)

        # File names look like "<location>_<seq>_<img>_<suffix>".
        samples = []
        for loc in os.listdir(image_main_dir):
            for sample in os.listdir(os.path.join(image_main_dir, loc)):
                location, seq_id, img_id, _ = sample.split('_')
                assert location == loc
                samples.append((location, int(seq_id), int(img_id)))
        self.samples = sorted(samples)

        # Set the image dirs
        self.gt_dir = gt_dir
        self.img_dir = image_main_dir

        self.pipeline = Compose(pipeline)
        # Annotation-only pipeline used to fetch the ground truth in evaluate.
        self.load_ann_pipeline = Compose([
            dict(
                type='LoadAnnotationsInstanceMasks',
                with_mask=False,
                with_seg=True,
                with_inst=True,
            ),
        ])
        self.test_mode = test_mode

        self.flag = self._set_groups()

        # eval
        self.max_ins = 1000
        self.no_obj_id = 255

    def _img_path(self, sample):
        """Return the image path of a ``(location, seq_id, img_id)`` sample."""
        return os.path.join(self.img_dir, sample[0],
                            '{}_{:06d}_{:06d}_leftImg8bit.png'.format(*sample))

    def _ann_info(self, sample):
        """Return the semantic/instance label map paths of a sample."""
        return {
            'seg_map': os.path.join(self.gt_dir, sample[0],
                                    '{}_{:06d}_{:06d}_gtFine_labelTrainIds.png'.format(*sample)),
            'inst_map': os.path.join(self.gt_dir, sample[0],
                                     '{}_{:06d}_{:06d}_gtFine_instanceTrainIds.png'.format(*sample)),
        }

    def pre_pipeline(self, results):
        """Add the bookkeeping fields to ``results`` and return it."""
        results['img_prefix'] = None
        results['img_fields'] = []
        results['mask_fields'] = []
        results['seg_fields'] = []
        results['bbox_fields'] = []
        return results

    def prepare_test_img(self, idx):
        """Run the main pipeline on the image of sample ``idx`` only."""
        results = {
            'img_info': {
                'filename': self._img_path(self.samples[idx])
            }
        }
        results = self.pre_pipeline(results)
        return self.pipeline(results)

    def prepare_val_annotation(self, idx):
        """Load only the annotations of sample ``idx``."""
        results = {'ann_info': self._ann_info(self.samples[idx])}
        results = self.pre_pipeline(results)
        return self.load_ann_pipeline(results)

    def prepare_train_img(self, idx):
        """Run the main pipeline on image and annotations of sample ``idx``."""
        sample = self.samples[idx]
        results = {
            'img_info': {
                'filename': self._img_path(sample)
            },
            'ann_info': self._ann_info(sample),
        }
        results = self.pre_pipeline(results)
        return self.pipeline(results)

    # Copy and Modify from mmdet
    def __getitem__(self, idx):
        """Get training/test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training/test data (with annotation if `test_mode` is set \
                False).
        """

        if self.test_mode:
            return self.prepare_test_img(idx)

        while True:
            cur_data = self.prepare_train_img(idx)
            if cur_data is not None:
                return cur_data
            # The pipeline rejected this sample; retry with a random one.
            idx = self._rand_another(idx)

    def _rand_another(self, idx):
        """Get another random index.

        All samples belong to the same group (see ``_set_groups``), so the
        index is drawn from the whole dataset.
        """
        pool = np.arange(len(self))
        return np.random.choice(pool)

    def __len__(self):
        return len(self.samples)

    def _set_groups(self):
        """Return the group flag of every sample (a single group, 0)."""
        return np.zeros((len(self)), dtype=np.int64)

    # The evaluate func
    def evaluate(
            self,
            results,
            **kwargs
    ):
        """Compute PQ (overall / stuff / things) and mIoU of ``results``.

        Args:
            results (list[tuple]): One 5-tuple per sample. Only the third
                entry is used; it is ``(inst_map, seg_info)`` where
                ``seg_info`` is a list of dicts with the keys ``id``,
                ``category_id`` and ``isthing``. The arrays are not modified.
            **kwargs: Ignored.

        Returns:
            dict[str, float]: "PQ", "Stuff PQ", "Things PQ" and "mIoU", all
                scaled to percent.
        """
        num_thing_classes = 8
        num_stuff_classes = 11
        num_classes = num_thing_classes + num_stuff_classes

        # Real class ids of the thing classes predicted by the model.
        thing_knet2real = [11, 13]

        pred_results_handled = []
        sem_preds = []
        for item in results:
            _, _, seg_results, _, _ = item
            # in seg_info id starts from 1
            inst_map, seg_info = seg_results
            # Work on a copy: stuff ids are zeroed below, and doing that on
            # the caller's array made a second evaluate() call on the same
            # results silently produce different numbers.
            inst_map = inst_map.copy()
            # Everything not covered by a segment gets the extra void class.
            cat_map = np.zeros_like(inst_map) + num_classes
            for instance in seg_info:
                cat_cur = instance['category_id']
                if instance['isthing']:
                    cat_cur = thing_knet2real[cat_cur]
                else:
                    # Stuff ids are 1-based and skip the thing classes;
                    # shift them back into the real class id space.
                    cat_cur -= 1
                    offset = 0
                    for thing_id in thing_knet2real:
                        if cat_cur + offset >= thing_id:
                            offset += 1
                    cat_cur += offset
                assert cat_cur < num_classes
                mask = inst_map == instance['id']
                cat_map[mask] = cat_cur
                if not instance['isthing']:
                    # Stuff segments carry no instance id.
                    inst_map[mask] = 0
            pred_results_handled.append(
                cat_map.astype(np.int32) * self.max_ins + inst_map.astype(np.int32))
            sem_preds.append(cat_map)

        gt_panseg = []
        sem_targets = []
        for idx in range(len(self)):
            ann = self.prepare_val_annotation(idx)
            panseg_map = ann['gt_instance_map']
            sem_targets.append(panseg_map // self.max_ins)
            gt_panseg.append(panseg_map)

        vpq_results = [
            vpq_eval([pred, gt])
            for pred, gt in zip(pred_results_handled, gt_panseg)
        ]

        def _accumulate(position):
            # Sum one statistic over all images and drop the void class.
            stacked = np.stack([result[position] for result in vpq_results])
            return stacked.sum(axis=0)[:num_classes]

        iou_per_class = _accumulate(0)
        tp_per_class = _accumulate(1)
        fn_per_class = _accumulate(2)
        fp_per_class = _accumulate(3)

        # calculate the PQs
        # Classes that never occur yield 0/0 = nan here; that is intended
        # (they are zeroed by nan_to_num below), so silence the warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            sq = iou_per_class / tp_per_class
            rq = tp_per_class / (tp_per_class + 0.5 *
                                 fn_per_class + 0.5 * fp_per_class)
        pq = sq * rq

        things_index = np.zeros((num_classes,)).astype(bool)
        things_index[thing_knet2real] = True
        stuff_pq = pq[np.logical_not(things_index)]
        things_pq = pq[things_index]

        miou_per_class = eval_miou(sem_preds, sem_targets, num_classes=num_classes)

        print("class        pq\t\tsq\t\trq\t\ttp\t\tfp\t\tfn\t\tmIoU")

        for i in range(len(self.CLASSES)):
            print("{}{}{:.3f}\t\t{:.3f}\t\t{:.3f}\t\t{:.0f}\t\t{:.0f}\t\t{:.0f}\t\t{:.3f}".format(
                self.CLASSES[i], ' '*(13 - len(self.CLASSES[i])), pq[i], sq[i], rq[i], tp_per_class[i],
                fp_per_class[i], fn_per_class[i], miou_per_class[i]
            ))

        return {
            "PQ": np.nan_to_num(pq).mean() * 100,
            "Stuff PQ": np.nan_to_num(stuff_pq).mean() * 100,
            "Things PQ": np.nan_to_num(things_pq).mean() * 100,
            "mIoU":np.nan_to_num(miou_per_class).mean() * 100,
        }


def vpq_eval(element, max_ins=1000, ign_id=255, offset=256 * 256,
             num_cat=19 + 1):
    """Compute the per-class panoptic matching statistics of one image.

    Segment ids are encoded as ``category * max_ins + instance``. A predicted
    and a ground-truth segment of the same category are matched when their
    IoU exceeds 0.5; pixels of the prediction that fall on the ground-truth
    void segment (``ign_id * max_ins``) are excluded from the union.

    Args:
        element (Sequence[numpy.ndarray]): ``(pred_ids, gt_ids)``, two
            integer id maps that are combined element-wise.
        max_ins (int): Multiplier separating category and instance id.
            Default: 1000.
        ign_id (int): Category id of ignored ground-truth regions.
            Default: 255.
        offset (int): Multiplier used to pack a (gt id, pred id) pair into
            one integer. It must be larger than every prediction id.
            Default: 256 * 256.
        num_cat (int): Length of the returned per-class vectors.
            Default: 20.

    Returns:
        tuple[numpy.ndarray]: ``(iou_per_class, tp_per_class, fn_per_class,
            fp_per_class)``, each a float64 vector of length ``num_cat``.
    """
    pred_ids, gt_ids = element

    iou_per_class = np.zeros(num_cat, dtype=np.float64)
    tp_per_class = np.zeros(num_cat, dtype=np.float64)
    fn_per_class = np.zeros(num_cat, dtype=np.float64)
    fp_per_class = np.zeros(num_cat, dtype=np.float64)

    def _ids_to_counts(id_array):
        # Plain Python ints are used for keys and counts so that the id
        # arithmetic below can never overflow a narrow NumPy integer type.
        ids, counts = np.unique(id_array, return_counts=True)
        return dict(zip(ids.tolist(), counts.tolist()))

    pred_areas = _ids_to_counts(pred_ids)
    gt_areas = _ids_to_counts(gt_ids)

    void_id = ign_id * max_ins
    ign_ids = {gt_id for gt_id in gt_areas if gt_id // max_ins == ign_id}

    # Pack every (gt id, pred id) pixel pair into a single integer.
    int_ids = gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64)
    int_areas = _ids_to_counts(int_ids)

    def prediction_void_overlap(pred_id):
        return int_areas.get(void_id * offset + pred_id, 0)

    def prediction_ignored_overlap(pred_id):
        return sum(
            int_areas.get(_ign_id * offset + pred_id, 0)
            for _ign_id in ign_ids)

    gt_matched = set()
    pred_matched = set()

    for int_id, int_area in int_areas.items():
        gt_id = int_id // offset
        gt_cat = gt_id // max_ins
        pred_id = int_id % offset
        pred_cat = pred_id // max_ins
        if gt_cat != pred_cat:
            continue
        union = (
            gt_areas[gt_id] + pred_areas[pred_id] - int_area -
            prediction_void_overlap(pred_id)
        )
        iou = int_area / union
        if iou > 0.5:
            tp_per_class[gt_cat] += 1
            iou_per_class[gt_cat] += iou
            gt_matched.add(gt_id)
            pred_matched.add(pred_id)

    # Unmatched, non-ignored ground-truth segments are false negatives.
    for gt_id in gt_areas:
        if gt_id in gt_matched:
            continue
        cat_id = gt_id // max_ins
        if cat_id == ign_id:
            continue
        fn_per_class[cat_id] += 1

    # Unmatched predictions are false positives unless they mostly lie on
    # ignored ground truth.
    for pred_id, pred_area in pred_areas.items():
        if pred_id in pred_matched:
            continue
        if (prediction_ignored_overlap(pred_id) / pred_area) > 0.5:
            continue
        fp_per_class[pred_id // max_ins] += 1

    return iou_per_class, tp_per_class, fn_per_class, fp_per_class


if __name__ == '__main__':
    # Manual smoke test: build the training dataset and print every sample.
    # These two modules are imported only for their side effects
    # (presumably registering the custom pipeline steps used below --
    # TODO confirm).
    import dataset.pipelines.loading
    import dataset.pipelines.transforms

    norm_cfg = {
        'mean': [123.675, 116.28, 103.53],
        'std': [58.395, 57.12, 57.375],
        'to_rgb': True,
    }
    thing_ids = [11, 13]
    collected_keys = ['img', 'gt_masks', 'gt_labels', 'gt_semantic_seg']
    collected_meta_keys = ('ori_shape', 'img_shape', 'pad_shape',
                           'scale_factor', 'flip', 'flip_direction',
                           'img_norm_cfg')
    pipeline_cfg = [
        {'type': 'LoadImageFromFile'},
        {'type': 'LoadAnnotationsInstanceMasks', 'cherry': list(thing_ids)},
        {'type': 'KNetInsAdapterCherryPick', 'stuff_nums': 11,
         'cherry': list(thing_ids)},
        {'type': 'Resize', 'img_scale': (1024, 2048),
         'ratio_range': [0.5, 2.0], 'keep_ratio': True},
        {'type': 'RandomFlip', 'flip_ratio': 0.5},
        {'type': 'RandomCrop', 'crop_size': (1024, 2048)},
        dict(norm_cfg, type='Normalize'),
        {'type': 'PadFutureMMDet', 'size_divisor': 32,
         'pad_val': {'img': 0, 'masks': 0, 'seg': 255}},
        {'type': 'DefaultFormatBundle'},
        {'type': 'Collect', 'keys': collected_keys,
         'meta_keys': collected_meta_keys},
    ]
    step_dataset = CityscapesSTEP(
        pipeline=pipeline_cfg,
        data_root='data/cityscapes',
        test_mode=False,
        split='train',
    )
    for sample in step_dataset:
        print(sample)


================================================
FILE: external/cityscapes_vps.py
================================================
import contextlib
import io
import itertools
import os
import glob
import tempfile
import logging
import os.path as osp
from collections import OrderedDict

import pycocotools.mask as maskUtils

import mmcv
import numpy as np
from mmcv.utils import print_log
from mmdet.datasets.builder import DATASETS
from mmdet.datasets.coco import CocoDataset
from mmdet.datasets.api_wrappers import COCO, COCOeval
from terminaltables import AsciiTable
from external.coco_panoptic import parse_pq_results, _print_panoptic_results


@DATASETS.register_module()
class CityscapesVPSDataset(CocoDataset):
    def __init__(self,
                 ann_file,
                 pipeline,
                 data_root=None,
                 img_prefix=None,
                 seg_prefix=None,
                 proposal_file=None,
                 test_mode=False,
                 offsets=None,
                 ref_prefix=None,
                 nframes_span_test=6):
        """Set up the dataset and the lookup tables for reference frames.

        Args:
            ann_file (dict): Annotation paths; the ``'ins_ann'`` and
                ``'panoptic_ann'`` entries are read by the loaders.
            pipeline (list[dict]): Processing pipeline configuration.
            data_root (str | None): Forwarded to the parent class.
            img_prefix (str | None): Forwarded to the parent class.
            seg_prefix (str | None): Forwarded to the parent class.
            proposal_file (str | None): Forwarded to the parent class.
            test_mode (bool): Forwarded to the parent class.
            offsets (list[int] | None): Image-id offsets tried by
                ``prepare_train_img`` when sampling a reference frame.
            ref_prefix (str | None): Stored into test samples as
                ``'ref_prefix'``.
            nframes_span_test (int): Group size used by
                ``prepare_test_img``; the first frame of each group uses
                itself as the previous frame.
        """
        super().__init__(
            ann_file=ann_file,
            pipeline=pipeline,
            data_root=data_root,
            img_prefix=img_prefix,
            seg_prefix=seg_prefix,
            proposal_file=proposal_file,
            test_mode=test_mode)

        self.ref_prefix = ref_prefix
        self.offsets = offsets
        self.nframes_span_test = nframes_span_test

        # Hack: the reference images are loaded through a second image-info
        # list, indexed by image id for direct lookup.
        ref_infos = self.load_ref_annotations(self.ann_file)
        self.ref_img_infos = ref_infos
        self.iid2_img_infos = {info['id']: info for info in ref_infos}

    # Class names handed to the COCO API in ``load_annotations`` to resolve
    # category ids, and used by ``results2txt`` to name predicted classes.
    CLASSES = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
               'bicycle')

    def load_ref_annotations(self, ann_file):
        """Load the instance annotations used for reference frames.

        Populates ``ref_coco``, ``ref_cat_ids``, ``ref_cat2label`` (labels
        start at 1) and ``ref_img_ids`` on the instance.

        Args:
            ann_file (dict): Annotation paths; only ``'ins_ann'`` is read.

        Returns:
            list[dict]: One image-info dict per image id, each with a
                ``'filename'`` entry copied from ``'file_name'``.
        """
        ref_coco = COCO(ann_file['ins_ann'])
        self.ref_coco = ref_coco
        self.ref_cat_ids = ref_coco.getCatIds()
        self.ref_cat2label = dict(
            zip(self.ref_cat_ids, range(1, len(self.ref_cat_ids) + 1)))
        self.ref_img_ids = ref_coco.getImgIds()

        def _with_filename(img_id):
            record = ref_coco.loadImgs([img_id])[0]
            record['filename'] = record['file_name']
            return record

        return [_with_filename(img_id) for img_id in self.ref_img_ids]

    def load_annotations(self, ann_file):
        """Load instance and panoptic annotations and build the id mappings.

        Args:
            ann_file (dict): Annotation paths. ``'ins_ann'`` is a COCO style
                instance annotation file and ``'panoptic_ann'`` is a file
                with a ``'categories'`` list.

        Returns:
            list[dict]: Image info from the COCO api, in ascending image id
                order, each with ``'filename'`` copied from ``'file_name'``.
        """
        coco = COCO(ann_file['ins_ann'])
        self.coco = coco
        self.cat_ids = coco.get_cat_ids(cat_names=self.CLASSES)
        self.cat2label = {
            cat_id: label for label, cat_id in enumerate(self.cat_ids)
        }
        self.img_ids = sorted(coco.get_img_ids())

        self.panoptic_anns = mmcv.load(ann_file['panoptic_ann'])
        categories = self.panoptic_anns['categories']
        self.stuff_ids = [
            cat['trainid'] for cat in categories if cat['isthing'] == 0
        ]
        self.thing_ids = [
            cat['trainid'] for cat in categories if cat['isthing'] == 1
        ]

        # The panoptic thing categories must line up with the instance ones.
        assert self.thing_ids == self.cat_ids

        # Segmentation index -> stuff id; index 0 maps to id 0.
        self.seg2stuff_ids = dict(enumerate(self.stuff_ids, start=1))
        self.seg2stuff_ids[0] = 0

        # Instance label -> thing id.
        self.ins2thing_ids = dict(enumerate(self.thing_ids))

        data_infos = []
        seen_ann_ids = []
        for img_id in self.img_ids:
            record = coco.load_imgs([img_id])[0]
            record['filename'] = record['file_name']
            data_infos.append(record)
            seen_ann_ids += coco.get_ann_ids(img_ids=[img_id])
        assert len(set(seen_ann_ids)) == len(
            seen_ann_ids), f"Annotation ids in '{ann_file}' are not unique!"
        return data_infos

    def _filter_imgs(self, min_size=32):
        """Filter images too small or without ground truths."""
        valid_inds = []
        # obtain images that contain annotation
        ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values())
        # obtain images that contain annotations of the required categories
        ids_in_cat = set()
        for i, class_id in enumerate(self.cat_ids):
            ids_in_cat |= set(self.coco.cat_img_map[class_id])
        # merge the image id sets of the two conditions and use the merged set
        # to filter out images if self.filter_empty_gt=True
        ids_in_cat &= ids_with_ann

        valid_img_ids = []
        for i, img_info in enumerate(self.data_infos):
            img_id = img_info['id']
            ann_ids = self.coco.getAnnIds(imgIds=[img_id])
            ann_info = self.coco.loadAnns(ann_ids)
            all_iscrowd = all([_['iscrowd'] for _ in ann_info])
            if self.filter_empty_gt and (self.img_ids[i] not in ids_in_cat
                                         or all_iscrowd):
                continue
            if min(img_info['width'], img_info['height']) >= min_size:
                valid_inds.append(i)
                valid_img_ids.append(img_id)
        self.img_ids = valid_img_ids
        return valid_inds

    def prepare_train_img(self, idx):
        """Get training data and annotations after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training data and annotation after pipeline with new keys \
                introduced by pipeline.
        """

        img_info = self.data_infos[idx]
        ann_info = self.get_ann_info(idx)
        results = [dict(img_info=img_info, ann_info=ann_info)]

        iid = img_info['id']
        # self.offsets = [-1, 1] for Cityscapes_VPS
        offsets = self.offsets.copy()
        # random sampling of future or past 5-th frame [-1, 1]
        while True:
            m = np.random.choice(offsets)
            ref_iid = iid + m
            if ref_iid in self.img_ids and self.check_whether_has_correspondence(ref_iid, iid):
                break
            offsets.remove(m)
            # If all offset values fail, return None.
            if len(offsets) == 0:
                return None
        # Reference image: information, annotations
        ref_iid = iid + m

        ref_img_info = self.iid2_img_infos[ref_iid]
        ref_ann_info = self.get_ref_ann_info_by_iid(ref_iid, ref_img_info)
        results.append(dict(img_info=ref_img_info, ann_info=ref_ann_info))

        if self.proposals is not None:
            results['proposals'] = self.proposals[idx]

        self.pre_pipeline(results)

        return self.pipeline(results)

    def check_whether_has_correspondence(self, ref_iid, iid):
        ref_img_info = self.iid2_img_infos[ref_iid]
        ref_ann_info = self.get_ref_ann_info_by_iid(ref_iid, ref_img_info)

        img_info = self.iid2_img_infos[iid]
        ann_info = self.get_ref_ann_info_by_iid(iid, img_info)
        nomatch = self.check_match(ref_ann_info, ann_info)
        if nomatch:  # no match
            return False
        else:
            return True

    def check_match(self, ref_ann_info, ann_info):
        ref_ids = ref_ann_info['instance_ids'].tolist()
        gt_ids = ann_info['instance_ids'].tolist()
        gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids]
        nomatch = (np.array(gt_pids) == -1).all()
        return nomatch

    def prepare_test_img(self, idx):
        """Get testing data  after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Testing data after pipeline with new keys introduced by \
                pipeline.
        """

        img_info = self.data_infos[idx]
        prev_img_info = self.data_infos[idx - 1] if idx % (self.nframes_span_test) > 0 else img_info
        img_info['ref_id'] = prev_img_info['id'] - 1
        img_info['ref_filename'] = prev_img_info['file_name']
        results = dict(img_info=img_info)
        if self.proposals is not None:
            results['proposals'] = self.proposals[idx]
        self.pre_test_pipeline(results)
        return self.pipeline(results)

    def pre_pipeline(self, results):
        """Prepare results dict for pipeline."""
        for result in results:
            result['img_prefix'] = self.img_prefix
            result['seg_prefix'] = self.seg_prefix
            result['proposal_file'] = self.proposal_file
            result['bbox_fields'] = []
            result['mask_fields'] = []
            result['seg_fields'] = []
            seg_filename = result['ann_info']['seg_map'].replace('leftImg8bit', 'gtFine_color').\
                replace('newImg8bit', 'final_mask')

            result['ann_info']['seg_map'] = seg_filename

    def pre_test_pipeline(self, results):
        results['img_prefix'] = self.img_prefix
        results['seg_prefix'] = self.seg_prefix
        results['ref_prefix'] = self.ref_prefix
        results['proposal_file'] = self.proposal_file
        results['bbox_fields'] = []
        results['mask_fields'] = []
        results['ref_bbox_fields'] = []
        results['ref_mask_fields'] = []

    def _parse_ann_info(self, img_info, ann_info):
        """Parse bbox and mask annotation.

        Args:
            img_info (dict): Image info of an image.
            ann_info (list[dict]): Annotation info of an image.

        Returns:
            dict: A dict containing the following keys: bboxes, \
                bboxes_ignore, labels, masks, seg_map. \
                "masks" are already decoded into binary masks.
        """
        gt_bboxes = []
        gt_labels = []
        gt_bboxes_ignore = []
        gt_masks_ann = []
        gt_obj_ids = []

        for i, ann in enumerate(ann_info):
            if ann.get('ignore', False):
                continue
            x1, y1, w, h = ann['bbox']
            if ann['area'] <= 0 or w < 1 or h < 1:
                continue
            if ann['category_id'] not in self.cat_ids:
                continue
            bbox = [x1, y1, x1 + w, y1 + h]
            if ann.get('iscrowd', False):
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_labels.append(self.cat2label[ann['category_id']])
                gt_masks_ann.append(ann['segmentation'])
                gt_obj_ids.append(ann['inst_id'])

        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
            gt_obj_ids = np.array(gt_obj_ids, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)
            gt_obj_ids = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        seg_map = img_info['filename'].replace('jpg', 'png')

        ann = dict(
            bboxes=gt_bboxes,
            labels=gt_labels,
            bboxes_ignore=gt_bboxes_ignore,
            masks=gt_masks_ann,
            instance_ids=gt_obj_ids,
            seg_map=seg_map)

        return ann

    def get_ref_ann_info_by_iid(self, img_id, ref_img_info):
        ann_ids = self.ref_coco.getAnnIds(imgIds=[img_id])
        ann_info = self.ref_coco.loadAnns(ann_ids)
        return self._parse_ann_info(ref_img_info, ann_info)

    def _panoptic2json(self, results, outfile_prefix):
        """Write panoptic PNGs and build the matching json records.

        For every sample the PNG bytes are written into the directory
        ``outfile_prefix`` (all images go into this one folder) and the
        category ids of its segments are mapped in place through
        ``ins2thing_ids`` / ``seg2stuff_ids``.

        Args:
            results (list[tuple]): Per-sample ``(png_string, segments_info)``
                pairs.
            outfile_prefix (str): Output directory, created if missing.

        Returns:
            list[dict]: One record per sample with the keys ``'image_id'``,
                ``'file_name'`` and ``'segments_info'``.
        """
        mmcv.mkdir_or_exist(outfile_prefix)
        records = []
        for idx in range(len(self)):
            # Not used below; the lookup is retained so the indexing
            # behaviour stays exactly as before.
            img_id = self.img_ids[idx]
            png_string, segments_info = results[idx]
            record = dict()
            # hack
            # To match the corresponding ids for panoptic segmentation
            # prediction: file stem with its last 12 characters removed.
            stem = self.data_infos[idx]['file_name'].split("/")[-1].split(".")[0]
            record['image_id'] = stem[:-12]

            for segment in segments_info:
                if segment.pop('isthing') is True:
                    id_map = self.ins2thing_ids
                else:
                    id_map = self.seg2stuff_ids
                segment['category_id'] = id_map[segment['category_id']]

            # hack: to save all the images into one folder
            png_name = self.data_infos[idx]['file_name'].replace(
                '.jpg', '.png').split("/")[-1]
            record['file_name'] = png_name

            with open(osp.join(outfile_prefix, png_name), 'wb') as png_file:
                png_file.write(png_string)
            record['segments_info'] = segments_info
            records.append(record)
        return records

    def results2json(self, results, outfile_prefix):
        """Dump the results to COCO style json files.

        The type of ``results[0]`` selects the format: a list is treated as
        bbox predictions, a numpy array as proposals, a tuple as bbox + mask
        predictions. A 3-element tuple additionally carries a panoptic
        prediction, which is dumped as well.

        Args:
            results (list[list | tuple | ndarray]): Testing results of the
                dataset.
            outfile_prefix (str): The filename prefix of the json files. If the
                prefix is "somepath/xxx", the json files will be named
                "somepath/xxx.bbox.json", "somepath/xxx.segm.json",
                "somepath/xxx.proposal.json", "somepath/xxx.panoptic.json".

        Returns:
            dict[str: str]: Possible keys are "bbox", "segm", "proposal", \
                "panoptic", and values are corresponding filenames.

        Raises:
            TypeError: If ``results[0]`` is none of the supported types.
        """
        sample = results[0]
        bbox_path = f'{outfile_prefix}.bbox.json'
        result_files = dict()

        if isinstance(sample, list):
            det_json = self._det2json(results)
            result_files['bbox'] = bbox_path
            result_files['proposal'] = bbox_path
            mmcv.dump(det_json, bbox_path)
            return result_files

        if isinstance(sample, np.ndarray):
            proposal_json = self._proposal2json(results)
            result_files['proposal'] = f'{outfile_prefix}.proposal.json'
            mmcv.dump(proposal_json, result_files['proposal'])
            return result_files

        if not isinstance(sample, tuple):
            raise TypeError('invalid type of results')

        if len(sample) == 3:  # dump the panoptic
            instance_segm_results = []
            panoptic_results = []
            for idx in range(len(self)):
                det, seg, panoptic = results[idx]
                instance_segm_results.append([det, seg])
                panoptic_results.append(panoptic)
            panoptic_json = {
                'annotations':
                self._panoptic2json(panoptic_results, outfile_prefix)
            }
            result_files['panoptic'] = f'{outfile_prefix}.panoptic.json'
            mmcv.dump(panoptic_json, result_files['panoptic'])
        else:
            instance_segm_results = results

        bbox_json, segm_json = self._segm2json(instance_segm_results)[:2]
        result_files['bbox'] = bbox_path
        result_files['proposal'] = bbox_path
        result_files['segm'] = f'{outfile_prefix}.segm.json'
        mmcv.dump(bbox_json, result_files['bbox'])
        mmcv.dump(segm_json, result_files['segm'])
        return result_files

    def results2txt(self, results, outfile_prefix):
        """Dump the detection results to a txt file.

        Args:
            results (list[list | tuple]): Testing results of the
                dataset.
            outfile_prefix (str): The filename prefix of the json files.
                If the prefix is "somepath/xxx",
                the txt files will be named "somepath/xxx.txt".

        Returns:
            list[str]: Result txt files which contains corresponding \
                instance segmentation images.
        """
        try:
            import cityscapesscripts.helpers.labels as CSLabels
        except ImportError:
            raise ImportError('Please run "pip install citscapesscripts" to '
                              'install cityscapesscripts first.')
        result_files = []
        os.makedirs(outfile_prefix, exist_ok=True)
        prog_bar = mmcv.ProgressBar(len(self))
        for idx in range(len(self)):
            result = results[idx]
            filename = self.data_infos[idx]['filename']
            basename = osp.splitext(osp.basename(filename))[0]
            pred_txt = osp.join(outfile_prefix, basename + '_pred.txt')

            bbox_result, segm_result = result
            bboxes = np.vstack(bbox_result)
            # segm results
            if isinstance(segm_result, tuple):
                # Some detectors use different scores for bbox and mask,
                # like Mask Scoring R-CNN. Score of segm will be used instead
                # of bbox score.
                segms = mmcv.concat_list(segm_result[0])
                mask_score = segm_result[1]
            else:
                # use bbox score for mask score
                segms = mmcv.concat_list(segm_result)
                mask_score = [bbox[-1] for bbox in bboxes]
            labels = [
                np.full(bbox.shape[0], i, dtype=np.int32)
                for i, bbox in enumerate(bbox_result)
            ]
            labels = np.concatenate(labels)

            assert len(bboxes) == len(segms) == len(labels)
            num_instances = len(bboxes)
            prog_bar.update()
            with open(pred_txt, 'w') as fout:
                for i in range(num_instances):
                    pred_class = labels[i]
                    classes = self.CLASSES[pred_class]
                    class_id = CSLabels.name2label[classes].id
                    score = mask_score[i]
                    mask = maskUtils.decode(segms[i]).astype(np.uint8)
                    png_filename = osp.join(outfile_prefix,
                                            basename + f'_{i}_{classes}.png')
                    mmcv.imwrite(mask, png_filename)
                    fout.write(f'{osp.basename(png_filename)} {class_id} '
                               f'{score}\n')
            result_files.append(pred_txt)

        return result_files

    def format_results(self, results, jsonfile_prefix=None, **kwargs):
        """Format the results to json (standard format for COCO evaluation).

        Args:
            results (list[tuple | numpy.ndarray]): Testing results of the
                dataset.
            jsonfile_prefix (str | None): The prefix of json files. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                If not specified, a temp file will be created. Default: None.

        Returns:
            tuple: (result_files, tmp_dir), result_files is a dict containing \
                the json filepaths, tmp_dir is the temporal directory created \
                for saving json files when jsonfile_prefix is not specified.
        """
        assert isinstance(results, list), 'results must be a list'
        assert len(results) == len(self), (
            'The length of results is not equal to the dataset len: {} != {}'.
            format(len(results), len(self)))

        if jsonfile_prefix is None:
            tmp_dir = tempfile.TemporaryDirectory()
            jsonfile_prefix = osp.join(tmp_dir.name, 'results')
        else:
            tmp_dir = None
        result_files = self.results2json(results, jsonfile_prefix)
        return result_files, tmp_dir

    def evaluate(self,
                 results,
                 metric='bbox',
                 logger=None,
                 outfile_prefix=None,
                 classwise=False,
                 proposal_nums=(100, 300, 1000),
                 iou_thrs=np.arange(0.5, 0.96, 0.05),
                 metric_items=None):
        """Evaluation in Cityscapes/COCO/panoptic protocol.

        Args:
            results (list[list | tuple]): Testing results of the dataset.
            metric (str | list[str]): Metrics to be evaluated. Options are
                'bbox', 'segm', 'cityscapes', 'panoptic'.
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.
            outfile_prefix (str | None): The prefix of output file. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                If results are evaluated with COCO protocol, it would be the
                prefix of output json file. For example, the metric is 'bbox'
                and 'segm', then json files would be "a/b/prefix.bbox.json" and
                "a/b/prefix.segm.json".
                If results are evaluated with cityscapes protocol, it is
                passed on to ``_evaluate_cityscapes``. If not specified, a
                temp file will be created.
                Default: None.
            classwise (bool): Whether to evaluating the AP for each class.
            proposal_nums (Sequence[int]): Used as ``maxDets`` of the COCO
                evaluation. Default: (100, 300, 1000).
            iou_thrs (Sequence[float] | None): IoU thresholds of the COCO
                evaluation. ``None`` selects 0.5:0.05:0.95, which is also
                the default.
            metric_items (list[str] | str | None): Names of the COCO
                statistics to report. ``None`` selects the default set of
                the evaluated metric.

        Returns:
            dict[str, float]: COCO style evaluation metric, panoptic metric \
                and/or cityscapes mAP and AP@50.

        Raises:
            KeyError: If a metric or metric item is not supported, or the
                results do not contain the data a metric needs.
        """
        # Created once: the cityscapes scores stored below must survive until
        # the final return together with the other metrics.
        eval_results = OrderedDict()

        metrics = metric.copy() if isinstance(metric, list) else [metric]
        allowed_metrics = [
            'bbox', 'segm', 'cityscapes', 'panoptic'
        ]
        for metric in metrics:
            if metric not in allowed_metrics:
                raise KeyError(f'metric {metric} is not supported')

        if 'cityscapes' in metrics:
            eval_results.update(
                self._evaluate_cityscapes(results, outfile_prefix, logger))
            metrics.remove('cityscapes')

        if iou_thrs is None:
            iou_thrs = np.linspace(
                .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        if metric_items is not None:
            if not isinstance(metric_items, list):
                metric_items = [metric_items]

        result_files, tmp_dir = self.format_results(results, outfile_prefix)

        cocoGt = self.coco
        for metric in metrics:
            msg = f'Evaluating {metric}...'
            if logger is None:
                msg = '\n' + msg
            print_log(msg, logger=logger)

            if metric == 'proposal_fast':
                ar = self.fast_eval_recall(
                    results, proposal_nums, iou_thrs, logger='silent')
                log_msg = []
                for i, num in enumerate(proposal_nums):
                    eval_results[f'AR@{num}'] = ar[i]
                    log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
                log_msg = ''.join(log_msg)
                print_log(log_msg, logger=logger)
                continue

            if metric == 'panoptic':
                from panopticapi.evaluation import pq_compute
                if metric not in result_files:
                    raise KeyError(f'{metric} is not in results')
                panoptic_json = result_files['panoptic']
                # The PNGs were written into the directory named by the
                # output prefix, i.e. the json path without its
                # '.panoptic.json' suffix. Splitting on the first '.' would
                # break for prefixes such as './work_dirs/run'.
                pred_folder = panoptic_json[:-len('.panoptic.json')]
                with contextlib.redirect_stdout(io.StringIO()):
                    pq_res = pq_compute(
                        self.ann_file['panoptic_ann'],
                        panoptic_json,
                        gt_folder=self.seg_prefix,
                        pred_folder=pred_folder)
                pq_results = parse_pq_results(pq_res)
                for k, v in pq_results.items():
                    eval_results[f'{metric}_{k}'] = f'{float(v):0.3f}'
                print_log(
                    'Panoptic Evaluation Results:\n' +
                    _print_panoptic_results(pq_res),
                    logger=logger)
                continue

            iou_type = 'bbox' if metric == 'proposal' else metric
            if metric not in result_files:
                raise KeyError(f'{metric} is not in results')
            try:
                predictions = mmcv.load(result_files[metric])
                if iou_type == 'segm':
                    # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331  # noqa
                    # When evaluating mask AP, if the results contain bbox,
                    # cocoapi will use the box area instead of the mask area
                    # for calculating the instance area. Though the overall AP
                    # is not affected, this leads to different small, medium,
                    # and large mask AP results.
                    for x in predictions:
                        x.pop('bbox')
                cocoDt = cocoGt.loadRes(predictions)
            except IndexError:
                print_log(
                    'The testing results of the whole dataset is empty.',
                    logger=logger,
                    level=logging.ERROR)
                break

            cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
            cocoEval.params.catIds = self.cat_ids
            cocoEval.params.imgIds = self.img_ids
            cocoEval.params.maxDets = list(proposal_nums)
            cocoEval.params.iouThrs = iou_thrs
            # mapping of cocoEval.stats
            coco_metric_names = {
                'mAP': 0,
                'mAP_50': 1,
                'mAP_75': 2,
                'mAP_s': 3,
                'mAP_m': 4,
                'mAP_l': 5,
                'AR@100': 6,
                'AR@300': 7,
                'AR@1000': 8,
                'AR_s@1000': 9,
                'AR_m@1000': 10,
                'AR_l@1000': 11
            }
            if metric_items is not None:
                for metric_item in metric_items:
                    if metric_item not in coco_metric_names:
                        raise KeyError(
                            f'metric item {metric_item} is not supported')

            if metric == 'proposal':
                cocoEval.params.useCats = 0
                cocoEval.evaluate()
                cocoEval.accumulate()
                cocoEval.summarize()
                if metric_items is None:
                    metric_items = [
                        'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
                        'AR_m@1000', 'AR_l@1000'
                    ]

                for item in metric_items:
                    val = float(
                        f'{cocoEval.stats[coco_metric_names[item]]:.3f}')
                    eval_results[item] = val
            else:
                cocoEval.evaluate()
                cocoEval.accumulate()
                cocoEval.summarize()
                if classwise:  # Compute per-category AP
                    # Compute per-category AP
                    # from https://github.com/facebookresearch/detectron2/
                    precisions = cocoEval.eval['precision']
                    # precision: (iou, recall, cls, area range, max dets)
                    assert len(self.cat_ids) == precisions.shape[2]

                    results_per_category = []
                    for idx, catId in enumerate(self.cat_ids):
                        # area range index 0: all area ranges
                        # max dets index -1: typically 100 per image
                        nm = self.coco.loadCats(catId)[0]
                        precision = precisions[:, :, idx, 0, -1]
                        precision = precision[precision > -1]
                        if precision.size:
                            ap = np.mean(precision)
                        else:
                            ap = float('nan')
                        results_per_category.append(
                            (f'{nm["name"]}', f'{float(ap):0.3f}'))

                    num_columns = min(6, len(results_per_category) * 2)
                    results_flatten = list(
                        itertools.chain(*results_per_category))
                    headers = ['category', 'AP'] * (num_columns // 2)
                    results_2d = itertools.zip_longest(*[
                        results_flatten[i::num_columns]
                        for i in range(num_columns)
                    ])
                    table_data = [headers]
                    table_data += [result for result in results_2d]
                    table = AsciiTable(table_data)
                    print_log('\n' + table.table, logger=logger)

                if metric_items is None:
                    metric_items = [
                        'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
                    ]

                for metric_item in metric_items:
                    key = f'{metric}_{metric_item}'
                    val = float(
                        f'{cocoEval.stats[coco_metric_names[metric_item]]:.3f}'
                    )
                    eval_results[key] = val
                ap = cocoEval.stats[:6]
                eval_results[f'{metric}_mAP_copypaste'] = (
                    f'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
                    f'{ap[4]:.3f} {ap[5]:.3f}')

        if tmp_dir is not None:
            tmp_dir.cleanup()
        return eval_results

    def _evaluate_cityscapes(self, results, txtfile_prefix, logger):
        """Evaluation in Cityscapes protocol.

        Args:
            results (list): Testing results of the dataset.
            txtfile_prefix (str | None): The prefix of output txt file
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.

        Returns:
            dict[str: float]: Cityscapes evaluation results, contains 'mAP' \
                and 'AP@50'.
        """

        try:
            import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval  # noqa
        except ImportError:
            raise ImportError('Please run "pip install citscapesscripts" to '
                              'install cityscapesscripts first.')
        msg = 'Evaluating in Cityscapes style'
        if logger is None:
            msg = '\n' + msg
        print_log(msg, logger=logger)

        result_files, tmp_dir = self.format_results(results, txtfile_prefix)

        if tmp_dir is None:
            result_dir = osp.join(txtfile_prefix, 'results')
        else:
            result_dir = osp.join(tmp_dir.name, 'results')

        eval_results = OrderedDict()
        print_log(f'Evaluating results under {result_dir} ...', logger=logger)

        # set global states in cityscapes evaluation API
        CSEval.args.cityscapesPath = os.path.join(self.img_prefix, '../..')
        CSEval.args.predictionPath = os.path.abspath(result_dir)
        CSEval.args.predictionWalk = None
        CSEval.args.JSONOutput = False
        CSEval.args.colorized = False
        CSEval.args.gtInstancesFile = os.path.join(result_dir,
                                                   'gtInstances.json')
        CSEval.args.groundTruthSearch = os.path.join(
            self.img_prefix.replace('leftImg8bit', 'gtFine'),
            '*/*_gtFine_instanceIds.png')

        groundTruthImgList = glob.glob(CSEval.args.groundTruthSearch)
        assert len(groundTruthImgList), 'Cannot find ground truth images' \
            f' in {CSEval.args.groundTruthSearch}.'
        predictionImgList = []
        for gt in groundTruthImgList:
            predictionImgList.append(CSEval.getPrediction(gt, CSEval.args))
        CSEval_results = CSEval.evaluateImgLists(predictionImgList,
                                                 groundTruthImgList,
                                                 CSEval.args)['averages']

        eval_results['mAP'] = CSEval_results['allAp']
        eval_results['AP@50'] = CSEval_results['allAp50%']
        if tmp_dir is not None:
            tmp_dir.cleanup()
        return eval_results

================================================
FILE: external/coco_panoptic.py
================================================
import contextlib
import io
import itertools
import logging
import tempfile
import os.path as osp
from collections import OrderedDict

import mmcv
import numpy as np
from mmcv.utils import print_log
from mmdet.datasets.builder import DATASETS
from mmdet.datasets.coco import CocoDataset
from mmdet.datasets.api_wrappers import COCO, COCOeval
from terminaltables import AsciiTable


@DATASETS.register_module()
class CocoPanopticDatasetCustom(CocoDataset):

    def load_annotations(self, ann_file):
        """Load instance and panoptic annotations in COCO style.

        Args:
            ann_file (dict): Annotation paths. ``ann_file['ins_ann']`` is
                opened with the COCO api and ``ann_file['panoptic_ann']`` is
                read with ``mmcv.load``.

        Returns:
            list[dict]: Image info from the COCO api, ordered by ascending
                image id. Every entry gets a ``filename`` key mirroring
                ``file_name``.
        """

        self.coco = COCO(ann_file['ins_ann'])
        self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES)
        # category id -> contiguous training label
        self.cat2label = dict(zip(self.cat_ids, range(len(self.cat_ids))))
        self.img_ids = sorted(self.coco.get_img_ids())

        self.panoptic_anns = mmcv.load(ann_file['panoptic_ann'])

        def _ids_with_isthing(flag):
            return [
                category['id']
                for category in self.panoptic_anns['categories']
                if category['isthing'] == flag
            ]

        self.stuff_ids = _ids_with_isthing(0)
        self.thing_ids = _ids_with_isthing(1)

        # The panoptic "thing" categories must line up with the instance
        # categories, because both are indexed by the same label.
        assert self.thing_ids == self.cat_ids

        # Semantic index (1-based) -> stuff category id; index 0 maps to 0.
        self.seg2stuff_ids = dict(enumerate(self.stuff_ids, start=1))
        self.seg2stuff_ids[0] = 0

        # Instance label (0-based) -> thing category id.
        self.ins2thing_ids = dict(enumerate(self.thing_ids))

        data_infos = []
        collected_ann_ids = []
        for img_id in self.img_ids:
            img_info = self.coco.load_imgs([img_id])[0]
            img_info['filename'] = img_info['file_name']
            data_infos.append(img_info)
            collected_ann_ids += self.coco.get_ann_ids(img_ids=[img_id])
        assert len(collected_ann_ids) == len(set(
            collected_ann_ids)), f"Annotation ids in '{ann_file}' are not unique!"
        return data_infos

    def get_ann_info(self, idx):
        """Return the parsed annotations of the ``idx``-th image.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Output of ``_parse_ann_info`` for that image.
        """

        img_info = self.data_infos[idx]
        ann_ids = self.coco.get_ann_ids(img_ids=[img_info['id']])
        raw_anns = self.coco.load_anns(ann_ids)
        return self._parse_ann_info(img_info, raw_anns)

    def get_cat_ids(self, idx):
        """Return the category id of every annotation of the ``idx``-th image.

        Args:
            idx (int): Index of data.

        Returns:
            list[int]: One ``category_id`` per annotation (duplicates kept).
        """

        image_id = self.data_infos[idx]['id']
        annotations = self.coco.load_anns(
            self.coco.get_ann_ids(img_ids=[image_id]))
        category_ids = []
        for annotation in annotations:
            category_ids.append(annotation['category_id'])
        return category_ids

    def _parse_ann_info(self, img_info, ann_info):
        """Parse bbox and mask annotations of one image.

        An annotation is skipped when it is flagged ``ignore``, does not
        overlap the image, has a non-positive area or a side shorter than one
        pixel, or belongs to a category that is not in ``self.cat_ids``.
        Crowd annotations only contribute to ``bboxes_ignore``.

        Args:
            img_info (dict): Image info; ``width``, ``height`` and
                ``filename`` are read.
            ann_info (list[dict]): Annotation info of an image.

        Returns:
            dict: A dict containing the following keys: bboxes, bboxes_ignore,\
                labels, masks, seg_map. "masks" are raw annotations and not \
                decoded into binary masks.
        """
        gt_bboxes = []
        gt_labels = []
        gt_bboxes_ignore = []
        gt_masks_ann = []
        for ann in ann_info:
            if ann.get('ignore', False):
                continue
            x1, y1, w, h = ann['bbox']
            # Width/height of the part of the box that lies inside the image.
            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
            if inter_w * inter_h == 0:
                continue
            if ann['area'] <= 0 or w < 1 or h < 1:
                continue
            if ann['category_id'] not in self.cat_ids:
                continue
            # COCO stores (x, y, w, h); convert to (x1, y1, x2, y2).
            bbox = [x1, y1, x1 + w, y1 + h]
            if ann.get('iscrowd', False):
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_labels.append(self.cat2label[ann['category_id']])
                gt_masks_ann.append(ann.get('segmentation', None))

        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        # Swap only a trailing ``.jpg`` extension. A plain substring replace
        # would also rewrite "jpg" inside directory or base names
        # (e.g. ``jpg_images/1.jpg`` -> ``png_images/1.png``).
        filename = img_info['filename']
        stem, ext = osp.splitext(filename)
        seg_map = f'{stem}.png' if ext == '.jpg' else filename

        ann = dict(
            bboxes=gt_bboxes,
            labels=gt_labels,
            bboxes_ignore=gt_bboxes_ignore,
            masks=gt_masks_ann,
            seg_map=seg_map)

        return ann

    def _panoptic2json(self, results, outfile_prefix):
        """Write panoptic PNGs and build COCO-panoptic style annotations.

        The caller's ``results`` are left untouched: every segment dict is
        copied before ``isthing`` is removed and ``category_id`` is remapped,
        so the same results can be formatted more than once.

        Args:
            results (list[tuple]): Per-image ``(png_string, segments_info)``
                pairs indexed like the dataset. ``png_string`` is written to
                disk verbatim in binary mode; every entry of
                ``segments_info`` is a dict holding at least ``isthing`` and
                ``category_id``.
            outfile_prefix (str): Directory the PNG files are written into.

        Returns:
            list[dict]: One dict per image with ``image_id``, ``file_name``
                (the PNG name relative to ``outfile_prefix``) and
                ``segments_info`` carrying dataset category ids.
        """
        panoptic_json_results = []
        mmcv.mkdir_or_exist(outfile_prefix)
        for idx in range(len(self)):
            img_id = self.img_ids[idx]
            png_string, raw_segments_info = results[idx]

            segments_info = []
            for raw_segment_info in raw_segments_info:
                # Copy first so that popping / remapping does not corrupt the
                # prediction objects owned by the caller.
                segment_info = dict(raw_segment_info)
                isthing = segment_info.pop('isthing')
                cat_id = segment_info['category_id']
                # NOTE(review): the identity check is kept as in the original;
                # it assumes ``isthing`` is a plain Python bool.
                if isthing is True:
                    segment_info['category_id'] = self.ins2thing_ids[cat_id]
                else:
                    segment_info['category_id'] = self.seg2stuff_ids[cat_id]
                segments_info.append(segment_info)

            png_path = self.data_infos[idx]['file_name'].replace(
                '.jpg', '.png')
            png_save_path = osp.join(outfile_prefix, png_path)
            # ``file_name`` may contain sub-directories; make sure they exist.
            mmcv.mkdir_or_exist(osp.dirname(png_save_path))
            with open(png_save_path, 'wb') as f:
                f.write(png_string)

            panoptic_json_results.append(
                dict(
                    image_id=img_id,
                    file_name=png_path,
                    segments_info=segments_info))
        return panoptic_json_results

    def results2json(self, results, outfile_prefix):
        """Dump the results to COCO style json files.

        The type of ``results[0]`` selects what is written:

        - ``list``: bbox predictions -> ``<prefix>.bbox.json``.
        - ``tuple`` of length 3: ``(det, seg, panoptic)`` per image ->
          ``<prefix>.panoptic.json`` (PNG files go to the ``<prefix>``
          directory) plus the bbox / segm files below.
        - any other ``tuple``: ``(det, seg)`` per image ->
          ``<prefix>.bbox.json`` and ``<prefix>.segm.json``.
        - ``np.ndarray``: proposals -> ``<prefix>.proposal.json``.

        Args:
            results (list[list | tuple | ndarray]): Testing results of the
                dataset.
            outfile_prefix (str): The filename prefix of the json files,
                e.g. "somepath/xxx".

        Returns:
            dict[str: str]: Possible keys are "bbox", "segm", "proposal" and \
                "panoptic"; values are the corresponding filenames.

        Raises:
            TypeError: If ``results[0]`` has none of the types above.
        """
        result_files = dict()
        sample = results[0]

        if isinstance(sample, list):
            det_json = self._det2json(results)
            result_files['bbox'] = f'{outfile_prefix}.bbox.json'
            result_files['proposal'] = f'{outfile_prefix}.bbox.json'
            mmcv.dump(det_json, result_files['bbox'])
            return result_files

        if isinstance(sample, tuple):
            if len(sample) != 3:
                instance_segm_results = results
            else:
                # Split every (det, seg, panoptic) triple into the instance
                # part and the panoptic part.
                triples = [results[idx] for idx in range(len(self))]
                instance_segm_results = [[det, seg] for det, seg, _ in triples]
                panoptic_results = [panoptic for _, _, panoptic in triples]
                annotations = self._panoptic2json(panoptic_results,
                                                  outfile_prefix)
                result_files['panoptic'] = f'{outfile_prefix}.panoptic.json'
                mmcv.dump(
                    dict(annotations=annotations), result_files['panoptic'])
            segm_json = self._segm2json(instance_segm_results)
            result_files['bbox'] = f'{outfile_prefix}.bbox.json'
            result_files['proposal'] = f'{outfile_prefix}.bbox.json'
            result_files['segm'] = f'{outfile_prefix}.segm.json'
            mmcv.dump(segm_json[0], result_files['bbox'])
            mmcv.dump(segm_json[1], result_files['segm'])
            return result_files

        if isinstance(sample, np.ndarray):
            proposal_json = self._proposal2json(results)
            result_files['proposal'] = f'{outfile_prefix}.proposal.json'
            mmcv.dump(proposal_json, result_files['proposal'])
            return result_files

        raise TypeError('invalid type of results')

    def format_results(self, results, jsonfile_prefix=None, **kwargs):
        """Write the results to json files (standard COCO evaluation format).

        Args:
            results (list[tuple | numpy.ndarray]): Testing results of the
                dataset; must have one entry per dataset sample.
            jsonfile_prefix (str | None): The prefix of json files. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                If not specified, a temporary directory is created and
                "<tmp>/results" is used. Default: None.

        Returns:
            tuple: (result_files, tmp_dir), result_files is a dict containing \
                the json filepaths, tmp_dir is the temporary directory object \
                created when jsonfile_prefix is not specified, else None.
        """
        assert isinstance(results, list), 'results must be a list'
        num_results, num_samples = len(results), len(self)
        assert num_results == num_samples, (
            'The length of results is not equal to the dataset len: '
            f'{num_results} != {num_samples}')

        tmp_dir = None
        if jsonfile_prefix is None:
            # The caller is responsible for calling ``tmp_dir.cleanup()``.
            tmp_dir = tempfile.TemporaryDirectory()
            jsonfile_prefix = osp.join(tmp_dir.name, 'results')
        return self.results2json(results, jsonfile_prefix), tmp_dir

    def evaluate(self,
                 results,
                 metric='bbox',
                 logger=None,
                 jsonfile_prefix=None,
                 classwise=False,
                 proposal_nums=(100, 300, 1000),
                 iou_thrs=None,
                 metric_items=None):
        """Evaluation in COCO protocol.

        Args:
            results (list[list | tuple]): Testing results of the dataset.
            metric (str | list[str]): Metrics to be evaluated. Options are
                'bbox', 'segm', 'proposal', 'proposal_fast', 'panoptic'.
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.
            jsonfile_prefix (str | None): The prefix of json files. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                If not specified, a temp file will be created. Default: None.
            classwise (bool): Whether to evaluating the AP for each class.
            proposal_nums (Sequence[int]): Proposal number used for evaluating
                recalls, such as recall@100, recall@1000.
                Default: (100, 300, 1000).
            iou_thrs (Sequence[float], optional): IoU threshold used for
                evaluating recalls/mAPs. If set to a list, the average of all
                IoUs will also be computed. If not specified, [0.50, 0.55,
                0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used.
                Default: None.
            metric_items (list[str] | str, optional): Metric items that will
                be returned. If not specified, ``['AR@100', 'AR@300',
                'AR@1000', 'AR_s@1000', 'AR_m@1000', 'AR_l@1000' ]`` will be
                used when ``metric=='proposal'``, ``['mAP', 'mAP_50', 'mAP_75',
                'mAP_s', 'mAP_m', 'mAP_l']`` will be used when
                ``metric=='bbox' or metric=='segm'``.

        Returns:
            dict: COCO style evaluation metrics. Values are floats, except
                the ``panoptic_*`` entries and ``*_mAP_copypaste`` which are
                formatted strings.

        Raises:
            KeyError: If a metric or metric item is not supported, or the
                formatted results do not contain the requested metric.
        """

        metrics = metric if isinstance(metric, list) else [metric]
        allowed_metrics = [
            'bbox', 'segm', 'proposal', 'proposal_fast', 'panoptic'
        ]
        for metric in metrics:
            if metric not in allowed_metrics:
                raise KeyError(f'metric {metric} is not supported')
        if iou_thrs is None:
            iou_thrs = np.linspace(
                .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        if metric_items is not None:
            if not isinstance(metric_items, list):
                metric_items = [metric_items]

        result_files, tmp_dir = self.format_results(results, jsonfile_prefix)

        eval_results = OrderedDict()
        cocoGt = self.coco
        for metric in metrics:
            msg = f'Evaluating {metric}...'
            if logger is None:
                msg = '\n' + msg
            print_log(msg, logger=logger)

            if metric == 'proposal_fast':
                ar = self.fast_eval_recall(
                    results, proposal_nums, iou_thrs, logger='silent')
                log_msg = []
                for i, num in enumerate(proposal_nums):
                    eval_results[f'AR@{num}'] = ar[i]
                    log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
                log_msg = ''.join(log_msg)
                print_log(log_msg, logger=logger)
                continue

            if metric == 'panoptic':
                from panopticapi.evaluation import pq_compute
                if 'panoptic' not in result_files:
                    raise KeyError(f'{metric} is not in results')
                panoptic_json = result_files['panoptic']
                # ``results2json`` names the file '<prefix>.panoptic.json'
                # and the PNGs are written into '<prefix>'. Strip exactly
                # that suffix: splitting on the first '.' breaks for any
                # prefix containing a dot (e.g. './out' would yield '').
                json_suffix = '.panoptic.json'
                if panoptic_json.endswith(json_suffix):
                    pred_folder = panoptic_json[:-len(json_suffix)]
                else:
                    pred_folder = panoptic_json.split('.')[0]
                # pq_compute prints its own report; silence it and log the
                # table built below instead.
                with contextlib.redirect_stdout(io.StringIO()):
                    pq_res = pq_compute(
                        self.ann_file['panoptic_ann'],
                        panoptic_json,
                        gt_folder=self.seg_prefix,
                        pred_folder=pred_folder)
                # Use a dedicated name: rebinding ``results`` here would feed
                # the PQ dict to a later 'proposal_fast' metric.
                pq_summary = parse_pq_results(pq_res)
                for k, v in pq_summary.items():
                    eval_results[f'{metric}_{k}'] = f'{float(v):0.3f}'
                print_log(
                    'Panoptic Evaluation Results:\n' +
                    _print_panoptic_results(pq_res),
                    logger=logger)
                continue

            iou_type = 'bbox' if metric == 'proposal' else metric
            if metric not in result_files:
                raise KeyError(f'{metric} is not in results')
            try:
                predictions = mmcv.load(result_files[metric])
                if iou_type == 'segm':
                    # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331  # noqa
                    # When evaluating mask AP, if the results contain bbox,
                    # cocoapi will use the box area instead of the mask area
                    # for calculating the instance area. Though the overall AP
                    # is not affected, this leads to different small, medium,
                    # and large mask AP results.
                    for x in predictions:
                        x.pop('bbox')
                cocoDt = cocoGt.loadRes(predictions)
            except IndexError:
                print_log(
                    'The testing results of the whole dataset is empty.',
                    logger=logger,
                    level=logging.ERROR)
                break

            cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
            cocoEval.params.catIds = self.cat_ids
            cocoEval.params.imgIds = self.img_ids
            cocoEval.params.maxDets = list(proposal_nums)
            cocoEval.params.iouThrs = iou_thrs
            # mapping of cocoEval.stats
            coco_metric_names = {
                'mAP': 0,
                'mAP_50': 1,
                'mAP_75': 2,
                'mAP_s': 3,
                'mAP_m': 4,
                'mAP_l': 5,
                'AR@100': 6,
                'AR@300': 7,
                'AR@1000': 8,
                'AR_s@1000': 9,
                'AR_m@1000': 10,
                'AR_l@1000': 11
            }
            if metric_items is not None:
                for metric_item in metric_items:
                    if metric_item not in coco_metric_names:
                        raise KeyError(
                            f'metric item {metric_item} is not supported')

            if metric == 'proposal':
                cocoEval.params.useCats = 0
                cocoEval.evaluate()
                cocoEval.accumulate()
                cocoEval.summarize()
                # Resolve the defaults into a local so they do not leak into
                # the metrics evaluated after this one.
                selected_items = metric_items
                if selected_items is None:
                    selected_items = [
                        'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
                        'AR_m@1000', 'AR_l@1000'
                    ]

                for item in selected_items:
                    val = float(
                        f'{cocoEval.stats[coco_metric_names[item]]:.3f}')
                    eval_results[item] = val
            else:
                cocoEval.evaluate()
                cocoEval.accumulate()
                cocoEval.summarize()
                if classwise:  # Compute per-category AP
                    # Compute per-category AP
                    # from https://github.com/facebookresearch/detectron2/
                    precisions = cocoEval.eval['precision']
                    # precision: (iou, recall, cls, area range, max dets)
                    assert len(self.cat_ids) == precisions.shape[2]

                    results_per_category = []
                    for idx, catId in enumerate(self.cat_ids):
                        # area range index 0: all area ranges
                        # max dets index -1: typically 100 per image
                        nm = self.coco.loadCats(catId)[0]
                        precision = precisions[:, :, idx, 0, -1]
                        precision = precision[precision > -1]
                        if precision.size:
                            ap = np.mean(precision)
                        else:
                            ap = float('nan')
                        results_per_category.append(
                            (f'{nm["name"]}', f'{float(ap):0.3f}'))

                    # Lay the (category, AP) pairs out in up to three
                    # side-by-side column pairs.
                    num_columns = min(6, len(results_per_category) * 2)
                    results_flatten = list(
                        itertools.chain(*results_per_category))
                    headers = ['category', 'AP'] * (num_columns // 2)
                    results_2d = itertools.zip_longest(*[
                        results_flatten[i::num_columns]
                        for i in range(num_columns)
                    ])
                    table_data = [headers]
                    table_data += [result for result in results_2d]
                    table = AsciiTable(table_data)
                    print_log('\n' + table.table, logger=logger)

                # Same as above: keep the defaults local to this metric.
                selected_items = metric_items
                if selected_items is None:
                    selected_items = [
                        'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
                    ]

                for metric_item in selected_items:
                    key = f'{metric}_{metric_item}'
                    val = float(
                        f'{cocoEval.stats[coco_metric_names[metric_item]]:.3f}'
                    )
                    eval_results[key] = val
                ap = cocoEval.stats[:6]
                eval_results[f'{metric}_mAP_copypaste'] = (
                    f'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
                    f'{ap[4]:.3f} {ap[5]:.3f}')

        if tmp_dir is not None:
            tmp_dir.cleanup()
        return eval_results


def parse_pq_results(pq_res):
    """Flatten a panoptic quality result into percentage-valued entries.

    Args:
        pq_res (dict): Mapping with the groups ``'All'``, ``'Things'`` and
            ``'Stuff'``, each holding ``'pq'``, ``'sq'`` and ``'rq'``.

    Returns:
        dict: ``PQ``/``SQ``/``RQ`` for all classes, followed by the ``_th``
            (things) and ``_st`` (stuff) variants, each multiplied by 100.
    """
    group_suffixes = (('All', ''), ('Things', '_th'), ('Stuff', '_st'))
    parsed = dict()
    for group, suffix in group_suffixes:
        for stat in ('pq', 'sq', 'rq'):
            parsed[f'{stat.upper()}{suffix}'] = 100 * pq_res[group][stat]
    return parsed


def _print_panoptic_results(pq_res):
    """Render the panoptic quality result as an ASCII table string.

    Args:
        pq_res (dict): Mapping with the groups ``'All'``, ``'Things'`` and
            ``'Stuff'``, each holding ``'pq'``, ``'sq'``, ``'rq'`` and ``'n'``.

    Returns:
        str: Table with one row per group: PQ, SQ and RQ as percentages with
            three decimals, plus the ``'n'`` value in the categories column.
    """
    rows = [['', 'PQ', 'SQ', 'RQ', 'categories']]
    for group in ('All', 'Things', 'Stuff'):
        stats = pq_res[group]
        row = [group]
        row.extend(f'{(stats[key] * 100):0.3f}' for key in ('pq', 'sq', 'rq'))
        row.append(stats['n'])
        rows.append(row)
    return AsciiTable(rows).table


================================================
FILE: external/dataset/dvps_pipelines/__init__.py
================================================


================================================
FILE: external/dataset/dvps_pipelines/loading.py
================================================
import mmcv
import numpy as np
from mmdet.core import BitmapMasks
from mmdet.datasets.builder import PIPELINES


def bitmasks2bboxes(bitmasks):
    """Compute the bounding box of every mask.

    Args:
        bitmasks: Object whose ``masks`` attribute is an array indexed as
            (mask, row, column).

    Returns:
        np.ndarray: float32 array of shape (num_masks, 4) holding
            ``(x_min, y_min, x_max, y_max)`` built from the first and last
            non-zero column / row index of each mask. Empty masks keep an
            all-zero row.
    """
    mask_stack = bitmasks.masks
    boxes = np.zeros((mask_stack.shape[0], 4), dtype=np.float32)
    # Collapse rows to find occupied columns (x) and columns to find
    # occupied rows (y).
    column_hits = np.any(mask_stack, axis=1)
    row_hits = np.any(mask_stack, axis=2)
    for mask_idx, (columns, rows) in enumerate(zip(column_hits, row_hits)):
        xs = np.flatnonzero(columns)
        ys = np.flatnonzero(rows)
        if xs.size == 0 or ys.size == 0:
            continue
        boxes[mask_idx, :] = (xs[0], ys[0], xs[-1], ys[-1])
    return boxes


@PIPELINES.register_module()
class LoadImgDirect:
    """Load an image directly from the path stored in ``results['img']``.

    Args:
        to_float32 (bool): Whether to convert the loaded image to
            ``np.float32``. Default: False.
        color_type (str): The ``flag`` passed to ``mmcv.imread``.
            Default: 'color'.
    """

    def __init__(self,
                 to_float32=False,
                 color_type='color'):
        self.to_float32 = to_float32
        self.color_type = color_type

    def __call__(self, results):
        """Call functions to load image and get image meta information.

        Args:
            results (dict): Result dict requires "img" which is the img path.

        Returns:
            dict: The dict contains loaded image and meta information.
            'img' : img (the path is replaced by the loaded array)
            'img_shape' : img_shape
            'ori_shape' : original shape
            'img_fields' : the img fields
        """
        # The image is requested in RGB channel order.
        img = mmcv.imread(results['img'], channel_order='rgb', flag=self.color_type)
        if self.to_float32:
            img = img.astype(np.float32)

        results['img'] = img
        results['img_shape'] = img.shape
        results['ori_shape'] = img.shape
        results['img_fields'] = ['img']
        return results

    def __repr__(self):
        # Close the parenthesis opened after the class name; the previous
        # string ended with a dangling ", ".
        repr_str = (f'{self.__class__.__name__}('
                    f'to_float32={self.to_float32}, '
                    f"color_type='{self.color_type}')")
        return repr_str


@PIPELINES.register_module()
class LoadMultiImagesDirect(LoadImgDirect):
    """Sequence version of ``LoadImgDirect``.

    Applies the single-image loader of the parent class to every dict of a
    list of results.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Load the image of every frame.

        Args:
            results (list[dict]): One result dict per frame, each holding the
                image path under "img".

        Returns:
            list[dict]: The dicts returned by the parent loader, in input
                order.
        """
        # Bind the parent implementation once; zero-argument ``super()`` is
        # not usable inside a comprehension on older Python versions.
        load_single = super().__call__
        return [load_single(frame) for frame in results]


@PIPELINES.register_module()
class LoadAnnotationsDirect:
    """Load depth and panoptic annotations from the paths stored in results.

    Args:
        with_depth (bool): Whether to read the depth map from
            ``results['depth']``. Default: True.
        divisor (int): Encoding of the panoptic annotation.
            ``0``: class ids and instance ids live in two files
            (``results['ann_class']`` / ``results['ann_inst']``).
            ``-1``: one RGB image, R = class id, G * 256 + B = instance id.
            Otherwise: one single-channel id map; the class id is the id
            floor-divided by ``divisor``. Default: 1000.
        cherry_pick (bool): With ``results['is_instance_only']``, keep only
            the classes listed in ``cherry``. Default: False.
        cherry (list | None): Class ids to keep when ``cherry_pick`` is set;
            labels are remapped to their index in this list. Default: None.
        viper (bool): Enables the VIPER specific handling in ``__call__``.
            Default: False.
        vipseg (bool): Enables the VIPSeg specific handling in ``__call__``
            and forces the divisor to 1000. Default: False.
    """

    def __init__(self,
                 with_depth=True,
                 divisor: int = 1000,
                 cherry_pick=False,
                 cherry=None,
                 viper=False,
                 vipseg=False
                 ):
        self.with_depth = with_depth
        self.panseg_divisor = divisor
        self.cherry_pick = cherry_pick
        self.cherry = cherry
        self.viper = viper
        self.vipseg=vipseg
        if self.vipseg:
            self.panseg_divisor = 1000

    def __call__(self, results):
        """Load annotations and derive labels, masks, instance ids and boxes.

        Args:
            results (dict): Result dict holding the annotation paths (which
                keys are read depends on the constructor arguments) plus
                ``img_shape``, ``no_obj_class``, ``is_instance_only``,
                ``thing_lower`` and ``thing_upper``.

        Returns:
            dict | None: ``results`` extended with ``gt_depth`` (optional),
            ``gt_semantic_seg``, ``gt_labels``, ``gt_masks``,
            ``gt_instance_ids``, ``gt_bboxes`` and the matching ``*_fields``
            lists, or None when no segment is left after filtering.
            'depth_fields' : the depth fields for supporting depth aug
        """

        if self.with_depth:
            # Stored values are divided by 256 and clamped to at most 80.
            depth = mmcv.imread(results['depth'], flag='unchanged').astype(np.float32) / 256.
            del results['depth']
            depth[depth >= 80.] = 80.
            results['gt_depth'] = depth
            results['depth_fields'] = ['gt_depth']

        # Multiplier used to pack (class id, instance id) into one id when
        # the annotation does not already come as a single id map.
        local_divisor = 10000
        if self.panseg_divisor == 0:
            # The separate files to store class id and inst id
            gt_semantic_seg = mmcv.imread(results['ann_class'], flag='unchanged').astype(np.float32)
            inst_map = mmcv.imread(results['ann_inst'], flag='unchanged').astype(np.float32)
            ps_id = gt_semantic_seg * local_divisor + inst_map
            del results['ann_class']
            del results['ann_inst']
        elif self.panseg_divisor == -1:
            # KITTI step mode which means the panseg is stored with RGB
            id_map = mmcv.imread(results['ann'], flag='color', channel_order='rgb')
            gt_semantic_seg = id_map[..., 0].astype(np.float32)
            inst_map = id_map[..., 1].astype(np.float32) * 256 + id_map[..., 2].astype(np.float32)
            ps_id = gt_semantic_seg * local_divisor + inst_map
            del results['ann']
        else:
            ps_id = mmcv.imread(results['ann'], flag='unchanged').astype(np.float32)
            if self.vipseg:
                ps_id = results['pre_hook'](ps_id)
                del results['pre_hook']
            # This is for viper
            if self.viper or self.vipseg:
                ps_id[ps_id < 1000] *= 1000
            del results['ann']
            gt_semantic_seg = ps_id // self.panseg_divisor

        if self.viper:
            gt_semantic_seg[gt_semantic_seg >= results['thing_upper']] = results['no_obj_class']
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy; the builtin gives the identical dtype.
        results['gt_semantic_seg'] = gt_semantic_seg.astype(int)
        results['seg_fields'] = ['gt_semantic_seg']

        classes = []
        masks = []
        instance_ids = []
        no_obj_class = results['no_obj_class']
        # One binary mask per distinct panoptic id.
        for pan_seg_id in np.unique(ps_id):
            classes.append(pan_seg_id // self.panseg_divisor if self.panseg_divisor > 0
                           else pan_seg_id // local_divisor)
            masks.append((ps_id == pan_seg_id).astype(int))
            instance_ids.append(pan_seg_id)
        gt_labels = np.stack(classes).astype(int)
        gt_instance_ids = np.stack(instance_ids).astype(int)
        gt_masks = BitmapMasks(masks, height=results['img_shape'][0], width=results['img_shape'][1])
        # check the sanity of gt_masks: every pixel belongs to exactly one mask
        verify = np.sum(gt_masks.masks.astype(int), axis=0)
        assert (verify == np.ones(gt_masks.masks.shape[-2:], dtype=verify.dtype)).all()
        # now delete the no_obj_class
        is_no_obj = gt_labels == no_obj_class
        gt_masks.masks = np.delete(gt_masks.masks, is_no_obj, axis=0)
        gt_instance_ids = np.delete(gt_instance_ids, is_no_obj)
        gt_labels = np.delete(gt_labels, is_no_obj)
        if results['is_instance_only'] and not self.cherry_pick:
            # Keep labels in [thing_lower, thing_upper) and shift them so the
            # first kept class becomes label 0.
            not_thing = (gt_labels >= results['thing_upper']) | (gt_labels < results['thing_lower'])
            gt_masks.masks = np.delete(gt_masks.masks, not_thing, axis=0)
            gt_instance_ids = np.delete(gt_instance_ids, not_thing)
            gt_labels = np.delete(gt_labels, not_thing)
            gt_labels -= results['thing_lower']
        elif results['is_instance_only'] and self.cherry_pick:
            # Keep only the cherry-picked classes and relabel them by their
            # position in ``self.cherry``.
            not_picked = [label not in self.cherry for label in gt_labels]
            gt_masks.masks = np.delete(gt_masks.masks, not_picked, axis=0)
            gt_instance_ids = np.delete(gt_instance_ids, not_picked)
            gt_labels = np.delete(gt_labels, not_picked)
            gt_labels = np.array(list(map(lambda x: self.cherry.index(x), gt_labels))) if len(gt_labels) > 0 else []

        if len(gt_labels) == 0:
            # Nothing left to supervise.
            return None

        results['gt_labels'] = gt_labels
        results['gt_masks'] = gt_masks
        results['gt_instance_ids'] = gt_instance_ids
        results['mask_fields'] = ['gt_masks']

        # generate boxes
        boxes = bitmasks2bboxes(gt_masks)
        results['gt_bboxes'] = boxes
        results['bbox_fields'] = ['gt_bboxes']
        return results


@PIPELINES.register_module()
class LoadMultiAnnotationsDirect(LoadAnnotationsDirect):
    """Sequence version of ``LoadAnnotationsDirect``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Load the annotations of every frame.

        Args:
            results (list[dict]): One result dict per frame.

        Returns:
            list[dict] | None: The loaded dicts in input order, or None as
                soon as the parent loader returns None for a frame.
        """
        load_single = super().__call__
        loaded = []
        for frame in results:
            frame = load_single(frame)
            if frame is None:
                # One unusable frame invalidates the whole sequence.
                return None
            loaded.append(frame)
        return loaded


================================================
FILE: external/dataset/dvps_pipelines/transforms.py
================================================
import mmcv
import numpy as np
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import Resize, RandomFlip, Pad, Normalize


@PIPELINES.register_module()
class ResizeWithDepth(Resize):
    """This subclass of Resize is to support depth resize.

    All arguments are forwarded to ``Resize``. Only ``keep_ratio=True`` is
    accepted; this is checked on the value the parent class stores, so it
    also holds when ``keep_ratio`` is passed positionally or left unset.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Validate the resolved attribute instead of ``kwargs['keep_ratio']``,
        # which raised KeyError whenever the keyword was not spelled out.
        assert self.keep_ratio, \
            'ResizeWithDepth only supports keep_ratio=True'

    def _resize_depth(self, results):
        """Resize depth with ``results['scale']``"""
        # Although depth is not discrete, we use nearest to match the segmentation
        for key in results.get('depth_fields', []):
            if self.keep_ratio:
                results[key] = mmcv.imrescale(
                    results[key],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            else:
                results[key] = mmcv.imresize(
                    results[key],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            # Depth values are divided by the mean scale factor.
            # NOTE(review): presumably compensates for the change of apparent
            # object size after resizing; confirm against the depth loss.
            results[key] /= results['scale_factor'].mean()

    def __call__(self, results):
        """Run the parent ``Resize`` and then resize the depth fields.

        Args:
            results (dict): Result dict; ``depth_fields`` is optional.

        Returns:
            dict: The same ``results`` dict, updated in place.
        """
        super().__call__(results)
        self._resize_depth(results)
        return results


@PIPELINES.register_module()
class SeqResizeWithDepth(ResizeWithDepth):
    """Resize a sequence of frames (images, annotations and depth).

    Please refer to `mmdet.datasets.pipelines.transfroms.py:Resize` for
    the remaining arguments.
    Args:
        share_params (bool): If True, the scale chosen for the first frame
            is reused for all following frames. Defaults to True.
    """

    def __init__(self, share_params=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def __call__(self, results):
        """Resize every frame with the single-frame parent implementation.

        Args:
            results (list[dict]): One result dict per frame.
        Returns:
            list[dict]: The resized result dicts in input order.
        """
        resize_single = super().__call__
        shared_scale = None
        resized = []
        for frame_idx, frame in enumerate(results):
            is_first = frame_idx == 0
            if self.share_params and not is_first:
                # Force the scale that was picked for the first frame.
                frame['scale'] = shared_scale
            frame = resize_single(frame)
            if self.share_params and is_first:
                shared_scale = frame['scale']
            resized.append(frame)
        return resized


@PIPELINES.register_module()
class RandomFlipWithDepth(RandomFlip):
    """``RandomFlip`` that also flips every entry of ``depth_fields``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Flip ``results`` via the parent, then mirror the depth maps.

        Args:
            results (dict): Result dict from the loading pipeline.

        Returns:
            dict: The same dict, with depth maps flipped when
            ``results['flip']`` is truthy.
        """
        super().__call__(results)
        if not results['flip']:
            return results
        for depth_key in results.get('depth_fields', []):
            flipped = mmcv.imflip(
                results[depth_key], direction=results['flip_direction'])
            results[depth_key] = flipped
        return results


@PIPELINES.register_module()
class SeqFlipWithDepth(RandomFlipWithDepth):
    """Apply ``RandomFlipWithDepth`` to every frame of a sequence.

    All positional and keyword arguments other than ``share_params`` are
    forwarded to ``RandomFlipWithDepth``.

    Args:
        share_params (bool): If True, one flip decision is drawn and applied
            to all frames. Defaults to True.
    """

    def __init__(self, share_params=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def __call__(self, results):
        """Flip each frame dict in ``results``.

        Args:
            results (list[dict]): One result dict per frame.

        Returns:
            list[dict]: The processed frame dicts, in input order. With
            ``share_params`` the keys 'flip' and 'flip_direction' are written
            into every dict before the per-frame flip runs.
        """
        if self.share_params:
            # The trailing None entry stands for "do not flip".
            if isinstance(self.direction, list):
                candidates = self.direction + [None]
            else:
                candidates = [self.direction, None]

            if isinstance(self.flip_ratio, list):
                keep_prob = 1 - sum(self.flip_ratio)
                probs = self.flip_ratio + [keep_prob]
            else:
                keep_prob = 1 - self.flip_ratio
                # Split the flip probability evenly over the real directions.
                num_flip_dirs = len(candidates) - 1
                per_dir_prob = self.flip_ratio / num_flip_dirs
                probs = [per_dir_prob] * num_flip_dirs + [keep_prob]

            chosen = np.random.choice(candidates, p=probs)
            do_flip = chosen is not None

            for frame in results:
                frame['flip'] = do_flip
                frame['flip_direction'] = chosen

        flipped = []
        for frame in results:
            flipped.append(super().__call__(frame))
        return flipped


@PIPELINES.register_module()
class SeqRandomCropWithDepth(object):
    """Randomly crop a sequence of frames together with their annotations.

    For every frame the image, bboxes, labels, instance ids, masks, semantic
    maps and depth maps are cropped to the same window size.

    Args:
        crop_size (tuple | None): Absolute ``(height, width)`` of the crop in
            pixels. ``None`` disables cropping.
        allow_negative_crop (bool, optional): Whether to allow a crop that does
            not contain any bbox area. Default False.
        share_params (bool, optional): Whether to use the same crop offsets
            for all frames. Default False.
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. Defaults to True.
        check_id_match (bool, optional): When exactly two frames are
            returned, drop the sample if none of the instance ids of the
            first frame occurs in the second one. Defaults to True.
    Note:
        - If the image is smaller than the crop size along an axis, the full
          extent of that axis is kept.
        - The keys for bboxes, labels and masks must be aligned. That is,
          `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and
          `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and
          `gt_masks_ignore`.
        - If the crop does not contain any gt-bbox region and
          `allow_negative_crop` is set to False, the sample is skipped
          (``None`` is returned).
    """

    def __init__(self,
                 crop_size,
                 allow_negative_crop=False,
                 share_params=False,
                 bbox_clip_border=True,
                 check_id_match=True,
                 ):
        assert crop_size is None or (crop_size[0] > 0 and crop_size[1] > 0)
        self.crop_size = crop_size
        self.allow_negative_crop = allow_negative_crop
        self.share_params = share_params
        self.bbox_clip_border = bbox_clip_border
        self.check_id_match = check_id_match
        # The key correspondence from bboxes to labels and masks.
        self.bbox2label = {
            'gt_bboxes': ['gt_labels', 'gt_instance_ids'],
            'gt_bboxes_ignore': ['gt_labels_ignore', 'gt_instance_ids_ignore']
        }
        self.bbox2mask = {
            'gt_bboxes': 'gt_masks',
            'gt_bboxes_ignore': 'gt_masks_ignore'
        }

    def get_offsets(self, img):
        """Randomly draw the ``(offset_h, offset_w)`` of the crop window.

        Args:
            img (ndarray): Image whose first two axes are height and width.

        Returns:
            tuple[int, int]: Top and left offset of the crop window.
        """
        margin_h = max(img.shape[0] - self.crop_size[0], 0)
        margin_w = max(img.shape[1] - self.crop_size[1], 0)
        offset_h = np.random.randint(0, margin_h + 1)
        offset_w = np.random.randint(0, margin_w + 1)
        return offset_h, offset_w

    def random_crop(self, results, offsets=None):
        """Crop the image and all annotations of a single frame.

        Args:
            results (dict): Result dict of one frame.
            offsets (tuple, optional): Pre-defined ``(offset_h, offset_w)``.
                When None, offsets are drawn with :meth:`get_offsets`.
                Default to None.

        Returns:
            dict | None: The cropped results with 'img_shape' updated and
            'crop_offsets' added, or None when no 'gt_bboxes' box survives
            the crop and ``allow_negative_crop`` is False.
        """
        # Only supporting img
        assert results['img_fields'] == ['img']
        img = results['img']
        if offsets is not None:
            offset_h, offset_w = offsets
        else:
            offset_h, offset_w = self.get_offsets(img)
        results['crop_offsets'] = (offset_h, offset_w)
        crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0]
        crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1]

        # crop the image
        img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
        img_shape = img.shape
        results['img'] = img
        results['img_shape'] = img_shape

        # crop bboxes accordingly and clip to the image boundary
        for key in results.get('bbox_fields', []):
            # e.g. gt_bboxes and gt_bboxes_ignore
            bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
                                   dtype=np.float32)
            bboxes = results[key] - bbox_offset
            if self.bbox_clip_border:
                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
            valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & (
                bboxes[:, 3] > bboxes[:, 1])
            # If the crop does not contain any gt-bbox area and
            # self.allow_negative_crop is False, skip this image.
            if (key == 'gt_bboxes' and not valid_inds.any()
                    and not self.allow_negative_crop):
                return None
            results[key] = bboxes[valid_inds, :]
            # label fields. e.g. gt_labels and gt_labels_ignore
            # A bbox field without an entry in the mapping simply has no
            # companion label arrays; fall back to an empty tuple so the loop
            # is skipped instead of iterating over None.
            label_keys = self.bbox2label.get(key, ())
            for label_key in label_keys:
                if label_key in results:
                    results[label_key] = results[label_key][valid_inds]

            # mask fields, e.g. gt_masks and gt_masks_ignore
            mask_key = self.bbox2mask.get(key)
            if mask_key in results:
                results[mask_key] = results[mask_key][
                    valid_inds.nonzero()[0]].crop(
                    np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))

        # crop semantic seg
        for key in results.get('seg_fields', []):
            results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]

        # crop depth
        for key in results.get('depth_fields', []):
            results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]

        return results

    def __call__(self, results):
        """Crop every frame of the sequence.

        Args:
            results (list[dict]): One result dict per frame.

        Returns:
            list[dict] | None: The cropped frame dicts, or None when any
            frame is rejected by :meth:`random_crop` or when the two-frame
            instance-id check fails.
        """
        if self.crop_size is None:
            # Cropping disabled: pass the frames through unchanged.
            outs = list(results)
        else:
            # With shared parameters the offsets are drawn once from the
            # first frame; otherwise each frame draws its own.
            if self.share_params:
                offsets = self.get_offsets(results[0]['img'])
            else:
                offsets = None
            outs = []
            for _results in results:
                _results = self.random_crop(_results, offsets)
                if _results is None:
                    return None
                outs.append(_results)

        if len(outs) == 2 and self.check_id_match:
            ref_result, result = outs[1], outs[0]
            if self.check_match(ref_result, result):
                return None

        return outs

    def check_match(self, ref_results, results):
        """Tell whether no instance id of ``results`` occurs in ``ref_results``.

        Args:
            ref_results (dict): Frame dict providing the reference
                'gt_instance_ids'.
            results (dict): Frame dict whose 'gt_instance_ids' are looked up.

        Returns:
            bool: True when none of the ids is found in the reference frame
            (also True when ``results`` has no ids at all).
        """
        ref_ids = ref_results['gt_instance_ids'].tolist()
        gt_ids = results['gt_instance_ids'].tolist()
        gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids]
        nomatch = (np.array(gt_pids) == -1).all()
        return nomatch


@PIPELINES.register_module()
class PadWithDepth(Pad):
    """``Pad`` variant that also pads depth maps and uses a per-sample
    fill value for semantic maps."""

    def _pad_depth(self, results):
        """Zero-pad every entry of ``depth_fields`` to
        ``results['pad_shape']``."""
        for depth_key in results.get('depth_fields', []):
            target_hw = results['pad_shape'][:2]
            results[depth_key] = mmcv.impad(
                results[depth_key], shape=target_hw, pad_val=0)

    # Overrides the parent's semantic padding so that the fill value is the
    # sample's own ``no_obj_class`` instead of a fixed constant.
    def _pad_seg(self, results):
        """Pad every entry of ``seg_fields`` to ``results['pad_shape']``,
        filling with ``results['no_obj_class']``."""
        fill_value = results['no_obj_class']
        for seg_key in results.get('seg_fields', []):
            target_hw = results['pad_shape'][:2]
            results[seg_key] = mmcv.impad(
                results[seg_key], shape=target_hw, pad_val=fill_value)

    def __call__(self, results):
        """Pad images, masks, semantic maps and depth maps, in that order.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Updated result dict.
        """
        pad_steps = (
            self._pad_img,
            self._pad_masks,
            self._pad_seg,
            self._pad_depth,
        )
        for pad_step in pad_steps:
            pad_step(results)
        return results


@PIPELINES.register_module()
class SeqPadWithDepth(PadWithDepth):
    """Apply ``PadWithDepth`` to every frame of a sequence.

    Constructor arguments are forwarded unchanged to ``PadWithDepth``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Pad each frame dict in ``results``.

        Args:
            results (list[dict]): One result dict per frame.

        Returns:
            list[dict]: The padded frame dicts, in input order.
        """
        # Bind the parent implementation once; zero-argument super() cannot
        # be used inside the comprehension below.
        pad_frame = super().__call__
        return [pad_frame(frame) for frame in results]


# There is nothing new from SeqNormalize.
@PIPELINES.register_module()
class SeqNormalizeWithDepth(Normalize):
    """Apply ``Normalize`` to every frame of a sequence.

    Constructor arguments are forwarded unchanged to ``Normalize``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Normalize each frame dict in ``results``.

        Args:
            results (list[dict]): One result dict per frame.

        Returns:
            list[dict]: The normalized frame dicts, in input order.
        """
        # Bind the parent implementation once; zero-argument super() cannot
        # be used inside the comprehension below.
        normalize_frame = super().__call__
        return [normalize_frame(frame) for frame in results]


================================================
FILE: external/dataset/dvps_pipelines/tricks.py
================================================
import numpy as np
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import AutoAugment


@PIPELINES.register_module()
class SeqAutoAug(AutoAugment):
    """Auto augmentation for a sequence: one randomly chosen policy is
    applied to every frame.

    Args:
        policies: Forwarded to ``AutoAugment`` as ``policies``.
    """

    def __init__(self, policies):
        super().__init__(policies=policies)

    def __call__(self, results):
        """Pick one transform and run it on each frame dict.

        Args:
            results (list[dict]): One result dict per frame.

        Returns:
            list: The outputs of the chosen transform, in input order.
        """
        # A single draw, so that all frames go through the same policy.
        chosen = np.random.choice(self.transforms)
        return [chosen(frame) for frame in results]


================================================
FILE: external/dataset/forecasting_pipelines/__init__.py
================================================


================================================
FILE: external/dataset/forecasting_pipelines/loading.py
================================================
import mmcv
import numpy as np
from mmdet.core import BitmapMasks

from mmdet.datasets.builder import PIPELINES


def bitmasks2bboxes(bitmasks):
    """Compute one tight box per mask from a stack of bitmaps.

    Args:
        bitmasks: Object whose ``masks`` attribute is an array indexed as
            ``(mask, row, column)``.

    Returns:
        ndarray: float32 array of shape ``(num_masks, 4)`` holding
        ``(x_min, y_min, x_max, y_max)`` as the indices of the first and last
        non-zero column / row. Empty masks keep an all-zero row.
    """
    mask_stack = bitmasks.masks
    boxes = np.zeros((mask_stack.shape[0], 4), dtype=np.float32)
    # Collapsing the rows tells which columns are hit, and vice versa.
    col_hits = np.any(mask_stack, axis=1)
    row_hits = np.any(mask_stack, axis=2)
    for idx, (col_flags, row_flags) in enumerate(zip(col_hits, row_hits)):
        xs = np.flatnonzero(col_flags)
        ys = np.flatnonzero(row_flags)
        if xs.size == 0 or ys.size == 0:
            continue
        boxes[idx, :] = (xs[0], ys[0], xs[-1], ys[-1])
    return boxes


@PIPELINES.register_module()
class LoadMultiImagesFromFile:
    """Load several images and stack them along the last (channel) axis.

    Required keys are "img_info" (a dict whose "filename" entry is an iterable
    of file paths) and "img_fields". Added or updated keys are "img",
    "img_shape" and "ori_shape" (same as `img_shape`); "img" is appended to
    "img_fields".
    Args:
        to_float32 (bool): Whether to convert each loaded image to a float32
            numpy array. Defaults to False.
        color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
            Defaults to 'color'.
        file_client_args (dict): Arguments to instantiate a FileClient.
            See :class:`mmcv.fileio.FileClient` for details.
            Defaults to ``dict(backend='disk')``.
    """

    def __init__(self,
                 to_float32=False,
                 color_type='color',
                 file_client_args=dict(backend='disk')):
        self.to_float32 = to_float32
        self.color_type = color_type
        # Copy so that the shared default dict is never handed out.
        self.file_client_args = file_client_args.copy()
        # Created lazily on the first call.
        self.file_client = None

    def __call__(self, results):
        """Read all files listed in ``results['img_info']['filename']``.

        Args:
            results (dict): Result dict from the dataset.
        Returns:
            dict: The same dict with the stacked image and its shape added.
        """
        if self.file_client is None:
            self.file_client = mmcv.FileClient(**self.file_client_args)

        frames = []
        for path in results['img_info']['filename']:
            raw = self.file_client.get(path)
            frame = mmcv.imfrombytes(raw, flag=self.color_type)
            if self.to_float32:
                frame = frame.astype(np.float32)
            frames.append(frame)
        stacked = np.concatenate(frames, axis=-1)

        results['img'] = stacked
        results['img_shape'] = stacked.shape
        results['ori_shape'] = stacked.shape
        results['img_fields'].append('img')
        return results

    def __repr__(self):
        return (f'{self.__class__.__name__}('
                f'to_float32={self.to_float32}, '
                f"color_type='{self.color_type}', "
                f'file_client_args={self.file_client_args})')


@PIPELINES.register_module()
class LoadAnnotationsInstanceMasks:
    """Load instance masks and/or a semantic map from annotation files.

    Args:
        with_mask (bool): Whether to build "gt_masks", "gt_labels" and
            "gt_bboxes" from the instance map. Defaults to True.
        with_seg (bool): Whether to load "gt_semantic_seg". Defaults to True.
        with_inst (bool): Whether to store the instance map itself as
            "gt_instance_map". Defaults to False.
        file_client_args (dict): Arguments to instantiate a FileClient.
            Defaults to ``dict(backend='disk')``.
    """

    def __init__(self,
                 with_mask=True,
                 with_seg=True,
                 with_inst=False,
                 file_client_args=dict(backend='disk')):
        self.with_mask = with_mask
        self.with_seg = with_seg
        self.with_inst = with_inst
        # Copy so that the shared default dict is never handed out.
        self.file_client_args = file_client_args.copy()
        # Created lazily on the first call.
        self.file_client = None

    def _load_masks(self, results):
        """Private function to load instance annotations.

        Reads ``results['ann_info']['inst_map']``. Ids ``>= 10000`` are
        treated as individual instances whose label is ``id // 1000``
        (presumably a ``class_id * 1000 + index`` encoding -- confirm against
        the dataset).

        Args:
            results (dict): Result dict from the dataset.
        Returns:
            dict | None: The dict with the loaded annotations. None when
            ``with_mask`` is set and the map contains no instance.
        """

        img_bytes = self.file_client.get(results['ann_info']['inst_map'])
        inst_mask = mmcv.imfrombytes(img_bytes, flag='unchanged').squeeze()
        if self.with_inst:
            results['gt_instance_map'] = inst_mask.copy().astype(int)
            # Bring ids below 10000 to the same scale as the instance ids.
            results['gt_instance_map'][inst_mask < 10000] *= 1000
        if not self.with_mask:
            return results
        masks = []
        labels = []
        for inst_id in np.unique(inst_mask):
            if inst_id >= 10000:
                masks.append((inst_mask == inst_id).astype(int))
                labels.append(inst_id // 1000)
        if len(masks) == 0:
            # No instance in this sample: signal the caller to skip it.
            return None
        gt_masks = BitmapMasks(masks, height=inst_mask.shape[0], width=inst_mask.shape[1])
        results['gt_masks'] = gt_masks
        results['mask_fields'].append('gt_masks')
        results['gt_labels'] = np.array(labels)

        # Boxes are derived from the masks rather than read from disk.
        boxes = bitmasks2bboxes(gt_masks)
        results['gt_bboxes'] = boxes
        results['bbox_fields'].append('gt_bboxes')
        return results

    def _load_semantic_seg(self, results):
        """Private function to load semantic segmentation annotations.

        Reads ``results['ann_info']['seg_map']``.

        Args:
            results (dict): Result dict from the dataset.
        Returns:
            dict: The dict contains loaded semantic segmentation annotations.
        """
        img_bytes = self.file_client.get(results['ann_info']['seg_map'])
        results['gt_semantic_seg'] = mmcv.imfrombytes(
            img_bytes, flag='unchanged').squeeze()
        results['seg_fields'].append('gt_semantic_seg')
        return results

    def __call__(self, results):
        """Call function to load the configured annotation types.

        Args:
            results (dict): Result dict from the dataset.
        Returns:
            dict | None: The dict with the loaded annotations, or None when
            mask loading found no instance.
        """
        if self.file_client is None:
            self.file_client = mmcv.FileClient(**self.file_client_args)
        if self.with_mask or self.with_inst:
            results = self._load_masks(results)
            if results is None:
                return None
        if self.with_seg:
            results = self._load_semantic_seg(results)
        return results

    def __repr__(self):
        # Well-formed ``Name(key=value, ...)`` including every switch.
        return (f'{self.__class__.__name__}('
                f'with_mask={self.with_mask}, '
                f'with_seg={self.with_seg}, '
                f'with_inst={self.with_inst})')


================================================
FILE: external/dataset/forecasting_pipelines/transforms.py
================================================
import mmcv
import numpy as np
import warnings
from mmdet.datasets import PIPELINES


@PIPELINES.register_module()
class NormalizeMultiple:
    """Normalize an image, or several 3-channel images stacked on the last
    axis.

    Added key is "img_norm_cfg".

    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB,
            default is true.
    """

    def __init__(self, mean, std, to_rgb=True):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_rgb = to_rgb

    def __call__(self, results):
        """Normalize every entry of ``img_fields``.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Normalized results, 'img_norm_cfg' key is added into
                result dict.
        """
        for key in results.get('img_fields', ['img']):
            source = results[key]
            channels = source.shape[-1]
            if channels <= 3:
                results[key] = mmcv.imnormalize(source, self.mean, self.std, self.to_rgb)
                continue

            # More than three channels: treat the last axis as consecutive
            # 3-channel images and normalize each group on its own.
            assert channels % 3 == 0
            normalized = np.ones_like(source).astype(np.float32)
            for start in range(0, channels, 3):
                stop = start + 3
                normalized[..., start:stop] = mmcv.imnormalize(
                    source[..., start:stop], self.mean, self.std, self.to_rgb)
            results[key] = normalized

        results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
        return results

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})')


@PIPELINES.register_module()
class PadFutureMMDet:
    """Pad the image, masks and segmentation map.

    Padding either targets a fixed size, the smallest size divisible by a
    given number, or a square whose side is the longer image edge.
    Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor".
    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_to_square (bool): Whether to pad the image into a square.
            Default: False.
        pad_val (dict, optional): A dict for padding value, the default
            value is `dict(img=0, masks=0, seg=255)`.
    """

    def __init__(self,
                 size=None,
                 size_divisor=None,
                 pad_to_square=False,
                 pad_val=dict(img=0, masks=0, seg=255)):
        self.size = size
        self.size_divisor = size_divisor
        if isinstance(pad_val, (float, int)):
            # Legacy scalar form: expand it into the dict form.
            warnings.warn(
                'pad_val of float type is deprecated now, '
                f'please use pad_val=dict(img={pad_val}, '
                f'masks={pad_val}, seg=255) instead.', DeprecationWarning)
            pad_val = dict(img=pad_val, masks=pad_val, seg=255)
        assert isinstance(pad_val, dict)
        self.pad_val = pad_val
        self.pad_to_square = pad_to_square

        if not pad_to_square:
            assert size is not None or size_divisor is not None, \
                'only one of size and size_divisor should be valid'
            assert size is None or size_divisor is None
        else:
            assert size is None and size_divisor is None, \
                'The size and size_divisor must be None ' \
                'when pad2square is True'

    def _pad_img(self, results):
        """Pad images according to ``self.size``."""
        fill = self.pad_val.get('img', 0)
        for key in results.get('img_fields', ['img']):
            image = results[key]
            if self.pad_to_square:
                # The square side is stored on the instance, as for a fixed
                # size.
                longest = max(image.shape[:2])
                self.size = (longest, longest)
            if self.size is not None:
                padded_img = mmcv.impad(image, shape=self.size, pad_val=fill)
            elif self.size_divisor is not None:
                padded_img = mmcv.impad_to_multiple(
                    image, self.size_divisor, pad_val=fill)
            results[key] = padded_img
        results['pad_shape'] = padded_img.shape
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def _pad_masks(self, results):
        """Pad masks according to ``results['pad_shape']``."""
        target_hw = results['pad_shape'][:2]
        fill = self.pad_val.get('masks', 0)
        for key in results.get('mask_fields', []):
            results[key] = results[key].pad(target_hw, pad_val=fill)

    def _pad_seg(self, results):
        """Pad semantic segmentation map according to
        ``results['pad_shape']``."""
        fill = self.pad_val.get('seg', 255)
        for key in results.get('seg_fields', []):
            target_hw = results['pad_shape'][:2]
            results[key] = mmcv.impad(
                results[key], shape=target_hw, pad_val=fill)

    def __call__(self, results):
        """Call function to pad images, masks, semantic segmentation maps.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Updated result dict.
        """
        for pad_step in (self._pad_img, self._pad_masks, self._pad_seg):
            pad_step(results)
        return results

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'(size={self.size}, '
                f'size_divisor={self.size_divisor}, '
                f'pad_to_square={self.pad_to_square}, '
                f'pad_val={self.pad_val})')


@PIPELINES.register_module()
class KNetInsAdapter:
    """Shift ``gt_labels`` down by the number of stuff classes, so that
    labels which start after the stuff classes become 0-based.

    Args:
        stuff_nums (int): Offset subtracted from every label. Defaults to 11.
    """

    def __init__(self, stuff_nums=11):
        self.stuff_nums = stuff_nums

    def __call__(self, results):
        """Subtract ``stuff_nums`` from ``results['gt_labels']``.

        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Updated result dict.
        """
        labels = results['gt_labels']
        # Augmented assignment, so the stored object is updated in place
        # whenever its type supports that.
        labels -= self.stuff_nums
        results['gt_labels'] = labels
        return results


================================================
FILE: external/dataset/mIoU.py
================================================
import numpy as np


def eval_miou(results, targets, num_classes, ignore_index=255):
    """Compute the per-class IoU accumulated over a set of label maps.

    Args:
        results (iterable[ndarray]): Predicted class-id maps.
        targets (iterable[ndarray]): Ground-truth class-id maps, each with
            the same shape as the matching prediction.
        num_classes (int): Number of classes; ids are expected in
            ``[0, num_classes - 1]``. Values outside that range are not
            counted by the histograms.
        ignore_index (int): Ground-truth value whose pixels are excluded.
            Defaults to 255.

    Returns:
        ndarray: float64 array of shape ``(num_classes,)`` with the IoU of
        each class. Classes that occur in neither predictions nor targets
        yield ``nan``.
    """
    total_area_intersect = np.zeros((num_classes,), dtype=np.float64)
    total_area_union = np.zeros((num_classes,), dtype=np.float64)

    def _class_hist(values):
        # One bin per class id over the closed range [0, num_classes - 1].
        hist, _ = np.histogram(
            values.astype(float),
            bins=num_classes,
            range=(0, num_classes - 1))
        return hist

    for result, target in zip(results, targets):
        mask = (target != ignore_index)
        pred = result[mask]
        label = target[mask]

        area_intersect = _class_hist(pred[pred == label])
        area_pred = _class_hist(pred)
        area_label = _class_hist(label)

        total_area_intersect += area_intersect
        total_area_union += area_pred + area_label - area_intersect

    # 0 / 0 for absent classes is expected and reported as nan; do not emit
    # a RuntimeWarning for it.
    with np.errstate(divide='ignore', invalid='ignore'):
        iou_per_class = total_area_intersect / total_area_union
    return iou_per_class


if __name__ == '__main__':
    # Minimal smoke run: one 3x3 prediction against one 3x3 target whose
    # last row is entirely the ignore value (255).
    results = [np.arange(1, 10).reshape(3, 3)]
    targets = [
        np.array([
            [1, 2, 3],
            [1, 1, 2],
            [255, 255, 255],
        ])
    ]
    eval_miou(results, targets, 19)


================================================
FILE: external/dataset/pipelines/__init__.py
================================================


================================================
FILE: external/dataset/pipelines/formatting.py
================================================
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import to_tensor


@PIPELINES.register_module()
class ConcatVideoReferences(object):
    """Merge all reference frames of a sequence into a single dict.

    The first dict of the input list is kept as it is; the dicts from the
    second one onwards are folded into one dict.

    Args:
        results (list[dict]): List of dict that contain keys such as 'img',
            'img_metas', 'gt_masks','proposals', 'gt_bboxes',
            'gt_bboxes_ignore', 'gt_labels','gt_semantic_seg',
            'gt_instance_ids', 'gt_depth'.

    Returns:
        list[dict]: The first dict of outputs is the same as the first
        dict of `results`. The second dict of outputs concats the
        dicts in `results[1:]`; it is absent when `results` has only one
        element.
    """

    def __call__(self, results):
        """Fold ``results[1:]`` into one dict; see the class docstring."""
        assert (isinstance(results, list)), 'results must be list'
        outs = results[:1]
        # ``i`` starts at 1: the dict seen at i == 1 is modified in place and
        # becomes ``outs[1]`` at the end of its iteration; later dicts are
        # merged into it.
        for i, result in enumerate(results[1:], 1):
            if 'img' in result:
                img = result['img']
                if len(img.shape) < 3:
                    img = np.expand_dims(img, -1)
                # Images gain a trailing axis and are stacked along it.
                if i == 1:
                    result['img'] = np.expand_dims(img, -1)
                else:
                    outs[1]['img'] = np.concatenate(
                        (outs[1]['img'], np.expand_dims(img, -1)), axis=-1)
            # These entries are collected into plain Python lists.
            for key in ['img_metas', 'gt_masks']:
                if key in result:
                    if i == 1:
                        result[key] = [result[key]]
                    else:
                        outs[1][key].append(result[key])
            for key in [
                'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
                'gt_instance_ids',
            ]:
                if key not in result:
                    continue
                value = result[key]
                if value.ndim == 1:
                    value = value[:, None]
                N = value.shape[0]
                # Prepend a column holding the 0-based index of the reference
                # frame each row came from, then stack rows along axis 0.
                value = np.concatenate((np.full(
                    (N, 1), i - 1, dtype=np.float32), value),
                    axis=1)
                if i == 1:
                    result[key] = value
                else:
                    outs[1][key] = np.concatenate((outs[1][key], value),
                                                  axis=0)
            # Dense maps gain two trailing axes and are stacked on the last.
            if 'gt_semantic_seg' in result:
                if i == 1:
                    result['gt_semantic_seg'] = result['gt_semantic_seg'][...,
                                                                          None,
                                                                          None]
                else:
                    outs[1]['gt_semantic_seg'] = np.concatenate(
                        (outs[1]['gt_semantic_seg'],
                         result['gt_semantic_seg'][..., None, None]),
                        axis=-1)

            if 'gt_depth' in result:
                if i == 1:
                    result['gt_depth'] = result['gt_depth'][...,
                                                            None,
                                                            None]
                else:
                    outs[1]['gt_depth'] = np.concatenate(
                        (outs[1]['gt_depth'],
                         result['gt_depth'][..., None, None]),
                        axis=-1)
            if i == 1:
                outs.append(result)
        return outs


@PIPELINES.register_module()
class ConcatVideos(object):
    """Concatenate all frames of a clip into one dict.

    Every dict of the input list, including the first one, is merged into
    the first dict, which is modified in place:

    - ``img``: a 2-D image first gets a trailing channel axis; the frames
      are then stacked along a new last axis.
    - ``img_metas``, ``gt_masks``: gathered into a list with one entry per
      frame that provides the key.
    - ``proposals``, ``gt_bboxes``, ``gt_bboxes_ignore``, ``gt_labels``,
      ``gt_instance_ids``: 1-D arrays are turned into a column, the 0-based
      frame index is prepended as an extra first column (created as
      float32) and the rows of all frames are concatenated along axis 0.
    - ``gt_semantic_seg``: two trailing axes are appended and the frames
      are concatenated along the last axis.

    Any other key keeps the value found in the first dict. A key handled
    above must be present in the first dict whenever a later dict provides
    it, otherwise a ``KeyError`` is raised.

    Args:
        results (list[dict]): Per-frame dicts that may contain keys such as
            'img', 'img_metas', 'gt_masks', 'proposals', 'gt_bboxes',
            'gt_bboxes_ignore', 'gt_labels', 'gt_semantic_seg' and
            'gt_instance_ids'.

    Returns:
        list[dict]: A one-element list holding the merged dict.

    Raises:
        IndexError: If ``results`` is empty.
    """

    # Keys collected into a per-frame python list.
    _LIST_KEYS = ('img_metas', 'gt_masks')
    # Keys whose rows are tagged with the frame index and concatenated.
    _ROW_KEYS = ('proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
                 'gt_instance_ids')

    def __call__(self, results):
        assert isinstance(results, list), 'results must be list'
        if not results:
            raise IndexError('results must contain at least one frame')

        # The first frame doubles as the accumulator for the whole clip.
        merged = results[0]
        for frame_idx, result in enumerate(results):
            is_first = frame_idx == 0

            if 'img' in result:
                img = result['img']
                if len(img.shape) < 3:
                    img = np.expand_dims(img, -1)
                img = np.expand_dims(img, -1)
                if is_first:
                    merged['img'] = img
                else:
                    merged['img'] = np.concatenate((merged['img'], img),
                                                   axis=-1)

            for key in self._LIST_KEYS:
                if key not in result:
                    continue
                if is_first:
                    merged[key] = [result[key]]
                else:
                    merged[key].append(result[key])

            for key in self._ROW_KEYS:
                if key not in result:
                    continue
                value = result[key]
                if value.ndim == 1:
                    value = value[:, None]
                # Prepend the frame index so rows stay attributable to
                # their frame after concatenation.
                frame_col = np.full((value.shape[0], 1), frame_idx,
                                    dtype=np.float32)
                value = np.concatenate((frame_col, value), axis=1)
                if is_first:
                    merged[key] = value
                else:
                    merged[key] = np.concatenate((merged[key], value),
                                                 axis=0)

            if 'gt_semantic_seg' in result:
                seg = result['gt_semantic_seg'][..., None, None]
                if is_first:
                    merged['gt_semantic_seg'] = seg
                else:
                    merged['gt_semantic_seg'] = np.concatenate(
                        (merged['gt_semantic_seg'], seg), axis=-1)

        return [merged]


@PIPELINES.register_module()
class MultiImagesToTensor(object):
    """Convert the images of a key/reference pair to tensors.

    Each input dict is passed through :meth:`images_to_tensor`. The keys of
    the first dict are copied to the output unchanged. When exactly two
    dicts are given, the keys of the second dict are added to the output
    with ``f'{ref_prefix}_'`` prepended.

    Args:
        ref_prefix (str): The prefix of key added to the second dict of
            inputs. Defaults to 'ref'.
    """

    def __init__(self, ref_prefix='ref'):
        self.ref_prefix = ref_prefix

    def __call__(self, results):
        """Convert every dict and merge them into a single dict.

        Args:
            results (list[dict]): List of dicts; the second one is only
                used when the list holds exactly two dicts.

        Returns:
            dict: Keys of the first dict as they are, plus the prefixed
            keys of the second dict when there are exactly two dicts.
        """
        converted = [self.images_to_tensor(item) for item in results]

        data = dict(converted[0])
        if len(converted) == 2:
            for name, value in converted[1].items():
                data[f'{self.ref_prefix}_{name}'] = value
        return data

    def images_to_tensor(self, results):
        """Transpose and convert images/multi-images to Tensor.

        ``img`` is moved to channel-first layout, ``proposals`` is converted
        with ``to_tensor`` and ``img_metas`` is wrapped in a cpu-only
        ``DC``. The dict is updated in place and returned.
        """
        if 'img' in results:
            img = results['img']
            # (H, W, 3) -> (3, H, W), otherwise (H, W, 3, N) -> (N, 3, H, W)
            axes = (2, 0, 1) if len(img.shape) == 3 else (3, 2, 0, 1)
            results['img'] = to_tensor(
                np.ascontiguousarray(img.transpose(*axes)))
        if 'proposals' in results:
            results['proposals'] = to_tensor(results['proposals'])
        if 'img_metas' in results:
            results['img_metas'] = DC(results['img_metas'], cpu_only=True)
        return results


@PIPELINES.register_module()
class SeqDefaultFormatBundle(object):
    """Sequence Default formatting bundle.

    It simplifies the pipeline of formatting common fields, including "img",
    "img_metas", "proposals", "gt_bboxes", "gt_instance_ids",
    "gt_match_indices", "gt_bboxes_ignore", "gt_labels", "gt_masks",
    "gt_semantic_seg" and "gt_depth". These fields are formatted as follows.

    - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True)
    - img_metas: (1) to DataContainer (cpu_only=True)
    - proposals: (1) to tensor, (2) to DataContainer
    - gt_bboxes: (1) to tensor, (2) to DataContainer
    - gt_instance_ids: (1) to tensor, (2) to DataContainer
    - gt_match_indices: (1) to tensor, (2) to DataContainer
    - gt_bboxes_ignore: (1) to tensor, (2) to DataContainer
    - gt_labels: (1) to tensor, (2) to DataContainer
    - gt_masks: (1) to DataContainer (cpu_only=True)
    - gt_semantic_seg: (1) unsqueeze dim-0 (2) to tensor, \
                       (3) to DataContainer (stack=True)
    - gt_depth: (1) unsqueeze dim-0 (2) to tensor, \
                (3) to DataContainer (stack=True)

    Args:
        ref_prefix (str | None): The prefix of key added to the second dict
            of the input list. If None, only the first dict is returned and
            any further dict is dropped. Defaults to 'ref'.
    """

    def __init__(self, ref_prefix='ref'):
        self.ref_prefix = ref_prefix

    def __call__(self, results):
        """Sequence Default formatting bundle call function.

        Args:
            results (list[dict]): List of one or two dicts. Dicts after the
                second one are formatted but not returned.

        Returns:
            dict: The result dict contains the data that is formatted with
            default bundle. Each key in the second dict of the input list
            adds `self.ref_prefix` as prefix, unless the prefix is None or
            the list holds a single dict.
        """
        outs = [self.default_format_bundle(_results) for _results in results]

        # origin frames
        data = {}
        data.update(outs[0])
        # reference frames; any non-None prefix is honoured so that a custom
        # prefix no longer produces an empty output dict.
        if self.ref_prefix is not None and len(outs) > 1:
            for k, v in outs[1].items():
                data[f'{self.ref_prefix}_{k}'] = v

        return data

    @staticmethod
    def _to_channel_first_map(dense_map):
        """Add a leading axis to a 2-D map, or reorder a 4-D one.

        A 2-D (H, W) array becomes (1, H, W); any other array is transposed
        with axes (3, 2, 0, 1) and made contiguous.
        """
        if len(dense_map.shape) == 2:
            return dense_map[None, ...]
        return np.ascontiguousarray(dense_map.transpose(3, 2, 0, 1))

    def default_format_bundle(self, results):
        """Transform and format common fields in results.

        Args:
            results (dict): Result dict contains the data to convert. It is
                updated in place.

        Returns:
            dict: The result dict contains the data that is formatted with
            default bundle.
        """
        if 'img' in results:
            img = results['img']
            if len(img.shape) == 3:
                # (H, W, C) to (C, H, W)
                img = np.ascontiguousarray(img.transpose(2, 0, 1))
            else:
                # (H, W, C, N) to (N, C, H, W)
                img = np.ascontiguousarray(img.transpose(3, 2, 0, 1))
            results['img'] = DC(to_tensor(img), stack=True)
        for key in [
            'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
            'gt_instance_ids', 'gt_match_indices',
        ]:
            if key not in results:
                continue
            results[key] = DC(to_tensor(results[key]))
        for key in ['img_metas', 'gt_masks']:
            if key in results:
                results[key] = DC(results[key], cpu_only=True)
        for key in ['gt_semantic_seg', 'gt_depth']:
            if key in results:
                dense_map = self._to_channel_first_map(results[key])
                results[key] = DC(to_tensor(dense_map), stack=True)
        return results

    def __repr__(self):
        return self.__class__.__name__


@PIPELINES.register_module()
class VideoCollect(object):
    """Collect data from the loader relevant to the specific task.

    Args:
        keys (Sequence[str]): Keys of results to be collected in ``data``.
        meta_keys (str | tuple[str], optional): Extra meta keys appended to
            ``default_meta_keys`` and collected in ``data[img_metas]``.
            Defaults to None.
        reject_empty (bool): If True, return None when the first input dict
            has an empty ``gt_labels``. Defaults to False.
        num_ref_imgs (int): If larger than 0, return None unless the input
            holds exactly ``num_ref_imgs + 1`` dicts. Defaults to 0.
        default_meta_keys (tuple): Default meta keys. Defaults to ('filename',
            'ori_filename', 'ori_shape', 'img_shape', 'pad_shape',
            'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg',
            'video_id', 'frame_id', 'is_video_data', 'no_obj_class').
    """

    def __init__(self,
                 keys,
                 meta_keys=None,
                 reject_empty=False,
                 num_ref_imgs=0,
                 # no_obj_class is added for handling non-0  no-obj class
                 default_meta_keys=('filename', 'ori_filename', 'ori_shape',
                                    'img_shape', 'pad_shape', 'scale_factor',
                                    'flip', 'flip_direction', 'img_norm_cfg',
                                    'video_id',
                                    'frame_id', 'is_video_data', 'no_obj_class')):
        self.keys = keys
        self.meta_keys = default_meta_keys
        if meta_keys is not None:
            if isinstance(meta_keys, str):
                meta_keys = (meta_keys,)
            else:
                assert isinstance(meta_keys, tuple), \
                    'meta_keys must be str or tuple'
            self.meta_keys += meta_keys

        self.reject_empty = reject_empty
        self.num_ref_imgs = num_ref_imgs

    def __call__(self, results):
        """Call function to collect keys in results.

        For a single dict input, the collected ``img_metas`` is wrapped in
        a cpu-only :obj:`mmcv.DataContainer`; for a list input the
        ``img_metas`` of each dict stays a plain dict.

        Args:
            results (list[dict] | dict): List of dict or dict which contains
                the data to collect.

        Returns:
            list[dict] | dict | None: List of dict or dict that contains the
            following keys:

            - keys in ``self.keys``
            - ``img_metas``

            None is returned when the sample is rejected by ``reject_empty``
            or ``num_ref_imgs``.
        """
        results_is_dict = isinstance(results, dict)
        if results_is_dict:
            results = [results]
        outs = []
        for _results in results:
            _results = self._add_default_meta_keys(_results)
            _results = self._collect_meta_keys(_results)
            outs.append(_results)

        if results_is_dict:
            outs[0]['img_metas'] = DC(outs[0]['img_metas'], cpu_only=True)

        # Reject samples whose key frame carries no annotation.
        if self.reject_empty and len(results[0]['gt_labels']) == 0:
            return None
        # Reject clips that do not have the expected number of frames.
        if self.num_ref_imgs > 0 and len(results) != self.num_ref_imgs + 1:
            return None
        return outs[0] if results_is_dict else outs

    def _collect_meta_keys(self, results):
        """Collect `self.keys` and `self.meta_keys` from `results` (dict).

        A meta key is looked up in ``results`` first and then in
        ``results['img_info']``; meta keys found in neither are skipped.
        A missing ``img_info`` is treated as empty.
        """
        img_info = results.get('img_info', {})
        img_meta = {}
        for key in self.meta_keys:
            if key in results:
                img_meta[key] = results[key]
            elif key in img_info:
                img_meta[key] = img_info[key]
        data = {'img_metas': img_meta}
        for key in self.keys:
            data[key] = results[key]
        return data

    def _add_default_meta_keys(self, results):
        """Add default meta keys.

        We set default meta keys including `pad_shape`, `scale_factor` and
        `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and
        `Pad` are implemented during the whole pipeline.

        Args:
            results (dict): Result dict contains the data to convert.

        Returns:
            results (dict): Updated result dict contains the data to convert.
        """
        img = results['img']
        results.setdefault('pad_shape', img.shape)
        results.setdefault('scale_factor', 1.0)
        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
        results.setdefault(
            'img_norm_cfg',
            dict(
                mean=np.zeros(num_channels, dtype=np.float32),
                std=np.ones(num_channels, dtype=np.float32),
                to_rgb=False))
        return results


@PIPELINES.register_module()
class ToList(object):
    """Wrap each value of the input dict in a single-element list.

    Args:
        results (dict): Result dict contains the data to convert.

    Returns:
        dict: A new dict with the same keys, each value wrapped in a list.
    """

    def __call__(self, results):
        return {key: [value] for key, value in results.items()}


@PIPELINES.register_module()
class ReIDFormatBundle(object):
    """ReID formatting bundle.

    The ``img`` and ``gt_label`` fields of the input are gathered (stacked
    when a list of dicts is given) and then formatted as follows.

    - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True)
    - gt_label: (1) to tensor, (2) to DataContainer (stack=True)
    """

    def __init__(self, *args, **kwargs):
        super().__init__()

    def __call__(self, results):
        """ReID formatting bundle call function.

        Args:
            results (list[dict] or dict): List of dicts or dict. A list
                must hold more than one dict.

        Returns:
            dict: The result dict contains the data that is formatted with
            ReID bundle.

        Raises:
            TypeError: If ``results`` is neither a list nor a dict.
        """
        if not isinstance(results, (list, dict)):
            raise TypeError('results must be a list or a dict.')

        if isinstance(results, list):
            assert len(results) > 1, (
                "the 'results' only have one item, "
                "please directly use normal pipeline not 'Seq' pipeline.")
            # Images are stacked on a new trailing axis, labels on axis 0.
            imgs = np.stack([item['img'] for item in results], axis=3)
            labels = np.stack([item['gt_label'] for item in results], axis=0)
        else:
            imgs = results['img']
            labels = results['gt_label']

        return self.reid_format_bundle(dict(img=imgs, gt_label=labels))

    def reid_format_bundle(self, results):
        """Transform and format gt_label fields in results.

        Args:
            results (dict): Result dict contains the data to convert. Only
                the keys 'img' and 'gt_label' are accepted.

        Returns:
            dict: The result dict contains the data that is formatted with
            ReID bundle.

        Raises:
            KeyError: If a key other than 'img' or 'gt_label' is found.
        """
        for key in list(results):
            if key == 'img':
                img = results[key]
                # 3-D: channels first; otherwise (H, W, C, N) -> (N, C, H, W)
                axes = (2, 0, 1) if img.ndim == 3 else (3, 2, 0, 1)
                img = np.ascontiguousarray(img.transpose(*axes))
                results['img'] = DC(to_tensor(img), stack=True)
                continue
            if key != 'gt_label':
                raise KeyError(f'key {key} is not supported')
            results[key] = DC(
                to_tensor(results[key]), stack=True, pad_dims=None)
        return results


@PIPELINES.register_module()
class ImageToTensorWithRef(object):
    """Convert images, including reference images, to channel-first tensors.

    Every key in ``keys`` is transposed with axes (2, 0, 1) and passed to
    ``to_tensor``. The key 'ref_img' may also hold a list of images, which
    are transposed one by one and stacked into a single array first.

    Args:
        keys (Sequence[str]): Keys of the images to convert.
    """

    def __init__(self, keys):
        self.keys = keys

    def __call__(self, results):
        for key in self.keys:
            value = results[key]
            if key != 'ref_img':
                results[key] = to_tensor(value.transpose(2, 0, 1))
            elif isinstance(value, list):
                # Several reference frames: stack them on a new first axis.
                frames = [
                    np.ascontiguousarray(frame.transpose(2, 0, 1))
                    for frame in value
                ]
                results[key] = to_tensor(np.array(frames))
            else:
                results[key] = to_tensor(
                    np.ascontiguousarray(value.transpose(2, 0, 1)))
        return results

    def __repr__(self):
        return f'{self.__class__.__name__}(keys={self.keys})'

@PIPELINES.register_module()
class LabelConsistentChecker:
    """Reject clips whose instance annotations differ between frames.

    ``results['ref_gt_instance_ids'].data`` is read as a table whose rows
    hold a frame index in column 0 and an instance id in column 1. The
    sample is kept only if the number of rows is a multiple of
    ``num_frames``, the rows are grouped frame by frame in order, and every
    frame lists the same instance ids in the same order as the first frame.

    Args:
        num_frames (int): Number of frames in a clip. Defaults to 5.
    """

    def __init__(self, num_frames=5):
        self.num_frames = num_frames

    def __call__(self, results):
        """Return ``results`` if it is consistent, otherwise None."""
        id_table = results['ref_gt_instance_ids'].data
        total_rows = id_table.size(0)
        num_ins, leftover = divmod(total_rows, self.num_frames)
        if leftover != 0:
            return None

        # Instance ids of the first frame, used as reference for the rest.
        first_frame_ids = torch.zeros((num_ins,), dtype=torch.float)
        for row_idx in range(total_rows):
            frame_idx, ins_idx = divmod(row_idx, num_ins)
            row = id_table[row_idx]
            if row[0] != frame_idx:
                return None
            if frame_idx == 0:
                first_frame_ids[ins_idx] = row[1]
            elif row[1] != first_frame_ids[ins_idx]:
                return None
        return results


================================================
FILE: external/dataset/pipelines/loading.py
================================================
import os.path as osp
import numpy as np

import mmcv
from mmdet.core import BitmapMasks

from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile


@PIPELINES.register_module()
class LoadMultiImagesFromFile(LoadImageFromFile):
    """Load multi images from file.

    Applies the parent ``LoadImageFromFile`` call to every dict of a list.
    Please refer to `mmdet.datasets.pipelines.loading.py:LoadImageFromFile`
    for detailed docstring.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Load the image of each dict in ``results``.

        Args:
            results (list[dict]): List of dict from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains loaded image.
        """
        # Bind the parent loader once; zero-argument super() is resolved
        # here rather than inside the comprehension scope.
        load_single = super().__call__
        return [load_single(frame) for frame in results]


@PIPELINES.register_module()
class SeqLoadAnnotations(LoadAnnotations):
    """Sequence load annotations.

    Applies the parent ``LoadAnnotations`` call to every dict of a list.
    Please refer to `mmdet.datasets.pipelines.loading.py:LoadAnnotations`
    for detailed docstring.

    Args:
        with_track (bool): If True, load instance ids of bboxes.
    """

    def __init__(self, with_track=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.with_track = with_track

    def _load_track(self, results):
        """Private function to load instance id annotations.

        Args:
            results (dict): Result dict from :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            dict: The dict with ``gt_instance_ids`` set to a copy of
            ``results['ann_info']['instance_ids']``.
        """
        instance_ids = results['ann_info']['instance_ids']
        results['gt_instance_ids'] = instance_ids.copy()
        return results

    def __call__(self, results):
        """Load the annotations of each dict in ``results``.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains loaded annotations, such as
            bounding boxes, labels, instance ids, masks and semantic
            segmentation annotations.
        """
        load_single = super().__call__
        loaded = []
        for frame in results:
            frame = load_single(frame)
            if self.with_track:
                frame = self._load_track(frame)
            loaded.append(frame)
        return loaded


@PIPELINES.register_module()
class LoadRefImageFromFile(object):
    """Load an image together with its reference frame.

    The reference image path is taken from
    ``results['img_info']['ref_filename']`` joined with
    ``results['ref_prefix']``.
    Specific to Cityscapes-VPS, Cityscapes, and VIPER datasets.

    Args:
        sample (bool): Stored on the instance; not used by this class.
            Defaults to True.
        to_float32 (bool): Whether to cast both images to float32.
            Defaults to False.
    """

    def __init__(self, sample=True, to_float32=False):
        self.to_float32 = to_float32
        self.sample = sample

    def __call__(self, results):
        """Read the image and its reference image into ``results``.

        Raises:
            NotImplementedError: If ``img_info`` has no 'ref_filename'.
        """
        # requires dirname for ref images
        assert results['ref_prefix'] is not None, 'ref_prefix must be specified.'

        img_info = results['img_info']
        filename = osp.join(results['img_prefix'], img_info['filename'])
        img = mmcv.imread(filename)

        # The reference frame must be specified by the annotation file.
        if 'ref_filename' not in img_info:
            raise NotImplementedError('We need this implementation.')
        ref_path = osp.join(results['ref_prefix'], img_info['ref_filename'])
        ref_img = mmcv.imread(ref_path)

        if self.to_float32:
            img = img.astype(np.float32)
            ref_img = ref_img.astype(np.float32)

        results['filename'] = filename
        results['ori_filename'] = img_info['filename']
        results['img'] = img
        results['img_shape'] = img.shape
        results['ori_shape'] = img.shape
        results['ref_img'] = ref_img
        results['iid'] = img_info['id']
        return results

    def __repr__(self):
        return f'{self.__class__.__name__}(to_float32={self.to_float32})'


def bitmasks2bboxes(bitmasks):
    """Compute a tight box for every mask in ``bitmasks``.

    Args:
        bitmasks: Object whose ``masks`` attribute is an (N, H, W) array.

    Returns:
        np.ndarray: float32 array of shape (N, 4). Each row holds the
        first/last non-zero column and row indices as
        ``(x_first, y_first, x_last, y_last)``; rows of empty masks stay
        all zeros.
    """
    masks = bitmasks.masks
    bboxes = np.zeros((masks.shape[0], 4), dtype=np.float32)
    # Per mask: which columns / rows contain at least one set pixel.
    cols_hit = np.any(masks, axis=1)
    rows_hit = np.any(masks, axis=2)
    for box, col_flags, row_flags in zip(bboxes, cols_hit, rows_hit):
        xs = np.flatnonzero(col_flags)
        ys = np.flatnonzero(row_flags)
        if xs.size > 0 and ys.size > 0:
            box[:] = (xs[0], ys[0], xs[-1], ys[-1])
    return bboxes


@PIPELINES.register_module()
class LoadAnnotationsInstanceMasks:
    """Load instance masks and semantic segmentation from id maps.

    The instance map is read from ``results['ann_info']['inst_map']`` and
    the semantic map from ``results['ann_info']['seg_map']``. Ids of at
    least 10000 in the instance map are treated as instances whose label is
    ``id // 1000``.
    NOTE(review): this presumably follows the Cityscapes
    ``class_id * 1000 + instance_id`` encoding - confirm against the
    dataset.

    Args:
        with_mask (bool): Whether to build ``gt_masks``, ``gt_labels`` and
            ``gt_bboxes`` from the instance map. Defaults to True.
        with_seg (bool): Whether to load ``gt_semantic_seg``.
            Defaults to True.
        with_inst (bool): Whether to store the instance map itself as
            ``gt_instance_map``. Defaults to False.
        cherry (Container[int], optional): If given, only instances whose
            label is contained in it are kept. Defaults to None.
        file_client_args (dict): Arguments to instantiate a
            ``mmcv.FileClient``. Defaults to ``dict(backend='disk')``.
    """

    def __init__(self,
                 with_mask=True,
                 with_seg=True,
                 with_inst=False,
                 cherry=None,
                 file_client_args=dict(backend='disk')):
        self.with_mask = with_mask
        self.with_seg = with_seg
        self.with_inst = with_inst
        # Copy so that the shared default dict is never handed out.
        self.file_client_args = file_client_args.copy()
        self.cherry = cherry
        # Created lazily on the first call.
        self.file_client = None

    def _load_masks(self, results):
        """Private function to load mask annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet.CustomDataset`.

        Returns:
            dict | None: The dict contains loaded mask annotations
            (``gt_masks`` as :obj:`BitmapMasks`, ``gt_labels`` and
            ``gt_bboxes``). None is returned when ``with_mask`` is set and
            no instance is kept.
        """

        img_bytes = self.file_client.get(results['ann_info']['inst_map'])
        inst_mask = mmcv.imfrombytes(img_bytes, flag='unchanged').squeeze()
        if self.with_inst:
            results['gt_instance_map'] = inst_mask.copy().astype(int)
            # Ids below 10000 are scaled by 1000 so that they share the
            # value range of the instance ids.
            results['gt_instance_map'][inst_mask < 10000] *= 1000
        if not self.with_mask:
            return results
        masks = []
        labels = []
        for inst_id in np.unique(inst_mask):
            if inst_id >= 10000:
                if self.cherry is not None and not (inst_id // 1000 in self.cherry):
                    continue
                masks.append((inst_mask == inst_id).astype(int))
                labels.append(inst_id // 1000)
        if len(masks) == 0:
            return None
        gt_masks = BitmapMasks(masks, height=inst_mask.shape[0], width=inst_mask.shape[1])
        results['gt_masks'] = gt_masks
        results['mask_fields'].append('gt_masks')
        results['gt_labels'] = np.array(labels)

        boxes = bitmasks2bboxes(gt_masks)
        results['gt_bboxes'] = boxes
        results['bbox_fields'].append('gt_bboxes')
        return results

    def _load_semantic_seg(self, results):
        """Private function to load semantic segmentation annotations.

        Args:
            results (dict): Result dict from :obj:`dataset`.

        Returns:
            dict: The dict contains loaded semantic segmentation annotations.
        """
        img_bytes = self.file_client.get(results['ann_info']['seg_map'])
        results['gt_semantic_seg'] = mmcv.imfrombytes(
            img_bytes, flag='unchanged').squeeze()
        results['seg_fields'].append('gt_semantic_seg')
        return results

    def __call__(self, results):
        """Call function to load multiple types annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet.CustomDataset`.

        Returns:
            dict | None: The dict contains loaded bounding box, label, mask
            and semantic segmentation annotations, or None when mask
            loading rejected the sample.
        """
        if self.file_client is None:
            self.file_client = mmcv.FileClient(**self.file_client_args)
        if self.with_mask or self.with_inst:
            results = self._load_masks(results)
            if results is None:
                return None
        if self.with_seg:
            results = self._load_semantic_seg(results)
        return results

    def __repr__(self):
        return (f'{self.__class__.__name__}('
                f'with_mask={self.with_mask}, '
                f'with_seg={self.with_seg}, '
                f'with_inst={self.with_inst}, '
                f'cherry={self.cherry})')


================================================
FILE: external/dataset/pipelines/test_time_aug.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings

import mmcv

from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import Compose


@PIPELINES.register_module()
class MultiScaleFlipAugVideo:
    """Test-time augmentation with multiple scales and flipping for clips.

    Every combination of scale and flip setting is written into a copy of
    each frame dict, and the list of copies is passed to ``transforms``.
    An example configuration is as followed:
    .. code-block::
        img_scale=[(1333, 400), (1333, 800)],
        flip=True,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ]
    After MultiScaleFLipAug with above configuration, the results are wrapped
    into lists of the same length as followed:
    .. code-block::
        dict(
            img=[...],
            img_shape=[...],
            scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)]
            flip=[False, True, False, True]
            ...
        )
    Args:
        transforms (list[dict]): Transforms to apply in each augmentation.
        img_scale (tuple | list[tuple] | None): Images scales for resizing.
        scale_factor (float | list[float] | None): Scale factors for resizing.
            Exactly one of ``img_scale`` and ``scale_factor`` must be set.
        flip (bool): Whether apply flip augmentation. Default: False.
        flip_direction (str | list[str]): Flip augmentation directions,
            options are "horizontal", "vertical" and "diagonal". If
            flip_direction is a list, multiple flip augmentations will be
            applied. It has no effect when flip == False. Default:
            "horizontal".
    """

    def __init__(self,
                 transforms,
                 img_scale=None,
                 scale_factor=None,
                 flip=False,
                 flip_direction='horizontal'):
        self.transforms = Compose(transforms)
        assert (img_scale is None) ^ (scale_factor is None), (
            'Must have but only one variable can be set')
        if img_scale is None:
            if not isinstance(scale_factor, list):
                scale_factor = [scale_factor]
            self.img_scale = scale_factor
            self.scale_key = 'scale_factor'
        else:
            if not isinstance(img_scale, list):
                img_scale = [img_scale]
            self.img_scale = img_scale
            self.scale_key = 'scale'
            assert mmcv.is_list_of(self.img_scale, tuple)

        self.flip = flip
        if not isinstance(flip_direction, list):
            flip_direction = [flip_direction]
        self.flip_direction = flip_direction
        assert mmcv.is_list_of(self.flip_direction, str)

        if not self.flip and self.flip_direction != ['horizontal']:
            warnings.warn(
                'flip_direction has no effect when flip is set to False')
        if self.flip:
            transform_types = [t['type'] for t in transforms]
            if 'RandomFlip' not in transform_types:
                warnings.warn(
                    'flip has no effect when RandomFlip is not in transforms')

    def __call__(self, results):
        """Call function to apply test time augment transforms on results.

        Args:
            results (list[dict]): Frame dicts of a clip. Each dict is
                shallow-copied before the scale and flip settings are
                written, so the inputs are not modified by this method.

        Returns:
           dict[str: list]: The augmented data, where each value is wrapped
               into a list with one entry per (scale, flip) combination.
        """
        flip_settings = [(False, None)]
        if self.flip:
            flip_settings.extend(
                (True, direction) for direction in self.flip_direction)

        aug_data = []
        for scale in self.img_scale:
            for flip, direction in flip_settings:
                frames = []
                for frame in results:
                    frame_aug = frame.copy()
                    frame_aug.update({
                        self.scale_key: scale,
                        'flip': flip,
                        'flip_direction': direction,
                    })
                    frames.append(frame_aug)
                aug_data.append(self.transforms(frames))

        # list of dict to dict of list
        aug_data_dict = {key: [] for key in aug_data[0]}
        for data in aug_data:
            for key, val in data.items():
                aug_data_dict[key].append(val)
        return aug_data_dict

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'(transforms={self.transforms}, '
                f'img_scale={self.img_scale}, flip={self.flip}, '
                f'flip_direction={self.flip_direction})')

================================================
FILE: external/dataset/pipelines/transforms.py
================================================
import cv2
import mmcv
import numpy as np
import warnings
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import Normalize, Pad, RandomFlip, Resize


@PIPELINES.register_module()
class SeqColorAug(object):
    """Color augmentation for a sequence of images.

    A random per-channel offset is subtracted from each selected image.

    Args:
        prob (list[float]): The probability to perform color augmentation;
            entry ``i`` applies to the ``i``-th dict given to ``__call__``.
            Defaults to [1.0, 1.0].
        rgb_var (list[list]): 3x3 matrix multiplied with a random normal
            vector to obtain the color offset. Defaults to
            [[-0.55919361, 0.98062831, -0.41940627],
            [1.72091413, 0.19879334, -1.82968581],
            [4.64467907, 4.73710203, 4.88324118]].
    """

    def __init__(self,
                 prob=[1.0, 1.0],
                 rgb_var=[[-0.55919361, 0.98062831, -0.41940627],
                          [1.72091413, 0.19879334, -1.82968581],
                          [4.64467907, 4.73710203, 4.88324118]]):
        self.rgb_var = np.array(rgb_var, dtype=np.float32)
        self.prob = prob

    def _shift_colors(self, image):
        """Subtract a freshly sampled channel offset and cast to float32."""
        noise = np.random.randn(3, 1)
        # The offset's channel order is reversed before use (the matrix is
        # named ``rgb_var``; the reversal was labelled "bgr to rgb").
        shift = np.dot(self.rgb_var, noise)[::-1].reshape(3)
        return (image - shift).astype(np.float32)

    def __call__(self, results):
        """Apply the color augmentation to the image of every dict.

        Args:
            results (list[dict]): Dicts holding an ``'img'`` entry.

        Returns:
            list[dict]: The same dicts, in order, with ``'img'`` replaced by
            the augmented image whenever the augmentation was drawn.
        """
        outs = []
        for idx, frame in enumerate(results):
            picture = frame['img']
            selected = self.prob[idx] > np.random.random()
            frame['img'] = self._shift_colors(picture) if selected else picture
            outs.append(frame)
        return outs


@PIPELINES.register_module()
class SeqBlurAug(object):
    """Blur augmentation for a sequence of images.

    Selected images are filtered with a randomly sized, cross-shaped kernel.

    Args:
        prob (list[float]): The probability to perform blur augmentation;
            entry ``i`` applies to the ``i``-th dict given to ``__call__``.
            Defaults to [0.0, 0.2].
    """

    def __init__(self, prob=[0.0, 0.2]):
        self.prob = prob

    @staticmethod
    def _random_cross_kernel():
        """Build a cross-shaped kernel with an odd side length in [5, 45]."""
        size = np.random.choice(np.arange(5, 46, 2))
        kernel = np.zeros((size, size))
        centre = int(size / 2)
        # ``column_share`` splits the total weight between the centre column
        # and the centre row.
        column_share = np.random.random()
        kernel[:, centre] += 1. / size * column_share
        kernel[centre, :] += 1. / size * (1 - column_share)
        return kernel

    def __call__(self, results):
        """Apply the blur augmentation to the image of every dict.

        Args:
            results (list[dict]): Dicts holding an ``'img'`` entry.

        Returns:
            list[dict]: The same dicts, in order, with ``'img'`` replaced by
            the blurred image whenever the augmentation was drawn.
        """
        outs = []
        for idx, frame in enumerate(results):
            picture = frame['img']
            if self.prob[idx] > np.random.random():
                picture = cv2.filter2D(picture, -1,
                                       self._random_cross_kernel())
            frame['img'] = picture
            outs.append(frame)
        return outs


@PIPELINES.register_module()
class SeqResize(Resize):
    """Resize a sequence of images.

    Please refer to `mmdet.datasets.pipelines.transforms.py:Resize` for the
    detailed docstring of the inherited arguments.

    Args:
        share_params (bool): If True, the scale chosen for the first image
            is reused for all following images. Defaults to True.
    """

    def __init__(self, share_params=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def __call__(self, results):
        """Resize every dict with the parent ``Resize`` transform.

        Args:
            results (list[dict]): Result dicts, one per image.

        Returns:
            list[dict]: The dicts returned by ``Resize.__call__``, in input
            order.
        """
        resize_one = super().__call__
        shared_scale = None
        outs = []
        for idx, frame in enumerate(results):
            is_first = idx == 0
            if self.share_params and not is_first:
                # Pre-set the scale so that it matches the first image.
                frame['scale'] = shared_scale
            frame = resize_one(frame)
            if self.share_params and is_first:
                shared_scale = frame['scale']
            outs.append(frame)
        return outs


@PIPELINES.register_module()
class SeqNormalize(Normalize):
    """Normalize a sequence of images.

    Please refer to `mmdet.datasets.pipelines.transforms.py:Normalize` for
    the detailed docstring of the inherited arguments.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Normalize every dict with the parent ``Normalize`` transform.

        Args:
            results (list[dict]): Result dicts, one per image.

        Returns:
            list[dict]: The dicts returned by ``Normalize.__call__``, in
            input order.
        """
        # Bind the parent call once; zero-argument ``super()`` cannot be used
        # inside the comprehension itself.
        normalize_one = super().__call__
        return [normalize_one(frame) for frame in results]


@PIPELINES.register_module()
class SeqRandomFlip(RandomFlip):
    """Randomly flip a sequence of images.

    Please refer to `mmdet.datasets.pipelines.transforms.py:RandomFlip` for
    the detailed docstring of the inherited arguments.

    Args:
        share_params (bool): If True, one flip decision is drawn per call and
            written to every dict, so all images are flipped identically.
            Defaults to True.
    """

    def __init__(self, share_params=True, *args, **kwargs):
        # ``share_params`` now carries the default its documentation always
        # promised (and that ``SeqResize`` already has); existing positional
        # and keyword callers are unaffected.
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def __call__(self, results):
        """Flip every dict with the parent ``RandomFlip`` transform.

        When ``share_params`` is set, the flip decision is sampled here from
        ``self.direction`` and ``self.flip_ratio`` and stored under the
        ``'flip'`` and ``'flip_direction'`` keys of every dict before the
        parent transform runs. Any value already stored under those keys is
        overwritten.

        Args:
            results (list[dict]): Result dicts, one per image.

        Returns:
            list[dict]: The dicts returned by ``RandomFlip.__call__``, in
            input order.
        """
        if self.share_params:
            # The trailing None entry stands for "do not flip".
            if isinstance(self.direction, list):
                direction_list = self.direction + [None]
            else:
                direction_list = [self.direction, None]

            if isinstance(self.flip_ratio, list):
                non_flip_ratio = 1 - sum(self.flip_ratio)
                flip_ratio_list = self.flip_ratio + [non_flip_ratio]
            else:
                non_flip_ratio = 1 - self.flip_ratio
                # Spread the single ratio evenly over the real directions
                # (everything except the non-flip entry).
                num_directions = len(direction_list) - 1
                single_ratio = self.flip_ratio / num_directions
                flip_ratio_list = ([single_ratio] * num_directions +
                                   [non_flip_ratio])

            cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
            flip = cur_dir is not None

            for _results in results:
                _results['flip'] = flip
                _results['flip_direction'] = cur_dir

        outs = []
        for _results in results:
            _results = super().__call__(_results)
            outs.append(_results)
        return outs


@PIPELINES.register_module()
class SeqPad(Pad):
    """Pad a sequence of images.

    Please refer to `mmdet.datasets.pipelines.transforms.py:Pad` for the
    detailed docstring of the inherited arguments.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Pad every dict with the parent ``Pad`` transform.

        Args:
            results (list[dict]): Result dicts, one per image.

        Returns:
            list[dict]: The dicts returned by ``Pad.__call__``, in input
            order.
        """
        # Bind the parent call once; zero-argument ``super()`` cannot be used
        # inside the comprehension itself.
        pad_one = super().__call__
        return [pad_one(frame) for frame in results]


@PIPELINES.register_module()
class SeqRandomCrop(object):
    """Sequentially random crop the images & bboxes & masks.

    A window of ``crop_size`` is cut out of every image at a random offset;
    bboxes, labels, instance ids, masks and semantic maps are cropped and
    filtered accordingly.

    Args:
        crop_size (tuple): Height and width of the crop window in pixels;
            both must be positive.
        allow_negative_crop (bool, optional): Whether to allow a crop that does
            not contain any bbox area. Default False.
        share_params (bool, optional): Whether to use the same crop offsets
            for all images of a call. Default False.
        bbox_clip_border (bool, optional): Whether clip the objects outside
            the border of the image. Defaults to True.
        check_id_match (bool, optional): When exactly two images are cropped,
            discard the sample if none of the first image's
            ``gt_instance_ids`` appears in the second image. Defaults to True.

    Note:
        - If the image is smaller than the crop size along an axis, the
          offset along that axis is 0 and the image keeps its extent there.
        - The keys for bboxes, labels and masks must be aligned. That is,
          `gt_bboxes` corresponds to `gt_labels`, `gt_instance_ids` and
          `gt_masks`, and `gt_bboxes_ignore` corresponds to
          `gt_labels_ignore`, `gt_instance_ids_ignore` and `gt_masks_ignore`.
          Other bbox fields are cropped without any companion fields.
        - If the crop does not contain any gt-bbox region and
          `allow_negative_crop` is set to False, skip this sample (``None``
          is returned).
    """

    def __init__(self,
                 crop_size,
                 allow_negative_crop=False,
                 share_params=False,
                 bbox_clip_border=True,
                 check_id_match=True
                 ):
        assert crop_size[0] > 0 and crop_size[1] > 0
        self.crop_size = crop_size
        self.allow_negative_crop = allow_negative_crop
        self.share_params = share_params
        self.bbox_clip_border = bbox_clip_border
        self.check_id_match = check_id_match
        # The key correspondence from bboxes to labels and masks.
        self.bbox2label = {
            'gt_bboxes': ['gt_labels', 'gt_instance_ids'],
            'gt_bboxes_ignore': ['gt_labels_ignore', 'gt_instance_ids_ignore']
        }
        self.bbox2mask = {
            'gt_bboxes': 'gt_masks',
            'gt_bboxes_ignore': 'gt_masks_ignore'
        }

    def get_offsets(self, img):
        """Randomly generate the (offset_h, offset_w) pair for cropping.

        Args:
            img (ndarray): Image whose first two axes are height and width.

        Returns:
            tuple[int]: Top and left offsets of the crop window.
        """
        margin_h = max(img.shape[0] - self.crop_size[0], 0)
        margin_w = max(img.shape[1] - self.crop_size[1], 0)
        offset_h = np.random.randint(0, margin_h + 1)
        offset_w = np.random.randint(0, margin_w + 1)
        return offset_h, offset_w

    def random_crop(self, results, offsets=None):
        """Crop the images, bounding boxes, masks and semantic segmentation
        maps of a single result dict.

        Args:
            results (dict): Result dict from loading pipeline. It must hold
                an ``'img_info'`` dict, which receives the used offsets under
                ``'crop_offsets'``.
            offsets (tuple, optional): Pre-defined (offset_h, offset_w) for
                cropping. When None, offsets are sampled per image field.
                Default to None.

        Returns:
            dict | None: Cropped results with ``'img_shape'`` updated, or
            None when no ``gt_bboxes`` survive the crop and
            ``allow_negative_crop`` is False.
        """

        for key in results.get('img_fields', ['img']):
            img = results[key]
            if offsets is not None:
                offset_h, offset_w = offsets
            else:
                offset_h, offset_w = self.get_offsets(img)
            results['img_info']['crop_offsets'] = (offset_h, offset_w)
            crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0]
            crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1]

            # crop the image
            img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
            img_shape = img.shape
            results[key] = img
        results['img_shape'] = img_shape

        # crop bboxes accordingly and clip to the image boundary
        bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
                               dtype=np.float32)
        for key in results.get('bbox_fields', []):
            # e.g. gt_bboxes and gt_bboxes_ignore
            bboxes = results[key] - bbox_offset
            if self.bbox_clip_border:
                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
            valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & (
                bboxes[:, 3] > bboxes[:, 1])
            # If the crop does not contain any gt-bbox area and
            # self.allow_negative_crop is False, skip this image.
            if (key == 'gt_bboxes' and not valid_inds.any()
                    and not self.allow_negative_crop):
                return None
            results[key] = bboxes[valid_inds, :]
            # label fields. e.g. gt_labels and gt_labels_ignore
            # Bbox fields without registered companions (anything other than
            # the two keys above) simply have no labels to filter; fall back
            # to an empty tuple instead of iterating over None.
            for label_key in self.bbox2label.get(key, ()):
                if label_key in results:
                    results[label_key] = results[label_key][valid_inds]

            # mask fields, e.g. gt_masks and gt_masks_ignore
            mask_key = self.bbox2mask.get(key)
            if mask_key in results:
                results[mask_key] = results[mask_key][
                    valid_inds.nonzero()[0]].crop(
                    np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))

        # crop semantic seg
        for key in results.get('seg_fields', []):
            results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]
        return results

    def __call__(self, results):
        """Call function to sequentially randomly crop images, bounding boxes,
        masks, semantic segmentation maps.

        Args:
            results (list[dict]): Result dicts from loading pipeline, one per
                image.

        Returns:
            list[dict] | None: Randomly cropped results with ``'img_shape'``
            updated according to the crop. None is returned when any image
            is rejected by ``random_crop`` or when the instance-id check of
            a two-image sample fails.
        """
        if self.share_params:
            # Offsets are drawn from the first image and reused for the rest.
            offsets = self.get_offsets(results[0]['img'])
        else:
            offsets = None

        outs = []
        for _results in results:
            _results = self.random_crop(_results, offsets)
            if _results is None:
                return None
            outs.append(_results)

        if len(outs) == 2 and self.check_id_match:
            ref_result, result = outs[1], outs[0]
            if self.check_match(ref_result, result):
                return None
        return outs

    def check_match(self, ref_results, results):
        """Tell whether the two results share no instance id at all.

        Args:
            ref_results (dict): Result dict providing the reference
                ``'gt_instance_ids'`` array.
            results (dict): Result dict whose ``'gt_instance_ids'`` are
                looked up in the reference.

        Returns:
            bool: True when none of the ids in ``results`` occurs in
            ``ref_results`` (also True when ``results`` has no ids).
        """
        ref_ids = ref_results['gt_instance_ids'].tolist()
        gt_ids = results['gt_instance_ids'].tolist()
        return not any(i in ref_ids for i in gt_ids)


@PIPELINES.register_module()
class SeqPhotoMetricDistortion(object):
    """Apply photometric distortion to a sequence of images.

    Every step below is enabled with a probability of 0.5. Random contrast
    runs either right after the brightness step or right after the
    conversion back to BGR.

    1. random brightness
    2. random contrast (when drawn to run first)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (when drawn to run last)
    8. randomly swap channels

    Args:
        share_params (bool): If True, one parameter set is sampled per call
            and applied to every image. Defaults to True.
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
    """

    def __init__(self,
                 share_params=True,
                 brightness_delta=32,
                 contrast_range=(0.5, 1.5),
                 saturation_range=(0.5, 1.5),
                 hue_delta=18):
        self.share_params = share_params
        self.brightness_delta = brightness_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range
        self.hue_delta = hue_delta

    def get_params(self):
        """Sample one set of distortion parameters.

        Returns:
            dict: Keys ``delta``, ``contrast_first``, ``alpha``,
            ``saturation``, ``hue`` and ``permutation``. A value of None
            means the corresponding step is skipped.
        """

        def maybe_uniform(low, high):
            # One coin flip decides whether the value is sampled at all.
            if np.random.randint(2):
                return np.random.uniform(low, high)
            return None

        delta = maybe_uniform(-self.brightness_delta, self.brightness_delta)
        mode = np.random.randint(2)
        alpha = maybe_uniform(self.contrast_lower, self.contrast_upper)
        saturation = maybe_uniform(self.saturation_lower,
                                   self.saturation_upper)
        hue = maybe_uniform(-self.hue_delta, self.hue_delta)
        permutation = np.random.permutation(3) if np.random.randint(2) \
            else None

        return dict(
            delta=delta,
            contrast_first=True if mode == 1 else 0,
            alpha=alpha,
            saturation=saturation,
            hue=hue,
            permutation=permutation)

    def photo_metric_distortion(self, results, params=None):
        """Distort the image of a single result dict.

        Args:
            results (dict): Result dict from loading pipeline. It must hold
                an ``'img_info'`` dict, which receives the used parameters
                under ``'color_jitter'``.
            params (dict, optional): Pre-defined parameters as produced by
                ``get_params``. Sampled here when None. Default to None.

        Returns:
            dict: Result dict with the image distorted.
        """
        if params is None:
            params = self.get_params()
        results['img_info']['color_jitter'] = params

        if 'img_fields' in results:
            assert results['img_fields'] == ['img'], \
                'Only single img_fields is allowed'
        img = results['img']
        assert img.dtype == np.float32, \
            'PhotoMetricDistortion needs the input image of dtype np.float32,' \
            ' please set "to_float32=True" in "LoadImageFromFile" pipeline'

        # brightness: shifts the input array in place
        if params['delta'] is not None:
            img += params['delta']

        # contrast before the HSV stage when ``contrast_first`` is truthy
        if params['contrast_first'] and params['alpha'] is not None:
            img *= params['alpha']

        img = mmcv.bgr2hsv(img)

        # saturation lives in channel 1 of the HSV image
        if params['saturation'] is not None:
            img[..., 1] *= params['saturation']

        # hue lives in channel 0 and is wrapped back into [0, 360]
        if params['hue'] is not None:
            hue_channel = img[..., 0]
            hue_channel += params['hue']
            hue_channel[hue_channel > 360] -= 360
            hue_channel[hue_channel < 0] += 360

        img = mmcv.hsv2bgr(img)

        # contrast after the HSV stage otherwise
        if not params['contrast_first'] and params['alpha'] is not None:
            img *= params['alpha']

        # channel swap
        if params['permutation'] is not None:
            img = img[..., params['permutation']]

        results['img'] = img
        return results

    def __call__(self, results):
        """Distort every image of the sequence.

        Args:
            results (list[dict]): Result dicts from loading pipeline, one per
                image.

        Returns:
            list[dict]: Result dicts with images distorted, in input order.
        """
        # With shared parameters a single draw is reused for all images;
        # otherwise each image samples its own set.
        params = self.get_params() if self.share_params else None
        return [
            self.photo_metric_distortion(frame, params) for frame in results
        ]

    def __repr__(self):
        """Return the class name followed by the distortion ranges."""
        contrast = (self.contrast_lower, self.contrast_upper)
        saturation = (self.saturation_lower, self.saturation_upper)
        return (f'{self.__class__.__name__}'
                f'(\nbrightness_delta={self.brightness_delta},\n'
                f'contrast_range={contrast},\n'
                f'saturation_range={saturation},\n'
                f'hue_delta={self.hue_delta})')


@PIPELINES.register_module()
class ResizeWithRef(object):
    """Resize images & bbox & mask, including their reference counterparts.

    The image (and ``ref_img`` when present) is resized to some scale; bboxes
    and masks listed in the regular and ``ref_*`` field lists are resized
    with the same scale factor. If the input dict already contains the key
    "scale", that scale is used, otherwise one is chosen from the settings
    given at construction.

    `img_scale` can either be a tuple (single-scale) or a list of tuple
    (multi-scale). There are 3 multiscale modes:
    - `ratio_range` is not None: randomly sample a ratio from the ratio range
        and multiply it with the image scale.
    - `ratio_range` is None and `multiscale_mode` == "range": randomly sample a
        scale from a range.
    - `ratio_range` is None and `multiscale_mode` == "value": randomly sample a
        scale from multiple scales.

    Args:
        img_scale (tuple or list[tuple]): Images scales for resizing.
        multiscale_mode (str): Either "range" or "value".
        ratio_range (tuple[float]): (min_ratio, max_ratio)
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image.
    """

    def __init__(self,
                 img_scale=None,
                 multiscale_mode='range',
                 ratio_range=None,
                 keep_ratio=True):
        if img_scale is None:
            self.img_scale = None
        else:
            # Always store the scales as a list of tuples.
            self.img_scale = (img_scale if isinstance(img_scale, list) else
                              [img_scale])
            assert mmcv.is_list_of(self.img_scale, tuple)

        if ratio_range is None:
            # multiple scales or a range of scales
            assert multiscale_mode in ['value', 'range']
        else:
            # one base scale combined with a range of ratios
            assert len(self.img_scale) == 1

        self.multiscale_mode = multiscale_mode
        self.ratio_range = ratio_range
        self.keep_ratio = keep_ratio

    @staticmethod
    def random_select(img_scales):
        """Pick one of the candidate scales; return it with its index."""
        assert mmcv.is_list_of(img_scales, tuple)
        chosen = np.random.randint(len(img_scales))
        return img_scales[chosen], chosen

    @staticmethod
    def random_sample(img_scales):
        """Sample a (long_edge, short_edge) scale between two given scales.

        Returns the sampled scale together with None as its index.
        """
        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
        long_edges = [max(s) for s in img_scales]
        short_edges = [min(s) for s in img_scales]
        long_edge = np.random.randint(min(long_edges), max(long_edges) + 1)
        short_edge = np.random.randint(min(short_edges), max(short_edges) + 1)
        return (long_edge, short_edge), None

    @staticmethod
    def random_sample_ratio(img_scale, ratio_range):
        """Scale ``img_scale`` by a ratio drawn from ``ratio_range``.

        Returns the scaled (truncated to int) tuple together with None as
        its index.
        """
        assert isinstance(img_scale, tuple) and len(img_scale) == 2
        low, high = ratio_range
        assert low <= high
        ratio = np.random.random_sample() * (high - low) + low
        return (int(img_scale[0] * ratio), int(img_scale[1] * ratio)), None

    def _random_scale(self, results):
        """Choose the target scale and store it under ``'scale'`` and
        ``'scale_idx'``."""
        if self.ratio_range is not None:
            picked = self.random_sample_ratio(self.img_scale[0],
                                              self.ratio_range)
        elif len(self.img_scale) == 1:
            picked = (self.img_scale[0], 0)
        elif self.multiscale_mode == 'range':
            picked = self.random_sample(self.img_scale)
        elif self.multiscale_mode == 'value':
            picked = self.random_select(self.img_scale)
        else:
            raise NotImplementedError

        results['scale'], results['scale_idx'] = picked

    def _resize_img(self, results):
        """Resize ``ref_img`` (if present) and ``img``; record the shapes,
        scale factor and ``keep_ratio`` flag."""
        names = ['ref_img', 'img'] if 'ref_img' in results else ['img']
        for name in names:
            if self.keep_ratio:
                resized, scale_factor = mmcv.imrescale(
                    results[name], results['scale'], return_scale=True)
            else:
                resized, w_scale, h_scale = mmcv.imresize(
                    results[name], results['scale'], return_scale=True)
                scale_factor = np.array(
                    [w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
            results[name] = resized
        new_shape = resized.shape
        results['img_shape'] = new_shape
        results['pad_shape'] = new_shape  # in case that there is no padding
        results['scale_factor'] = scale_factor
        results['keep_ratio'] = self.keep_ratio

    def _resize_bboxes(self, results):
        """Scale all listed bboxes and clip them to the resized image."""
        if 'ref_bbox_fields' in results:
            groups = ['ref_bbox_fields', 'bbox_fields']
        else:
            groups = ['bbox_fields']
        height, width = results['img_shape'][0], results['img_shape'][1]
        for group in groups:
            for key in results.get(group, []):
                scaled = results[key] * results['scale_factor']
                scaled[:, 0::2] = np.clip(scaled[:, 0::2], 0, width - 1)
                scaled[:, 1::2] = np.clip(scaled[:, 1::2], 0, height - 1)
                results[key] = scaled

    def _resize_masks(self, results):
        """Resize all listed masks with nearest-neighbour interpolation."""
        if 'ref_mask_fields' in results:
            groups = ['ref_mask_fields', 'mask_fields']
        else:
            groups = ['mask_fields']
        for group in groups:
            for key in results.get(group, []):
                if results[key] is None:
                    continue
                if self.keep_ratio:
                    resized = [
                        mmcv.imrescale(
                            mask, results['scale_factor'],
                            interpolation='nearest')
                        for mask in results[key]
                    ]
                else:
                    # mmcv expects the target size as (width, height)
                    target = (results['img_shape'][1],
                              results['img_shape'][0])
                    resized = [
                        mmcv.imresize(mask, target, interpolation='nearest')
                        for mask in results[key]
                    ]
                results[key] = resized

    def __call__(self, results):
        """Resize images, bboxes and masks of ``results`` in place.

        A scale is chosen only when ``results`` has no ``'scale'`` entry yet.
        Semantic segmentation maps are not resized by this transform.
        """
        if 'scale' not in results:
            self._random_scale(results)
        for step in (self._resize_img, self._resize_bboxes,
                     self._resize_masks):
            step(results)
        return results

    def __repr__(self):
        """Return the class name followed by the resize settings."""
        return (f'{self.__class__.__name__}'
                f'(img_scale={self.img_scale}, '
                f'multiscale_mode={self.multiscale_mode}, '
                f'ratio_range={self.ratio_range}, '
                f'keep_ratio={self.keep_ratio})')


@PIPELINES.register_module()
class RandomFlipWithRef(object):
    """Flip the image & bbox & mask, including their reference counterparts.

    If the input dict contains the key "flip", then the flag will be used,
    otherwise it will be randomly decided by a ratio specified in the init
    method.

    Args:
        flip_ratio (float, optional): The flipping probability.
    """

    def __init__(self, flip_ratio=None):
        if flip_ratio is not None:
            assert flip_ratio >= 0 and flip_ratio <= 1
        self.flip_ratio = flip_ratio

    def bbox_flip(self, bboxes, img_shape):
        """Flip bboxes horizontally.

        Args:
            bboxes(ndarray): shape (..., 4*k)
            img_shape(tuple): (height, width)

        Returns:
            ndarray: A flipped copy; the input array is left untouched.
        """
        assert bboxes.shape[-1] % 4 == 0
        width = img_shape[1]
        mirrored = bboxes.copy()
        # The left and right edges swap roles after mirroring.
        mirrored[..., 0::4] = width - bboxes[..., 2::4] - 1
        mirrored[..., 2::4] = width - bboxes[..., 0::4] - 1
        return mirrored

    def __call__(self, results):
        """Flip the contents of ``results`` when its ``'flip'`` flag is set.

        The flag is drawn from ``flip_ratio`` and stored in ``results`` only
        when it is not present yet.
        """
        if 'flip' not in results:
            results['flip'] = bool(np.random.rand() < self.flip_ratio)
        if not results['flip']:
            return results

        # images
        results['img'] = mmcv.imflip(results['img'])
        if 'ref_img' in results:
            results['ref_img'] = mmcv.imflip(results['ref_img'])
        # bboxes, regular fields first and reference fields second
        for group in ('bbox_fields', 'ref_bbox_fields'):
            for key in results.get(group, []):
                results[key] = self.bbox_flip(results[key],
                                              results['img_shape'])
        # masks are mirrored along their second axis
        for group in ('mask_fields', 'ref_mask_fields'):
            for key in results.get(group, []):
                results[key] = [mask[:, ::-1] for mask in results[key]]
        return results

    def __repr__(self):
        """Return the class name followed by the flip ratio."""
        return f'{self.__class__.__name__}(flip_ratio={self.flip_ratio})'


@PIPELINES.register_module()
class PadWithRef(object):
    """Pad the image & mask, including their reference counterparts.

    There are two padding modes: (1) pad to a fixed size and (2) pad to the
    minimum size that is divisible by some number. Exactly one of ``size``
    and ``size_divisor`` must be given.

    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_val (float, optional): Padding value, 0 by default. It is used
            for images (in both modes) and for masks.
    """

    def __init__(self, size=None, size_divisor=None, pad_val=0):
        self.size = size
        self.size_divisor = size_divisor
        self.pad_val = pad_val
        # only one of size and size_divisor should be valid
        assert size is not None or size_divisor is not None
        assert size is None or size_divisor is None

    def _pad_img(self, results):
        """Pad ``ref_img`` (if present) and ``img``; record the pad settings.

        Each image is padded from its own data. Previously the fixed-size
        branch always padded ``results['img']``, which replaced ``ref_img``
        with a padded copy of the current image, and it ignored ``pad_val``.
        """
        els = ['ref_img', 'img'] if 'ref_img' in results else ['img']
        for el in els:
            if self.size is not None:
                # ``shape`` is passed by keyword: it is keyword-only in
                # mmcv>=1.0 and accepted by name in older releases.
                padded_img = mmcv.impad(
                    results[el], shape=self.size, pad_val=self.pad_val)
            else:
                padded_img = mmcv.impad_to_multiple(
                    results[el], self.size_divisor, pad_val=self.pad_val)
            results[el] = padded_img
        results['pad_shape'] = padded_img.shape
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def _pad_masks(self, results):
        """Pad all listed masks to ``results['pad_shape']`` and stack them.

        Mask entries that are None are left untouched, matching
        ``ResizeWithRef._resize_masks``. An empty mask list becomes an empty
        ``(0, H, W)`` uint8 array instead of failing in ``np.stack``.
        """
        els = (['ref_mask_fields', 'mask_fields']
               if 'ref_mask_fields' in results else ['mask_fields'])
        pad_shape = tuple(results['pad_shape'][:2])
        for el in els:
            for key in results.get(el, []):
                if results[key] is None:
                    continue
                padded_masks = [
                    mmcv.impad(mask, shape=pad_shape, pad_val=self.pad_val)
                    for mask in results[key]
                ]
                if padded_masks:
                    results[key] = np.stack(padded_masks, axis=0)
                else:
                    results[key] = np.empty((0,) + pad_shape, dtype=np.uint8)

    def __call__(self, results):
        """Pad the images and masks of ``results`` in place and return it."""
        self._pad_img(results)
        self._pad_masks(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(size={}, size_divisor={}, pad_val={})'.format(
            self.size, self.size_divisor, self.pad_val)
        return repr_str


@PIPELINES.register_module()
class NormalizeWithRef(object):
    """Normalize the image and, when present, the reference image.

    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB,
            default is true.
    """

    def __init__(self, mean, std, to_rgb=True):
        # Both statistics are kept as float32 arrays.
        self.mean, self.std = (np.array(values, dtype=np.float32)
                               for values in (mean, std))
        self.to_rgb = to_rgb

    def __call__(self, results):
        """Normalize ``img`` (and ``ref_img``) and record the settings used
        under ``'img_norm_cfg'``."""
        targets = ['img']
        if 'ref_img' in results:
            targets.append('ref_img')
        for name in targets:
            results[name] = mmcv.imnormalize(
                results[name], self.mean, self.std, self.to_rgb)
        results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
        return results

    def __repr__(self):
        """Return the class name followed by the normalization settings."""
        return (f'{self.__class__.__name__}'
                f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})')


@PIPELINES.register_module()
class RandomCropWithRef(object):
    """Randomly crop an image and its reference image with one shared window.

    Boxes listed in the bbox fields are shifted and clipped to the crop,
    boxes that collapse to zero width or height are dropped together with
    their labels, object ids and masks.

    Args:
        crop_size (tuple): Expected size after cropping, (h, w).
    """

    def __init__(self, crop_size):
        self.crop_size = crop_size

    def __call__(self, results):
        """Crop ``results`` in place.

        Returns:
            dict | None: The updated ``results``, or None when a processed
            box set has no valid box left after cropping.
        """
        crop_h, crop_w = self.crop_size[0], self.crop_size[1]
        img = results['img']

        # Draw the top-left corner: height offset first, then width offset.
        offset_h = np.random.randint(0, max(img.shape[0] - crop_h, 0) + 1)
        offset_w = np.random.randint(0, max(img.shape[1] - crop_w, 0) + 1)
        rows = slice(offset_h, offset_h + crop_h)
        cols = slice(offset_w, offset_w + crop_w)

        # Crop the image and, with the same window, the reference image.
        img = img[rows, cols, :]
        results['img'] = img
        if 'ref_img' in results:
            results['ref_img'] = results['ref_img'][rows, cols, :]
        img_shape = img.shape
        results['img_shape'] = img_shape
        results['crop_coords'] = [rows.start, rows.stop, cols.start, cols.stop]

        # Shift the boxes into crop coordinates and clip to the crop border.
        shift = np.array(
            [offset_w, offset_h, offset_w, offset_h], dtype=np.float32)
        field_lists = ['bbox_fields']
        if 'ref_bbox_fields' in results:
            field_lists.insert(0, 'ref_bbox_fields')
        for field_list in field_lists:
            for key in results.get(field_list, []):
                shifted = results[key] - shift
                shifted[:, 0::2] = np.clip(
                    shifted[:, 0::2], 0, img_shape[1] - 1)
                shifted[:, 1::2] = np.clip(
                    shifted[:, 1::2], 0, img_shape[0] - 1)
                results[key] = shifted

        # Drop boxes that were cropped away, plus their companion entries.
        bbox_keys = ['gt_bboxes']
        if 'ref_bboxes' in results:
            bbox_keys.insert(0, 'ref_bboxes')
        for bbox_key in bbox_keys:
            if bbox_key not in results:
                continue
            boxes = results[bbox_key]
            keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
            if not np.any(keep):
                # Nothing left to supervise on: signal "skip this sample".
                return None
            results[bbox_key] = boxes[keep, :]
            # Labels and object ids are filtered with the same boolean mask.
            for suffix in ('_labels', '_obj_ids'):
                companion = bbox_key.replace('_bboxes', suffix)
                if companion in results:
                    results[companion] = results[companion][keep]
            # Masks are both filtered and cropped; the result is a list.
            mask_key = bbox_key.replace('_bboxes', '_masks')
            if mask_key in results:
                masks = results[mask_key]
                results[mask_key] = [
                    masks[i][rows, cols] for i in np.where(keep)[0]
                ]

        return results

    def __repr__(self):
        return self.__class__.__name__ + '(crop_size={})'.format(
            self.crop_size)


@PIPELINES.register_module()
class PadFutureMMDet:
    """Pad images, instance masks and semantic segmentation maps.

    One padding mode is active at a time: a fixed target ``size``, the next
    multiple of ``size_divisor``, or (``pad_to_square``) a square whose side
    is the longer edge of the image.
    Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor".

    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_to_square (bool): Whether to pad the image into a square.
            Default: False.
        pad_val (dict, optional): Padding values keyed by ``img``, ``masks``
            and ``seg``; the default is `dict(img=0, masks=0, seg=255)`.
            A bare number is still accepted but deprecated.
    """

    def __init__(self,
                 size=None,
                 size_divisor=None,
                 pad_to_square=False,
                 pad_val=dict(img=0, masks=0, seg=255)):
        self.size = size
        self.size_divisor = size_divisor
        if isinstance(pad_val, (float, int)):
            # Legacy scalar form: warn, then expand it to the dict form.
            warnings.warn(
                'pad_val of float type is deprecated now, '
                f'please use pad_val=dict(img={pad_val}, '
                f'masks={pad_val}, seg=255) instead.', DeprecationWarning)
            pad_val = dict(img=pad_val, masks=pad_val, seg=255)
        assert isinstance(pad_val, dict)
        self.pad_val = pad_val
        self.pad_to_square = pad_to_square

        if not pad_to_square:
            assert size is not None or size_divisor is not None, \
                'only one of size and size_divisor should be valid'
            assert size is None or size_divisor is None
        else:
            assert size is None and size_divisor is None, \
                'The size and size_divisor must be None ' \
                'when pad2square is True'

    def _pad_img(self, results):
        """Pad every image field and record the resulting pad metadata."""
        fill = self.pad_val.get('img', 0)
        for key in results.get('img_fields', ['img']):
            image = results[key]
            if self.pad_to_square:
                # The square target is recomputed per image and stored on
                # the instance, so ``self.size`` changes between calls.
                side = max(image.shape[:2])
                self.size = (side, side)
            if self.size is not None:
                padded_img = mmcv.impad(
                    image, shape=self.size, pad_val=fill)
            elif self.size_divisor is not None:
                padded_img = mmcv.impad_to_multiple(
                    image, self.size_divisor, pad_val=fill)
            results[key] = padded_img
        # The shape of the last padded image describes the whole sample.
        results['pad_shape'] = padded_img.shape
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def _pad_masks(self, results):
        """Pad each mask field to the spatial part of ``pad_shape``."""
        target_hw = results['pad_shape'][:2]
        fill = self.pad_val.get('masks', 0)
        for key in results.get('mask_fields', []):
            masks = results[key]
            results[key] = masks.pad(target_hw, pad_val=fill)

    def _pad_seg(self, results):
        """Pad each semantic segmentation map to the spatial part of
        ``pad_shape``."""
        fill = self.pad_val.get('seg', 255)
        for key in results.get('seg_fields', []):
            seg_map = results[key]
            results[key] = mmcv.impad(
                seg_map, shape=results['pad_shape'][:2], pad_val=fill)

    def __call__(self, results):
        """Pad images, then masks, then semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Updated result dict.
        """
        for step in (self._pad_img, self._pad_masks, self._pad_seg):
            step(results)
        return results

    def __repr__(self):
        pieces = [
            self.__class__.__name__,
            f'(size={self.size}, ',
            f'size_divisor={self.size_divisor}, ',
            f'pad_to_square={self.pad_to_square}, ',
            f'pad_val={self.pad_val})',
        ]
        return ''.join(pieces)


@PIPELINES.register_module()
class KNetInsAdapter:
    """Shift instance class ids down by the number of stuff classes.

    Converts city-style instance class ids to coco-style instance ids
    (11-starting to 0-starting with the default ``stuff_nums``).

    Args:
        stuff_nums (int): Offset subtracted from every label. Default: 11.
    """

    def __init__(self, stuff_nums=11):
        self.stuff_nums = stuff_nums

    def __call__(self, results):
        """Subtract ``stuff_nums`` from ``results['gt_labels']`` in place.

        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Updated result dict.
        """
        labels = results['gt_labels']
        labels -= self.stuff_nums
        results['gt_labels'] = labels
        return results


@PIPELINES.register_module()
class KNetInsAdapterCherryPick:
    """Remap a cherry-picked subset of city-style instance class ids.

    The i-th id in ``cherry`` is first lowered by ``i``; afterwards every
    label is lowered by ``stuff_nums``.  With the defaults this maps
    11 -> 0 and 13 -> 1.

    NOTE(review): the per-position shift is applied sequentially on the
    already modified labels, so the output ids are contiguous only for
    spacings like the default ``(11, 13)`` -- confirm the intended mapping
    before passing a different ``cherry``.

    Args:
        stuff_nums (int): Offset subtracted from every label. Default: 11.
        cherry (tuple[int]): Class ids to remap. Default: (11, 13).
    """

    def __init__(self, stuff_nums=11, cherry=(11, 13)):
        self.cherry = cherry
        self.stuff_nums = stuff_nums

    def __call__(self, results):
        """Remap ``results['gt_labels']`` in place.

        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Updated result dict.
        """
        labels = results['gt_labels']
        for bias, ch in enumerate(self.cherry):
            # The comparison sees the labels as changed by earlier rounds.
            labels[labels == ch] -= bias
        labels -= self.stuff_nums
        results['gt_labels'] = labels
        return results


================================================
FILE: external/evalhooks.py
================================================
import os.path as osp
import warnings
from math import inf

import mmcv
import torch.distributed as dist
from mmcv.runner import Hook
from mmdet.utils import get_root_logger
from torch.nn.modules.batchnorm import _BatchNorm
from torch.utils.data import DataLoader

from external.test import multi_gpu_test, single_gpu_test


class EvalHook(Hook):
    """Hook that evaluates the model on a dataloader during training.

    Notes:
        If new arguments are added for EvalHook, tools/test.py,
        tools/analysis_tools/eval_metric.py may be affected.

    Attributes:
        dataloader (DataLoader): A PyTorch dataloader.
        start (int, optional): First epoch at which to evaluate. When set,
            an evaluation also runs before training if ``start`` <= the
            resuming epoch. If None, only ``interval`` decides.
            Default: None.
        interval (int): Evaluation interval, counted in epochs or in
            iterations depending on ``by_epoch``. Default: 1.
        by_epoch (bool): Evaluate after epochs (True) or after iterations
            (False). Default: True.
        save_best (str, optional): Name of the metric used to track the best
            checkpoint, e.g. ``bbox_mAP``, ``segm_mAP`` or ``AR@100``. With
            ``auto`` the first key of the evaluation result is used. The
            interval of ``CheckpointHook`` should divide that of EvalHook.
            Default: None.
        rule (str, optional): Comparison rule for the best score, 'greater'
            or 'less'. If None it is inferred from the metric name: names
            containing 'mAP' or 'AR' use 'greater', names containing 'loss'
            use 'less'. Default: None.
        **eval_kwargs: Evaluation arguments fed into the evaluate function of
            the dataset.
    """

    rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y}
    init_value_map = {'greater': -inf, 'less': inf}
    greater_keys = ['mAP', 'AR']
    less_keys = ['loss']

    def __init__(self,
                 dataloader,
                 start=None,
                 interval=1,
                 by_epoch=True,
                 save_best=None,
                 rule=None,
                 **eval_kwargs):
        if not isinstance(dataloader, DataLoader):
            raise TypeError('dataloader must be a pytorch DataLoader, but got'
                            f' {type(dataloader)}')
        if not interval > 0:
            raise ValueError(f'interval must be positive, but got {interval}')
        if start is not None and start < 0:
            # A negative start is clamped rather than rejected.
            warnings.warn(
                f'The evaluation start epoch {start} is smaller than 0, '
                f'use 0 instead', UserWarning)
            start = 0

        self.dataloader = dataloader
        self.interval = interval
        self.by_epoch = by_epoch
        self.start = start
        assert save_best is None or isinstance(save_best, str)
        self.save_best = save_best
        self.eval_kwargs = eval_kwargs
        # True until the first ``before_train_epoch`` call has been handled.
        self.initial_epoch_flag = True

        self.logger = get_root_logger()

        if self.save_best is not None:
            self._init_rule(rule, self.save_best)

    def _init_rule(self, rule, key_indicator):
        """Set ``rule``, ``key_indicator`` and, if known, ``compare_func``.

        Args:
            rule (str | None): Comparison rule for best score.
            key_indicator (str | None): Metric name used to infer the rule
                when ``rule`` is None; 'auto' defers the inference.

        Raises:
            KeyError: If ``rule`` is neither None nor a key of ``rule_map``.
            ValueError: If the rule cannot be inferred from the metric name.
        """
        if rule is not None and rule not in self.rule_map:
            raise KeyError(f'rule must be greater, less or None, '
                           f'but got {rule}.')

        if rule is None and key_indicator != 'auto':
            if any(key in key_indicator for key in self.greater_keys):
                rule = 'greater'
            elif any(key in key_indicator for key in self.less_keys):
                rule = 'less'
            else:
                raise ValueError(f'Cannot infer the rule for key '
                                 f'{key_indicator}, thus a specific rule '
                                 f'must be specified.')

        self.rule = rule
        self.key_indicator = key_indicator
        if self.rule is not None:
            self.compare_func = self.rule_map[self.rule]

    def before_run(self, runner):
        """Make sure ``runner.meta['hook_msgs']`` exists when tracking the
        best checkpoint."""
        if self.save_best is None:
            return
        if runner.meta is None:
            warnings.warn('runner.meta is None. Creating a empty one.')
            runner.meta = dict()
        runner.meta.setdefault('hook_msgs', dict())

    def before_train_epoch(self, runner):
        """Evaluate once before the first trained epoch if ``start`` has
        already been reached (e.g. when resuming)."""
        if self.initial_epoch_flag:
            start_reached = (
                self.start is not None and runner.epoch >= self.start)
            if start_reached:
                self.after_train_epoch(runner)
            self.initial_epoch_flag = False

    def evaluation_flag(self, runner):
        """Decide whether to evaluate after the current epoch.

        Returns:
            bool: The flag indicating whether to perform evaluation.
        """
        if self.start is None:
            # Without a start epoch only the interval matters.
            return bool(self.every_n_epochs(runner, self.interval))
        finished_epochs = runner.epoch + 1
        if finished_epochs < self.start:
            # Start epoch not reached yet.
            return False
        # E.g. start==3 and interval==2 evaluates at epochs 3, 5, 7, ...
        return not ((finished_epochs - self.start) % self.interval)

    def after_train_epoch(self, runner):
        """Run the epoch-based evaluation when it is due."""
        if self.by_epoch and self.evaluation_flag(runner):
            results = single_gpu_test(
                runner.model, self.dataloader, show=False)
            key_score = self.evaluate(runner, results)
            if self.save_best:
                self.save_best_checkpoint(runner, key_score)

    def after_train_iter(self, runner):
        """Run the iteration-based evaluation when it is due."""
        if not self.by_epoch and self.every_n_iters(runner, self.interval):
            results = single_gpu_test(
                runner.model, self.dataloader, show=False)
            key_score = self.evaluate(runner, results)
            if self.save_best:
                self.save_best_checkpoint(runner, key_score)

    def save_best_checkpoint(self, runner, key_score):
        """Record and symlink the latest checkpoint if ``key_score`` beats
        the best score stored in ``runner.meta['hook_msgs']``."""
        hook_msgs = runner.meta['hook_msgs']
        best_score = hook_msgs.get(
            'best_score', self.init_value_map[self.rule])
        if not self.compare_func(key_score, best_score):
            return
        best_score = key_score
        hook_msgs['best_score'] = best_score
        last_ckpt = hook_msgs['last_ckpt']
        hook_msgs['best_ckpt'] = last_ckpt
        mmcv.symlink(
            last_ckpt,
            osp.join(runner.work_dir, f'best_{self.key_indicator}.pth'))
        if self.by_epoch:
            time_stamp = runner.epoch + 1
        else:
            time_stamp = runner.iter + 1
        self.logger.info(f'Now best checkpoint is epoch_{time_stamp}.pth.'
                         f'Best {self.key_indicator} is {best_score:0.4f}')

    def evaluate(self, runner, results):
        """Evaluate ``results`` with the dataset and log every metric.

        Returns:
            The value of the tracked metric when ``save_best`` is set,
            otherwise None.
        """
        eval_res = self.dataloader.dataset.evaluate(
            results, logger=runner.logger, **self.eval_kwargs)
        log_output = runner.log_buffer.output
        for metric, value in eval_res.items():
            log_output[metric] = value
        runner.log_buffer.ready = True
        if self.save_best is None:
            return None
        if self.key_indicator == 'auto':
            # Resolve 'auto' to the first reported metric.
            self._init_rule(self.rule, list(eval_res.keys())[0])
        return eval_res[self.key_indicator]


class DistEvalHook(EvalHook):
    """Evaluation hook for distributed training.

    Notes:
        If new arguments are added, tools/test.py may be affected.

    Attributes:
        dataloader (DataLoader): A PyTorch dataloader.
        start (int, optional): First epoch at which to evaluate. When set,
            an evaluation also runs before training if ``start`` <= the
            resuming epoch. If None, only ``interval`` decides.
            Default: None.
        interval (int): Evaluation interval, counted in epochs or in
            iterations depending on ``by_epoch``. Default: 1.
        by_epoch (bool): Evaluate after epochs (True) or after iterations
            (False). Default: True.
        tmpdir (str | None): Temporary directory handed to
            ``multi_gpu_test``; when None, ``<work_dir>/.eval_hook`` is
            used. Default: None.
        gpu_collect (bool): Forwarded to ``multi_gpu_test``; whether to use
            gpu or cpu to collect results. Default: False.
        save_best (str, optional): Name of the metric used to track the best
            checkpoint, e.g. ``bbox_mAP``, ``segm_mAP`` or ``AR@100``. With
            ``auto`` the first key of the evaluation result is used. The
            interval of ``CheckpointHook`` should divide that of EvalHook.
            Default: None.
        rule (str | None): Comparison rule for best score. If set to None,
            it will infer a reasonable rule. Default: None.
        broadcast_bn_buffer (bool): Whether to broadcast the
            buffer(running_mean and running_var) of rank 0 to other rank
            before evaluation. Default: True.
        **eval_kwargs: Evaluation arguments fed into the evaluate function of
            the dataset.
    """

    def __init__(self,
                 dataloader,
                 start=None,
                 interval=1,
                 by_epoch=True,
                 tmpdir=None,
                 gpu_collect=False,
                 save_best=None,
                 rule=None,
                 broadcast_bn_buffer=True,
                 **eval_kwargs):
        super().__init__(
            dataloader,
            start=start,
            interval=interval,
            by_epoch=by_epoch,
            save_best=save_best,
            rule=rule,
            **eval_kwargs)
        self.broadcast_bn_buffer = broadcast_bn_buffer
        self.tmpdir = tmpdir
        self.gpu_collect = gpu_collect

    def _broadcast_bn_buffer(self, runner):
        """Broadcast rank 0's BatchNorm running statistics to all ranks."""
        # Synchronization of BatchNorm's buffer (running_mean
        # and running_var) is not supported in the DDP of pytorch,
        # which may cause the inconsistent performance of models in
        # different ranks, so the buffers of rank 0 are broadcast.
        if not self.broadcast_bn_buffer:
            return
        for _, module in runner.model.named_modules():
            if isinstance(module, _BatchNorm) and module.track_running_stats:
                dist.broadcast(module.running_var, 0)
                dist.broadcast(module.running_mean, 0)

    def after_train_epoch(self, runner):
        """Run the epoch-based distributed evaluation when it is due."""
        if not (self.by_epoch and self.evaluation_flag(runner)):
            return

        if self.broadcast_bn_buffer:
            self._broadcast_bn_buffer(runner)

        if self.tmpdir is None:
            tmpdir = osp.join(runner.work_dir, '.eval_hook')
        else:
            tmpdir = self.tmpdir
        results = multi_gpu_test(
            runner.model,
            self.dataloader,
            tmpdir=tmpdir,
            gpu_collect=self.gpu_collect)
        # Only rank 0 scores the collected results.
        if runner.rank != 0:
            return
        print('\n')
        key_score = self.evaluate(runner, results)
        if self.save_best:
            self.save_best_checkpoint(runner, key_score)

    def after_train_iter(self, runner):
        """Run the iteration-based distributed evaluation when it is due."""
        if self.by_epoch:
            return
        if not self.every_n_iters(runner, self.interval):
            return

        if self.broadcast_bn_buffer:
            self._broadcast_bn_buffer(runner)

        if self.tmpdir is None:
            tmpdir = osp.join(runner.work_dir, '.eval_hook')
        else:
            tmpdir = self.tmpdir
        results = multi_gpu_test(
            runner.model,
            self.dataloader,
            tmpdir=tmpdir,
            gpu_collect=self.gpu_collect)
        # Only rank 0 scores the collected results.
        if runner.rank != 0:
            return
        print('\n')
        key_score = self.evaluate(runner, results)
        if self.save_best:
            self.save_best_checkpoint(runner, key_score)


================================================
FILE: external/ext/mask.py
================================================
__author__ = 'tsungyi'

import pycocotools._mask as _mask

# Interface for manipulating masks stored in RLE format.
#
# RLE is a simple yet efficient format for storing binary masks. RLE
# first divides a vector (or vectorized image) into a series of piecewise
# constant regions and then for each piece simply stores the length of
# that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would
# be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1]
# (note that the odd counts are always the numbers of zeros). Instead of
# storing the counts directly, additional compression is achieved with a
# variable bitrate representation based on a common scheme called LEB128.
#
# Compression is greatest given large piecewise constant regions.
# Specifically, the size of the RLE is proportional to the number of
# *boundaries* in M (or for an image the number of boundaries in the y
# direction). Assuming fairly simple shapes, the RLE representation is
# O(sqrt(n)) where n is number of pixels in the object. Hence space usage
# is substantially lower, especially for large simple objects (large n).
#
# Many common operations on masks can be computed directly using the RLE
# (without need for decoding). This includes computations such as area,
# union, intersection, etc. All of these operations are linear in the
# size of the RLE, in other words they are O(sqrt(n)) where n is the area
# of the object. Computing these operations on the original mask is O(n).
# Thus, using the RLE can result in substantial computational savings.
#
# The following API functions are defined:
#  encode         - Encode binary masks using RLE.
#  decode         - Decode binary masks encoded via RLE.
#  merge          - Compute union or intersection of encoded masks.
#  iou            - Compute intersection over union between masks.
#  area           - Compute area of encoded masks.
#  toBbox         - Get bounding boxes surrounding encoded masks.
#  frPyObjects    - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask.
#
# Usage:
#  Rs     = encode( masks )
#  masks  = decode( Rs )
#  R      = merge( Rs, intersect=false )
#  o      = iou( dt, gt, iscrowd )
#  a      = area( Rs )
#  bbs    = toBbox( Rs )
#  Rs     = frPyObjects( [pyObjects], h, w )
#
# In the API the following formats are used:
#  Rs      - [dict] Run-length encoding of binary masks
#  R       - dict Run-length encoding of binary mask
#  masks   - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order)
#  iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore
#  bbs     - [nx4] Bounding box(es) stored as [x y w h]
#  poly    - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list)
#  dt,gt   - May be either bounding boxes or encoded masks
# Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel).
#
# Finally, a note about the intersection over union (iou) computation.
# The standard iou of a ground truth (gt) and detected (dt) object is
#  iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt))
# For "crowd" regions, we use a modified criterion. If a gt object is
# marked as "iscrowd", we allow a dt to match any subregion of the gt.
# Choosing gt' in the crowd gt that best matches the dt can be done using
# gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing
#  iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt)
# For crowd gt regions we use the modified criterion above for the iou.
#
# To compile run "python setup.py build_ext --inplace"
# Please do not contact us for help with compiling.
#
# Microsoft COCO Toolbox.      version 2.0
# Data, paper, and tutorials available at:  http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
# Licensed under the Simplified BSD License [see coco/license.txt]

# Re-export the routines of the compiled ``pycocotools._mask`` module that
# are used as-is, without a Python-side wrapper (see the API list above).
iou         = _mask.iou
merge       = _mask.merge
frPyObjects = _mask.frPyObjects

def encode(bimask):
    """Encode binary mask(s) with ``_mask.encode``.

    Args:
        bimask: Array-like with a ``shape``; either a 3-D stack of masks or
            a single 2-D mask.

    Returns:
        For 3-D input, the result of ``_mask.encode``; for 2-D input, the
        first element of the result for the mask reshaped to (h, w, 1) in
        Fortran order; None for any other number of dimensions.
    """
    rank = len(bimask.shape)
    if rank == 3:
        return _mask.encode(bimask)
    if rank == 2:
        h, w = bimask.shape
        stacked = bimask.reshape((h, w, 1), order='F')
        return _mask.encode(stacked)[0]
    return None

def decode(rleObjs):
    """Decode run-length-encoded mask(s) with ``_mask.decode``.

    Args:
        rleObjs (list | object): A list of RLE objects or a single RLE
            object. List subclasses are treated as lists.

    Returns:
        For a list, the result of ``_mask.decode``; for a single object,
        the ``[:, :, 0]`` slice of the result for the one-element list.
    """
    # isinstance (not an exact type comparison) so list subclasses are not
    # mistaken for a single RLE object and wrapped a second time.
    if isinstance(rleObjs, list):
        return _mask.decode(rleObjs)
    return _mask.decode([rleObjs])[:, :, 0]

def area(rleObjs):
    """Compute the area of run-length-encoded mask(s) with ``_mask.area``.

    Args:
        rleObjs (list | object): A list of RLE objects or a single RLE
            object. List subclasses are treated as lists.

    Returns:
        For a list, the result of ``_mask.area``; for a single object, the
        first element of the result for the one-element list.
    """
    # isinstance (not an exact type comparison) so list subclasses are not
    # mistaken for a single RLE object and wrapped a second time.
    if isinstance(rleObjs, list):
        return _mask.area(rleObjs)
    return _mask.area([rleObjs])[0]

def toBbox(rleObjs):
    """Get bounding box(es) of run-length-encoded mask(s) via
    ``_mask.toBbox``.

    Args:
        rleObjs (list | object): A list of RLE objects or a single RLE
            object. List subclasses are treated as lists.

    Returns:
        For a list, the result of ``_mask.toBbox``; for a single object,
        the first element of the result for the one-element list.
    """
    # isinstance (not an exact type comparison) so list subclasses are not
    # mistaken for a single RLE object and wrapped a second time.
    if isinstance(rleObjs, list):
        return _mask.toBbox(rleObjs)
    return _mask.toBbox([rleObjs])[0]

================================================
FILE: external/ext/ytvos.py
================================================
__author__ = 'ychfan'
# Interface for accessing the YouTubeVIS dataset.

# The following API functions are defined:
#  YTVOS       - YTVOS api class that loads a YouTubeVIS annotation file and prepares data structures.
#  decodeMask - Decode binary mask M encoded via run-length encoding.
#  encodeMask - Encode binary mask M using run-length encoding.
#  getAnnIds  - Get ann ids that satisfy given filter conditions.
#  getCatIds  - Get cat ids that satisfy given filter conditions.
#  getImgIds  - Get img ids that satisfy given filter conditions.
#  loadAnns   - Load anns with the specified ids.
#  loadCats   - Load cats with the specified ids.
#  loadImgs   - Load imgs with the specified ids.
#  annToMask  - Convert segmentation in an annotation to binary mask.
#  loadRes    - Load algorithm results and create API for accessing them.

# Microsoft COCO Toolbox.      version 2.0
# Data, paper, and tutorials available at:  http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
# Licensed under the Simplified BSD License [see bsd.txt]

import json
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon
import numpy as np
import copy
import itertools
from . import mask as maskUtils
import os
from collections import defaultdict
import sys
PYTHON_VERSION = sys.version_info[0]


def _isArrayLike(obj):
    return hasattr(obj, '__iter__') and hasattr(obj, '__len__')


class YTVOS:
    def __init__(self, annotation_file=None):
        """
        Constructor of Microsoft COCO helper class for reading and visualizing annotations.
        :param annotation_file (str): location of annotation file
        :param image_folder (str): location to the folder that hosts images.
        :return:
        """
        # load dataset
        self.dataset,self.anns,self.cats,self.vids = dict(),dict(),dict(),dict()
        self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list)
        if not annotation_file == None:
            print('loading annotations into memory...')
            tic = time.time()
            dataset = json.load(open(annotation_file, 'r'))
            assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
            print('Done (t={:0.2f}s)'.format(time.time()- tic))
            self.dataset = dataset
            self.createIndex()

    def createIndex(self):
        # create index
        print('creating index...')
        anns, cats, vids = {}, {}, {}
        vidToAnns,catToVids = defaultdict(list),defaultdict(list)
        if 'annotations' in self.dataset:
            for ann in self.dataset['annotations']:
                vidToAnns[ann['video_id']].append(ann)
                anns[ann['id']] = ann

        if 'videos' in self.dataset:
            for vid in self.dataset['videos']:
                vids[vid['id']] = vid

        if 'categories' in self.dataset:
            for cat in self.dataset['categories']:
                cats[cat['id']] = cat

        if 'annotations' in self.dataset and 'categories' in self.dataset:
            for ann in self.dataset['annotations']:
                catToVids[ann['category_id']].append(ann['video_id'])

        print('index created!')

        # create class members
        self.anns = anns
        self.vidToAnns = vidToAnns
        self.catToVids = catToVids
        self.vids = vids
        self.cats = cats

    def info(self):
        """
        Print information about the annotation file.
        :return:
        """
        for key, value in self.dataset['info'].items():
            print('{}: {}'.format(key, value))

    def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None):
        """
        Get ann ids that satisfy given filter conditions. default skips that filter
        :param vidIds  (int array)     : get anns for given vids
               catIds  (int array)     : get anns for given cats
               areaRng (float array)   : get anns for given area range (e.g. [0 inf])
               iscrowd (boolean)       : get anns for given crowd label (False or True)
        :return: ids (int array)       : integer array of ann ids
        """
        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        if len(vidIds) == len(catIds) == len(areaRng) == 0:
            anns = self.dataset['annotations']
        else:
            if not len(vidIds) == 0:
                lists = [self.vidToAnns[vidId] for vidId in vidIds if vidId in self.vidToAnns]
                anns = list(itertools.chain.from_iterable(lists))
            else:
                anns = self.dataset['annotations']
            anns = anns if len(catIds)  == 0 else [ann for ann in anns if ann['category_id'] in catIds]
            anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['avg_area'] > areaRng[0] and ann['avg_area'] < areaRng[1]]
        if not iscrowd == None:
            ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
        else:
            ids = [ann['id'] for ann in anns]
        return ids

    def getCatIds(self, catNms=[], supNms=[], catIds=[]):
        """
        filtering parameters. default skips that filter.
        :param catNms (str array)  : get cats for given cat names
        :param supNms (str array)  : get cats for given supercategory names
        :param catIds (int array)  : get cats for given cat ids
        :return: ids (int array)   : integer array of cat ids
        """
        catNms = catNms if _isArrayLike(catNms) else [catNms]
        supNms = supNms if _isArrayLike(supNms) else [supNms]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        if len(catNms) == len(supNms) == len(catIds) == 0:
            cats = self.dataset['categories']
        else:
            cats = self.dataset['categories']
            cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name']          in catNms]
            cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
            cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id']            in catIds]
        ids = [cat['id'] for cat in cats]
        return ids

    def     getVidIds(self, vidIds=[], catIds=[]):
        '''
        Get vid ids that satisfy given filter conditions.
        :param vidIds (int array) : get vids for given ids
        :param catIds (int array) : get vids with all given cats
        :return: ids (int array)  : integer array of vid ids
        '''
        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
        catIds = catIds if _isArrayLike(catIds) else [catIds]

        if len(vidIds) == len(catIds) == 0:
            ids = self.vids.keys()
        else:
            ids = set(vidIds)
            for i, catId in enumerate(catIds):
                if i == 0 and len(ids) == 0:
                    ids = set(self.catToVids[catId])
                else:
                    ids &= set(self.catToVids[catId])
        return list(ids)

    def loadAnns(self, ids=[]):
        """
        Load anns with the specified ids.
        :param ids (int array)       : integer ids specifying anns
        :return: anns (object array) : loaded ann objects
        """
        if _isArrayLike(ids):
            return [self.anns[id] for id in ids]
        elif type(ids) == int:
            return [self.anns[ids]]

    def loadCats(self, ids=[]):
        """
        Load cats with the specified ids.
        :param ids (int array)       : integer ids specifying cats
        :return: cats (object array) : loaded cat objects
        """
        if _isArrayLike(ids):
            return [self.cats[id] for id in ids]
        elif type(ids) == int:
            return [self.cats[ids]]

    def loadVids(self, ids=[]):
        """
        Load anns with the specified ids.
        :param ids (int array)       : integer ids specifying vid
        :return: vids (object array) : loaded vid objects
        """
        if _isArrayLike(ids):
            return [self.vids[id] for id in ids]
        elif type(ids) == int:
            return [self.vids[ids]]


    def loadRes(self, resFile):
        """
        Load result file and return a result api object.
        :param   resFile (str)     : file name of result file
        :return: res (obj)         : result api object
        """
        res = YTVOS()
        res.dataset['videos'] = [img for img in self.dataset['videos']]

        print('Loading and preparing results...')
        tic = time.time()
        if type(resFile) == str or type(resFile) == unicode:
            anns = json.load(open(resFile))
        elif type(resFile) == np.ndarray:
            anns = self.loadNumpyAnnotations(resFile)
        else:
            anns = resFile
        assert type(anns) == list, 'results in not an array of objects'
        annsVidIds = [ann['video_id'] for ann in anns]
        assert set(annsVidIds) == (set(annsVidIds) & set(self.getVidIds())), \
               'Results do not correspond to current coco set'
        if 'segmentations' in anns[0]:
            res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
            for id, ann in enumerate(anns):
                ann['areas'] = []
                if not 'bboxes' in ann:
                    ann['bboxes'] = []
                for seg in ann['segmentations']:
                    # now only support compressed RLE format as segmentation results
                    if seg:
                        ann['areas'].append(maskUtils.area(seg))
                        if len(ann['bboxes']) < len(ann['areas']):
                            ann['bboxes'].append(maskUtils.toBbox(seg))
                    else:
                        ann['areas'].append(None)
                        if len(ann['bboxes']) < len(ann['areas']):
                            ann['bboxes'].append(None)
                ann['id'] = id+1
                l = [a for a in ann['areas'] if a]
                if len(l)==0:
                  ann['avg_area'] = 0
                else:
                  ann['avg_area'] = np.array(l).mean()
                ann['iscrowd'] = 0
        print('DONE (t={:0.2f}s)'.format(time.time()- tic))

        res.dataset['annotations'] = anns
        res.createIndex()
        return res

    def annToRLE(self, ann, frameId):
        """
        Convert annotation which can be polygons, uncompressed RLE to RLE.
        :return: binary mask (numpy 2D array)
        """
        t = self.vids[ann['video_id']]
        h, w = t['height'], t['width']
        segm = ann['segmentations'][frameId]
        if type(segm) == list:
            # polygon -- a single object might consist of multiple parts
            # we merge all parts into one mask rle code
            rles = maskUtils.frPyObjects(segm, h, w)
            rle = maskUtils.merge(rles)
        elif type(segm['counts']) == list:
            # uncompressed RLE
            rle = maskUtils.frPyObjects(segm, h, w)
        else:
            # rle
            rle = segm
        return rle

    def annToMask(self, ann, frameId):
        """
        Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
        :return: binary mask (numpy 2D array)
        """
        rle = self.annToRLE(ann, frameId)
        m = maskUtils.decode(rle)
        return m

================================================
FILE: external/fcn_mask_head.py
================================================
import numpy as np
import torch
from mmdet.models.builder import HEADS
from mmdet.models.roi_heads.mask_heads.fcn_mask_head import (FCNMaskHead,
                                                             _do_paste_mask)

# Bytes assumed per mask element when estimating the memory needed to paste
# N masks of size img_h x img_w; used together with GPU_MEM_LIMIT to pick the
# number of chunks in InstanceMaskHead.get_seg_masks.
BYTES_PER_FLOAT = 4
# TODO: This memory limit may be too much or too little. It would be better to
# determine it based on available resources.
GPU_MEM_LIMIT = 1024**3  # 1 GB memory limit


@HEADS.register_module()
class InstanceMaskHead(FCNMaskHead):
    """FCNMaskHead variant whose ``get_seg_masks`` returns a mask tensor."""

    def __init__(self, **kwargs):
        # No extra configuration; every keyword is forwarded to FCNMaskHead.
        super(InstanceMaskHead, self).__init__(**kwargs)

    def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,
                      ori_shape, scale_factor, rescale):
        """Get segmentation masks from mask_pred and bboxes.

        The only difference from InstanceMaskHead and FCNMaskHead is the output
        format of instance masks. The original FCNMaskHead return numpy masks,
        while this method returns the pasted masks as one tensor.

        Args:
            mask_pred (Tensor or ndarray): shape (n, #class, h, w).
                For single-scale testing, mask_pred is the direct output of
                model, whose type is Tensor, while for multi-scale testing,
                it will be converted to numpy array outside of this method.
                A Tensor input is passed through ``sigmoid`` here; any other
                input is only converted to a tensor.
            det_bboxes (Tensor): shape (n, 4/5)
            det_labels (Tensor): shape (n, )
            rcnn_test_cfg (dict): rcnn testing config; ``mask_thr_binary`` is
                read from it.
            ori_shape (Tuple): original image height and width, shape (2,)
            scale_factor(float | Tensor): If ``rescale is True``, box
                coordinates are divided by this scale factor to fit
                ``ori_shape``.
            rescale (bool): If True, the resulting masks will be rescaled to
                ``ori_shape``.

        Returns:
            Tensor: pasted masks of shape (n, img_h, img_w). The dtype is
                ``torch.bool`` when ``mask_thr_binary >= 0`` (masks are
                thresholded) and ``torch.uint8`` otherwise (scores scaled by
                255). During ONNX export the output of torchvision's
                ``paste_masks_in_image`` is returned instead, thresholded
                only when ``mask_thr_binary > 0``.

        Example:
            >>> # Adapted from the FCNMaskHead example; requires mmcv/mmdet.
            >>> import mmcv
            >>> N = 7  # N = number of extracted ROIs
            >>> C, H, W = 11, 32, 32
            >>> # Create example instance of the mask head.
            >>> self = InstanceMaskHead(num_classes=C, num_convs=0)
            >>> inputs = torch.rand(N, self.in_channels, H, W)
            >>> mask_pred = self.forward(inputs)
            >>> # Each input is associated with some bounding box
            >>> det_bboxes = torch.Tensor([[1, 1, 42, 42 ]] * N)
            >>> det_labels = torch.randint(0, C, size=(N,))
            >>> rcnn_test_cfg = mmcv.Config({'mask_thr_binary': 0, })
            >>> ori_shape = (H * 4, W * 4)
            >>> scale_factor = torch.FloatTensor((1, 1))
            >>> rescale = False
            >>> # One full-image mask per ROI.
            >>> masks = self.get_seg_masks(
            >>>     mask_pred, det_bboxes, det_labels, rcnn_test_cfg, ori_shape,
            >>>     scale_factor, rescale
            >>> )
            >>> assert tuple(masks.shape) == (N, H * 4, W * 4)
        """
        if isinstance(mask_pred, torch.Tensor):
            mask_pred = mask_pred.sigmoid()
        else:
            # Non-tensor input is only moved onto det_bboxes' device/dtype;
            # no sigmoid is applied on this path.
            mask_pred = det_bboxes.new_tensor(mask_pred)

        device = mask_pred.device
        bboxes = det_bboxes[:, :4]
        labels = det_labels

        if rescale:
            img_h, img_w = ori_shape[:2]
        else:
            # Paste at the scaled image size instead of the original size.
            if isinstance(scale_factor, float):
                img_h = np.round(ori_shape[0] * scale_factor).astype(np.int32)
                img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32)
            else:
                w_scale, h_scale = scale_factor[0], scale_factor[1]
                img_h = np.round(ori_shape[0] * h_scale.item()).astype(
                    np.int32)
                img_w = np.round(ori_shape[1] * w_scale.item()).astype(
                    np.int32)
            # Boxes must stay in the scaled frame, so the division below
            # becomes a no-op.
            scale_factor = 1.0

        if not isinstance(scale_factor, (float, torch.Tensor)):
            scale_factor = bboxes.new_tensor(scale_factor)
        bboxes = bboxes / scale_factor

        if torch.onnx.is_in_onnx_export():
            # TODO: Remove after F.grid_sample is supported.
            from torchvision.models.detection.roi_heads \
                import paste_masks_in_image
            masks = paste_masks_in_image(mask_pred, bboxes, ori_shape[:2])
            thr = rcnn_test_cfg.get('mask_thr_binary', 0)
            if thr > 0:
                masks = masks >= thr
            return masks

        N = len(mask_pred)
        # The actual implementation split the input into chunks,
        # and paste them chunk by chunk.
        if device.type == 'cpu':
            # CPU is most efficient when they are pasted one by one with
            # skip_empty=True, so that it performs minimal number of
            # operations.
            num_chunks = N
        else:
            # GPU benefits from parallelism for larger chunks,
            # but may have memory issue
            num_chunks = int(
                np.ceil(N * img_h * img_w * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
            assert (num_chunks <=
                    N), 'Default GPU_MEM_LIMIT is too small; try increasing it'
        chunks = torch.chunk(torch.arange(N, device=device), num_chunks)

        threshold = rcnn_test_cfg.mask_thr_binary
        # Output buffer holding one full-image mask per ROI.
        im_mask = torch.zeros(
            N,
            img_h,
            img_w,
            device=device,
            dtype=torch.bool if threshold >= 0 else torch.uint8)

        if not self.class_agnostic:
            # Keep only the channel of each ROI's predicted label.
            mask_pred = mask_pred[range(N), labels][:, None]

        for inds in chunks:
            masks_chunk, spatial_inds = _do_paste_mask(
                mask_pred[inds],
                bboxes[inds],
                img_h,
                img_w,
                skip_empty=device.type == 'cpu')

            if threshold >= 0:
                masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
            else:
                # for visualization and debugging
                masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)

            # Write the chunk into the region returned by _do_paste_mask.
            im_mask[(inds, ) + spatial_inds] = masks_chunk

        return im_mask


================================================
FILE: external/kitti_step_dvps.py
================================================
import os
import random
from typing import Dict, List

import copy

import mmcv
import numpy as np
import torch

from mmdet.datasets.builder import DATASETS
from mmdet.datasets.pipelines import Compose
from mmdet.datasets import CustomDataset
from mmdet.utils import get_root_logger

from external.dataset.mIoU import eval_miou


class SeqObj:
    """Hashable wrapper around a frame dict, identified by (seq_id, img_id).

    Two instances are equal when their ``seq_id`` and ``img_id`` match; any
    other keys of the wrapped dict are ignored for equality and hashing.
    """
    # This divisor is orthogonal with panoptic class-instance divisor.
    DIVISOR = 1000000

    def __init__(self, the_dict: Dict):
        """Wrap ``the_dict``; it must contain 'seq_id' and 'img_id'."""
        self.dict = the_dict
        assert 'seq_id' in self.dict and 'img_id' in self.dict

    def __hash__(self):
        """Return ``seq_id * DIVISOR + img_id``."""
        return self.dict['seq_id'] * self.DIVISOR + self.dict['img_id']

    def __eq__(self, other):
        """Compare by (seq_id, img_id); defer for non-SeqObj operands."""
        if not isinstance(other, SeqObj):
            # Let Python fall back to its default handling instead of raising
            # AttributeError on ``other.dict``.
            return NotImplemented
        return self.dict['seq_id'] == other.dict['seq_id'] and self.dict['img_id'] == other.dict['img_id']

    def __getitem__(self, attr):
        """Look up ``attr`` in the wrapped dict."""
        return self.dict[attr]


@DATASETS.register_module()
class KITTISTEPDVPSDataset:
    """KITTI-STEP frames grouped into short sequences for the pipeline.

    Images are discovered under ``<data_root>/video_sequence/<split>``; file
    names are expected to look like ``<seq_id>_<img_id>_...leftImg8bit...``.
    The panoptic annotation (and optional depth) path of a frame is derived
    by replacing ``leftImg8bit`` in the image path.

    Each sample is a list of frame dicts: the key frame followed by at most
    one reference frame whose offset is drawn from ``ref_seq_index``.
    """
    CLASSES = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole',
               'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky',
               'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
               'bicycle')

    def __init__(self,
                 pipeline=None,
                 data_root=None,
                 test_mode=False,
                 split='train',
                 ref_seq_index: List[int] = None,
                 is_instance_only: bool = True,
                 with_depth: bool = False
                 ):
        """Index the split directory and build the sample list.

        Args:
            pipeline (list[dict]): Pipeline config handed to ``Compose``.
            data_root (str): Dataset root; ``~`` is expanded. Required.
            test_mode (bool): If True, ``__getitem__`` uses the test path.
            split (str): Sub-directory of ``video_sequence`` to read.
            ref_seq_index (list[int] | None): Candidate frame offsets for the
                reference frame. None or empty means no reference frame.
            is_instance_only (bool): Stored and forwarded to the pipeline.
            with_depth (bool): If True, a depth path is recorded for every
                frame and must exist on disk.
        """
        assert data_root is not None
        data_root = os.path.expanduser(data_root)
        video_seq_dir = os.path.join(data_root, 'video_sequence', split)
        assert os.path.exists(video_seq_dir)
        # The path substitution below must only touch the file name.
        assert 'leftImg8bit' not in video_seq_dir

        self.num_thing_classes = 2
        self.num_stuff_classes = 17
        self.thing_before_stuff = False

        # ref_seq_index is None means no ref img
        if ref_seq_index is None:
            ref_seq_index = []

        img_names = sorted(
            name for name in map(str, os.listdir(video_seq_dir))
            if 'leftImg8bit' in name)

        images = []
        for item in img_names:
            seq_id, img_id, _ = item.split(sep="_", maxsplit=2)
            # NOTE(review): these four frames of sequence 1 are dropped only
            # when depth is requested - presumably their depth is unusable;
            # confirm against the dataset.
            if int(seq_id) == 1 and int(img_id) in [177, 178, 179, 180] and with_depth:
                continue
            item_full = os.path.join(video_seq_dir, item)
            images.append(SeqObj({
                'seq_id': int(seq_id),
                'img_id': int(img_id),
                'img': item_full,
                'depth': item_full.replace('leftImg8bit', 'depth') if with_depth else None,
                'ann': item_full.replace('leftImg8bit', 'panoptic'),
                # This should be modified carefully for each dataset. Usually 255.
                'no_obj_class': 255
            }))
            assert os.path.exists(images[-1]['img'])
            assert images[-1]['depth'] is None or os.path.exists(images[-1]['depth']), \
                "Missing depth : {}".format(images[-1]['depth'])
            # assert os.path.exists(images[-1]['ann'])

        # Lookup from SeqObj hash (seq_id, img_id) to the frame object.
        reference_images = {hash(image): image for image in images}
        sequences = []
        for img_cur in images:
            is_seq = True
            seq_now = [img_cur.dict]
            if ref_seq_index:
                # One offset is sampled per key frame, once, at construction.
                for index in random.choices(ref_seq_index, k=1):
                    query_obj = SeqObj({
                        'seq_id': img_cur.dict['seq_id'],
                        'img_id': img_cur.dict['img_id'] + index
                    })
                    if hash(query_obj) in reference_images:
                        seq_now.append(reference_images[hash(query_obj)].dict)
                    else:
                        # Reference frame missing: drop this key frame.
                        is_seq = False
                        break
            if is_seq:
                sequences.append(seq_now)

        self.sequences = sequences
        self.ref_seq_index = ref_seq_index

        # mmdet
        self.pipeline = Compose(pipeline)
        self.test_mode = test_mode

        # misc
        self.flag = self._set_groups()
        self.is_instance_only = is_instance_only

        # For evaluation
        self.max_ins = 10000
        self.no_obj_id = 255

    def pre_pipelines(self, results):
        """Add the per-frame fields read by the pipeline, in place.

        Args:
            results (list[dict]): Frame dicts of one sample.
        """
        # Thing class ids occupy [thing_lower, thing_upper).
        if self.thing_before_stuff:
            thing_lower = 0
            thing_upper = self.num_thing_classes
        else:
            thing_lower = self.num_stuff_classes
            thing_upper = self.num_stuff_classes + self.num_thing_classes
        for _results in results:
            _results['img_info'] = []
            _results['thing_lower'] = thing_lower
            _results['thing_upper'] = thing_upper
            _results['is_instance_only'] = self.is_instance_only
            _results['ori_filename'] = os.path.basename(_results['img'])

    def prepare_train_img(self, idx):
        """Get training data and annotations after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training data and annotation after pipeline with new keys \
                introduced by pipeline.
        """
        # Deep copy so the pipeline never mutates the stored sample.
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        return self.pipeline(results)

    def prepare_test_img(self, idx):
        """Get test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            The pipeline output for the sample; a single frame dict is fed to
            the pipeline when no reference offsets are configured.
        """
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        # During test time, one image inference does not requires seq
        if not self.ref_seq_index:
            results = results[0]
        return self.pipeline(results)

    def _rand_another(self, idx):
        """Get another random index from the same group as the given index."""
        pool = np.where(self.flag == self.flag[idx])[0]
        return np.random.choice(pool)

    # Copy and Modify from mmdet
    def __getitem__(self, idx):
        """Get training/test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training/test data (with annotation if `test_mode` is set \
                True).
        """

        if self.test_mode:
            return self.prepare_test_img(idx)
        # Resample from the same group until the pipeline yields a sample.
        while True:
            cur_data = self.prepare_train_img(idx)
            if cur_data is not None:
                return cur_data
            idx = self._rand_another(idx)

    def __len__(self):
        """Total number of samples of data."""
        return len(self.sequences)

    def _set_groups(self):
        """Put every sample in group 0."""
        return np.zeros((len(self)), dtype=np.int64)

    # The evaluate func
    def evaluate(
            self,
            results,
            **kwargs
    ):
        """Compute per-frame PQ, mIoU and (optionally) depth abs-rel error.

        Args:
            results (list[tuple]): One 5-tuple per sample:
                ``(bbox_results, mask_results, seg_results, depth,
                depth_final)``. ``seg_results`` is ``(inst_map, seg_info)``;
                depth is used only when the last element is not None.
                The arrays in ``results`` are left unmodified.

        Returns:
            dict: keys ``abs_rel``, ``abs_rel_final``, ``PQ``, ``Stuff PQ``,
                ``Things PQ`` and ``mIoU`` (the last four in percent).
        """
        num_classes = self.num_thing_classes + self.num_stuff_classes
        # Model thing index -> dataset class id (CLASSES[11] is 'person',
        # CLASSES[13] is 'car').
        thing_knet2real = [11, 13]
        pred_results_handled = []
        pred_depth = []
        pred_depth_final = []
        sem_preds = []
        for item in results:
            if item[-1] is not None:
                # With depth
                _, _, seg_results, depth, depth_final = item
                pred_depth.append(depth)
                pred_depth_final.append(depth_final)
            else:
                _, _, seg_results, _, _ = item
            # in seg_info id starts from 1
            inst_map, seg_info = seg_results
            # Work on a copy: stuff ids are zeroed below, and doing that on
            # the caller's array would change the outcome of a second call.
            inst_map = np.array(inst_map)
            # Pixels not covered by any segment keep the extra class id.
            cat_map = np.zeros_like(inst_map) + num_classes
            for instance in seg_info:
                cat_cur = instance['category_id']
                if instance['isthing']:
                    cat_cur = thing_knet2real[cat_cur]
                else:
                    if self.thing_before_stuff:
                        raise NotImplementedError
                    else:
                        # stuff starts from 1 in the model
                        cat_cur -= 1
                        # Shift past each thing id to recover the dataset id.
                        offset = 0
                        for thing_id in thing_knet2real:
                            if cat_cur + offset >= thing_id:
                                offset += 1
                        cat_cur += offset
                assert cat_cur < num_classes
                cat_map[inst_map == instance['id']] = cat_cur
                if not instance['isthing']:
                    # Stuff segments carry no instance id in the panoptic code.
                    inst_map[inst_map == instance['id']] = 0
            pred_results_handled.append(cat_map.astype(np.int32) * self.max_ins + inst_map.astype(np.int32))
            sem_preds.append(cat_map)

        gt_panseg = []
        gt_depth = []
        sem_targets = []
        for item in self.sequences:
            # Only for single
            item = item[0]
            # Only for single
            id_map = mmcv.imread(item['ann'], flag='color', channel_order='rgb')
            # R channel: semantic class; G * 256 + B: instance id.
            gt_semantic_seg = id_map[..., 0].astype(np.int32)
            sem_targets.append(gt_semantic_seg)
            gt_inst_map = id_map[..., 1].astype(np.int32) * 256 + id_map[..., 2].astype(np.int32)
            ps_id = gt_semantic_seg * self.max_ins + gt_inst_map
            gt_panseg.append(ps_id)
            if len(pred_depth) > 0:
                gt_depth_cur = mmcv.imread(item['depth'], flag='unchanged').astype(np.float32) / 256.
                gt_depth.append(gt_depth_cur)

        vpq_results = [
            vpq_eval([pred, gt])
            for pred, gt in zip(pred_results_handled, gt_panseg)
        ]

        # Sum the per-frame statistics and drop the extra (void) slot.
        iou_per_class = np.stack([result[0] for result in vpq_results]).sum(axis=0)[:num_classes]
        tp_per_class = np.stack([result[1] for result in vpq_results]).sum(axis=0)[:num_classes]
        fn_per_class = np.stack([result[2] for result in vpq_results]).sum(axis=0)[:num_classes]
        fp_per_class = np.stack([result[3] for result in vpq_results]).sum(axis=0)[:num_classes]

        abs_rels = []
        abs_rel_finals = []
        if len(pred_depth) > 0:
            for pred, pred_final, gt in zip(pred_depth, pred_depth_final, gt_depth):
                # Only pixels with positive ground-truth depth are scored.
                depth_mask = gt > 0.
                abs_rel_normal = np.mean(
                    np.abs(pred[depth_mask] - gt[depth_mask]) / gt[depth_mask])
                abs_rel_final = np.mean(
                    np.abs(pred_final[depth_mask] - gt[depth_mask]) / gt[depth_mask])
                abs_rels.append(abs_rel_normal)
                abs_rel_finals.append(abs_rel_final)
            abs_rel = np.stack(abs_rels).mean(axis=0)
            abs_rel_final = np.stack(abs_rel_finals).mean(axis=0)
        else:
            abs_rel = 0.
            abs_rel_final = 0.

        # calculate the PQs
        # Classes without any match produce NaN here; they are mapped to 0 by
        # nan_to_num when averaging below.
        epsilon = 0.
        sq = iou_per_class / (tp_per_class + epsilon)
        rq = tp_per_class / (tp_per_class + 0.5 *
                             fn_per_class + 0.5 * fp_per_class + epsilon)
        pq = sq * rq
        things_index = np.zeros((num_classes,)).astype(bool)
        things_index[thing_knet2real] = True
        stuff_pq = pq[np.logical_not(things_index)]
        things_pq = pq[things_index]

        miou_per_class = eval_miou(sem_preds, sem_targets, num_classes=num_classes)
        print("class        pq\t\tsq\t\trq\t\ttp\t\tfp\t\tfn\t\tmIoU")

        for i, class_name in enumerate(self.CLASSES):
            print("{}{}{:.3f}\t\t{:.3f}\t\t{:.3f}\t\t{:.0f}\t\t{:.0f}\t\t{:.0f}\t\t{:.3f}".format(
                class_name, ' ' * (13 - len(class_name)), pq[i], sq[i], rq[i], tp_per_class[i],
                fp_per_class[i], fn_per_class[i], miou_per_class[i]
            ))

        return {
            "abs_rel": abs_rel,
            "abs_rel_final": abs_rel_final,
            "PQ": np.nan_to_num(pq).mean() * 100,
            "Stuff PQ": np.nan_to_num(stuff_pq).mean() * 100,
            "Things PQ": np.nan_to_num(things_pq).mean() * 100,
            "mIoU": np.nan_to_num(miou_per_class).mean() * 100,
        }


def vpq_eval(element):
    """Accumulate panoptic-quality statistics for one prediction/GT pair.

    Both maps encode every pixel as ``category * 10000 + instance``. A
    prediction and a ground-truth segment of the same category are matched
    when their IoU exceeds 0.5; prediction pixels that fall on the void id
    (category 255, instance 0) are excluded from the union.

    Args:
        element (sequence): ``(pred_ids, gt_ids)``, two integer ndarrays of
            identical shape.

    Returns:
        tuple[np.ndarray]: ``(iou_per_class, tp_per_class, fn_per_class,
            fp_per_class)``, each a float64 array of length 20 indexed by
            category id.
    """
    pred_ids, gt_ids = element
    max_ins = 10000
    ign_id = 255
    offset = 2 ** 30
    num_cat = 19 + 1

    iou_per_class = np.zeros(num_cat, dtype=np.float64)
    tp_per_class = np.zeros(num_cat, dtype=np.float64)
    fn_per_class = np.zeros(num_cat, dtype=np.float64)
    fp_per_class = np.zeros(num_cat, dtype=np.float64)

    def _ids_to_counts(id_array):
        """Map each distinct id (as a Python int) to its pixel count."""
        ids, counts = np.unique(id_array, return_counts=True)
        # Python-int keys keep ``id * offset + pred_id`` in arbitrary
        # precision; with NumPy int32 scalar keys the result depends on the
        # NumPy promotion rules and can overflow.
        return dict(zip(ids.tolist(), counts))

    pred_areas = _ids_to_counts(pred_ids)
    gt_areas = _ids_to_counts(gt_ids)

    void_id = ign_id * max_ins
    ign_ids = {gt_id for gt_id in gt_areas if (gt_id // max_ins) == ign_id}

    # Encode every (gt, pred) pixel pair as a single int64 id.
    int_ids = gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64)
    int_areas = _ids_to_counts(int_ids)

    def prediction_void_overlap(pred_id):
        """Pixels of ``pred_id`` lying on the void ground-truth id."""
        void_int_id = void_id * offset + pred_id
        return int_areas.get(void_int_id, 0)

    def prediction_ignored_overlap(pred_id):
        """Pixels of ``pred_id`` lying on any ignored ground-truth segment."""
        total_ignored_overlap = 0
        for _ign_id in ign_ids:
            int_id = _ign_id * offset + pred_id
            total_ignored_overlap += int_areas.get(int_id, 0)
        return total_ignored_overlap

    gt_matched = set()
    pred_matched = set()

    for int_id, int_area in int_areas.items():
        gt_id = int_id // offset
        gt_cat = gt_id // max_ins
        pred_id = int_id % offset
        pred_cat = pred_id // max_ins
        if gt_cat != pred_cat:
            continue
        union = (
                gt_areas[gt_id] + pred_areas[pred_id] - int_area -
                prediction_void_overlap(pred_id)
        )
        iou = int_area / union
        if iou > 0.5:
            tp_per_class[gt_cat] += 1
            iou_per_class[gt_cat] += iou
            gt_matched.add(gt_id)
            pred_matched.add(pred_id)

    # Unmatched, non-ignored ground-truth segments are false negatives.
    for gt_id in gt_areas:
        if gt_id in gt_matched:
            continue
        cat_id = gt_id // max_ins
        if cat_id == ign_id:
            continue
        fn_per_class[cat_id] += 1

    # Unmatched predictions are false positives unless they lie mostly on
    # ignored ground truth.
    for pred_id in pred_areas:
        if pred_id in pred_matched:
            continue
        if (prediction_ignored_overlap(pred_id) / pred_areas[pred_id]) > 0.5:
            continue
        cat = pred_id // max_ins
        fp_per_class[cat] += 1

    return iou_per_class, tp_per_class, fn_per_class, fp_per_class


# Manual smoke test: build the validation split with depth and print every
# processed sample.
if __name__ == '__main__':
    # Imported for their side effects only - presumably they register the
    # pipeline transform types named below; confirm in those modules.
    import dataset.dvps_pipelines.loading
    import dataset.dvps_pipelines.transforms
    import dataset.pipelines.transforms
    import dataset.pipelines.formatting

    img_norm_cfg = dict(
        mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False)

    # NOTE: defined for reference but not used anywhere in this block.
    test_pipeline = [
        dict(type='LoadMultiImagesDirect'),
        dict(type='SeqPadWithDepth', size_divisor=32),
        dict(type='SeqNormalize', **img_norm_cfg),
        dict(
            type='VideoCollect',
            keys=['img']),
        dict(type='ConcatVideoReferences'),
        dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
    ]

    # test_mode is left at its default (False), so iteration below goes
    # through the training path of __getitem__.
    data = KITTISTEPDVPSDataset(
        pipeline=[
            dict(type='LoadMultiImagesDirect'),
            dict(type='LoadMultiAnnotationsDirect', with_depth=True, divisor=-1),
            dict(type='SeqFlipWithDepth', flip_ratio=0.5),
            dict(type='SeqPadWithDepth', size_divisor=32),
            dict(type='SeqNormalize', **img_norm_cfg),
            dict(
                type='VideoCollect',
                keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_depth']),
            dict(type='ConcatVideoReferences'),
            dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
        ],
        data_root=os.path.expanduser('~/datasets/kitti-step'),
        split='val',
        ref_seq_index=[-1, 1],
        with_depth=True,
    )
    # Print arrays/tensors compactly (shape and dtype instead of contents).
    # NOTE(review): np.set_string_function appears to have been removed in
    # NumPy 2.0 - confirm the NumPy version this script is meant to run on.
    np.set_string_function(lambda x: '<{} ; {}>'.format(x.shape, x.dtype))
    torch.set_printoptions(profile='short')
    for item in data:
        print(item)


================================================
FILE: external/panoptic_fpn.py
================================================
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors.two_stage import TwoStageDetector


@DETECTORS.register_module()
class PanopticFPN(TwoStageDetector):
    """Implementation of `Panoptic FPN <https://arxiv.org/abs/1901.02446>`_.

    A thin wrapper around :class:`TwoStageDetector`: every constructor
    argument is forwarded unchanged, and the only addition is the
    :attr:`with_semantic` capability flag.

    Args:
        backbone (dict): Backbone config.
        rpn_head (dict): RPN head config.
        roi_head (dict): RoI head config.
        train_cfg (dict): Training config.
        test_cfg (dict): Testing config.
        neck (dict, optional): Neck config. Defaults to None.
        pretrained (str, optional): Forwarded to the parent constructor.
            Defaults to None.
    """

    def __init__(self,
                 backbone,
                 rpn_head,
                 roi_head,
                 train_cfg,
                 test_cfg,
                 neck=None,
                 pretrained=None):
        super(PanopticFPN, self).__init__(
            backbone=backbone,
            neck=neck,
            rpn_head=rpn_head,
            roi_head=roi_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)

    @property
    def with_semantic(self):
        """bool: whether the detector has a semantic head.

        True when the RoI head reports ``with_semantic``, or when the
        detector itself owns a non-None ``semantic_head``.
        """
        # Not every RoI head defines ``with_semantic``; a missing attribute
        # means "no semantic branch" rather than an AttributeError escaping
        # from inside this property.
        roi_head = getattr(self, 'roi_head', None)
        if getattr(roi_head, 'with_semantic', False):
            return True
        return getattr(self, 'semantic_head', None) is not None


================================================
FILE: external/panoptic_head.py
================================================
import torch
from mmdet.core import bbox2result
from mmdet.models.builder import HEADS, build_head
from mmdet.models.roi_heads import StandardRoIHead


class PanopticTestMixin(object):
    """Test-time helpers for panoptic heads.

    The host class is expected to expose ``self.semantic_head``.
    """

    def simple_test_semantic(self, x, img_metas):
        """Predict one semantic segmentation result per image.

        Args:
            x: Features passed straight to ``self.semantic_head``.
            img_metas (list[dict]): Per-image meta dicts; ``'ori_shape'`` and
                ``'img_shape'`` are read from each.

        Returns:
            list: First element of ``semantic_head.get_semantic_seg(...)``
                for every image, in order.
        """
        seg_logits = self.semantic_head(x)
        # Decode image by image: each call receives a batch-of-one slice.
        return [
            self.semantic_head.get_semantic_seg(
                seg_logits[idx:idx + 1],
                meta['ori_shape'],
                meta['img_shape'])[0]
            for idx, meta in enumerate(img_metas)
        ]

    def generate_panoptic(self, det_bboxes, det_labels, mask_preds, sem_seg,
                          img_metas, merge_cfg):
        """Merge instance and semantic predictions image by image.

        Returns:
            list: One ``merge_stuff_thing`` result per entry of ``img_metas``.
        """
        return [
            merge_stuff_thing(det_bboxes[idx], det_labels[idx],
                              mask_preds[idx], sem_seg[idx], merge_cfg)
            for idx in range(len(img_metas))
        ]


@HEADS.register_module()
class PanopticHead(StandardRoIHead, PanopticTestMixin):
    """RoI head for panoptic segmentation.

    Extends :class:`StandardRoIHead` with a semantic segmentation head built
    from the ``semantic_head`` config.
    """

    def __init__(self, *args, semantic_head, **kwargs):
        super().__init__(*args, **kwargs)
        self.semantic_head = build_head(semantic_head)

    @property
    def with_semantic(self):
        """bool: whether the head has semantic head"""
        return (hasattr(self, 'semantic_head')
                and self.semantic_head is not None)

    def init_weights(self, pretrained):
        """Initialize the weights of this head and of the semantic head.

        Args:
            pretrained (str): Path to pre-trained weights, forwarded to the
                parent ``init_weights``.
        """
        super().init_weights(pretrained)
        if self.with_semantic:
            self.semantic_head.init_weights()

    def forward_train(self,
                      x,
                      img_metas,
                      proposal_list,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None):
        """Compute the bbox, mask and semantic losses.

        Args:
            x (list[Tensor]): list of multi-level img features.
            img_metas (list[dict]): list of image info dicts; ``'img_shape'``
                is read here to find the un-padded image extent.
            proposal_list (list[Tensors]): list of region proposals.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): class indices corresponding to each box
            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
                boxes can be ignored when computing the loss.
            gt_masks (None | Tensor) : true segmentation masks for each box
                used if the architecture supports a segmentation task.
            gt_semantic_seg (Tensor): semantic ground truth, indexed here as
                (image, channel, row, column). It is modified in place: the
                region beyond each image's ``img_shape`` is overwritten with
                the semantic head's ``ignore_label``.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        # Assign ground truth to proposals and sample them, image by image.
        if self.with_bbox or self.with_mask:
            num_imgs = len(img_metas)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None] * num_imgs
            sampling_results = []
            for img_idx in range(num_imgs):
                assigned = self.bbox_assigner.assign(
                    proposal_list[img_idx], gt_bboxes[img_idx],
                    gt_bboxes_ignore[img_idx], gt_labels[img_idx])
                sampled = self.bbox_sampler.sample(
                    assigned,
                    proposal_list[img_idx],
                    gt_bboxes[img_idx],
                    gt_labels[img_idx],
                    feats=[level[img_idx][None] for level in x])
                sampling_results.append(sampled)

        losses = {}

        # Box branch.
        if self.with_bbox:
            bbox_results = self._bbox_forward_train(
                x, sampling_results, gt_bboxes, gt_labels, img_metas)
            losses.update(bbox_results['loss_bbox'])

        # Mask branch; reuses the RoI features extracted by the box branch.
        if self.with_mask:
            mask_results = self._mask_forward_train(
                x, sampling_results, bbox_results['bbox_feats'], gt_masks,
                img_metas)
            losses.update(mask_results['loss_mask'])

        # Semantic branch.
        if self.with_semantic:
            for img_idx in range(gt_semantic_seg.shape[0]):
                ignore = self.semantic_head.ignore_label
                # Everything past the un-padded height / width is padding and
                # must not contribute to the loss.
                valid_h = img_metas[img_idx]['img_shape'][0]
                gt_semantic_seg[img_idx, :, valid_h:, :] = ignore
                valid_w = img_metas[img_idx]['img_shape'][1]
                gt_semantic_seg[img_idx, :, :, valid_w:] = ignore
            seg_preds = self.semantic_head(x)
            losses.update(self.semantic_head.loss(seg_preds, gt_semantic_seg))

        return losses

    async def async_simple_test(self,
                                x,
                                proposal_list,
                                img_metas,
                                proposals=None,
                                rescale=False):
        """Async test without augmentation (unsupported, always raises)."""
        raise NotImplementedError('PanopticHead does not support async test')

    def simple_test(self,
                    x,
                    proposal_list,
                    img_metas,
                    proposals=None,
                    rescale=False):
        """Test without augmentation.

        Returns:
            During ONNX export: ``(det_bboxes, det_labels)`` plus the mask
            results when a mask head exists. Otherwise: the bbox results
            alone when there is no mask head, else a list with one tuple per
            image holding ``(bbox, segm)`` or ``(bbox, segm, panoptic)``
            depending on whether a semantic head exists.
        """
        assert self.with_bbox, 'Bbox head must be implemented.'

        det_bboxes, det_labels = self.simple_test_bboxes(
            x, img_metas, proposal_list, self.test_cfg, rescale=rescale)

        # ONNX export returns raw tensors instead of per-class result lists.
        if torch.onnx.is_in_onnx_export():
            if not self.with_mask:
                return det_bboxes, det_labels
            segm_results = self.simple_test_mask(
                x, img_metas, det_bboxes, det_labels, rescale=rescale)
            return det_bboxes, det_labels, segm_results

        bbox_results = []
        for img_idx in range(len(det_bboxes)):
            bbox_results.append(
                bbox2result(det_bboxes[img_idx], det_labels[img_idx],
                            self.bbox_head.num_classes))

        if not self.with_mask:
            return bbox_results

        mask_preds = self.simple_test_mask(
            x, img_metas, det_bboxes, det_labels, rescale=rescale)
        segm_results = mask2result(mask_preds, det_labels,
                                   self.mask_head.num_classes)

        if not self.with_semantic:
            return list(zip(bbox_results, segm_results))

        sem_seg = self.simple_test_semantic(x, img_metas)
        panoptic_results = self.generate_panoptic(
            det_bboxes, det_labels, mask_preds, sem_seg, img_metas,
            self.test_cfg.merge_stuff_thing)
        return list(zip(bbox_results, segm_results, panoptic_results))


def mask2result(mask_preds, labels, num_classes):
    """Group predicted masks per class for every image.

    Args:
        mask_preds (list): One entry per image. An entry that is already a
            ``list`` is passed through untouched; otherwise it is indexed
            along its first dimension (one mask per detection).
        labels (list): Per-image class indices, one per detection.
        num_classes (int): Number of per-class buckets to create.

    Returns:
        list: For each image, a list of ``num_classes`` lists holding the
            masks (as numpy arrays) assigned to that class.
    """
    results = []
    for img_idx, img_masks in enumerate(mask_preds):
        if isinstance(img_masks, list):
            # Already in result format; nothing to convert.
            results.append(img_masks)
            continue
        per_class = [[] for _ in range(num_classes)]
        for det_idx in range(img_masks.shape[0]):
            mask_np = img_masks[det_idx].detach().cpu().numpy()
            per_class[labels[img_idx][det_idx]].append(mask_np)
        results.append(per_class)
    return results


def merge_stuff_thing(det_bboxes,
                      det_labels,
                      mask_preds,
                      sem_seg,
                      merge_cfg=None):
    """Fuse instance masks and a semantic map into one panoptic map.

    This function is modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/modeling/meta_arch/panoptic_fpn.py#L183  # noqa

    Instances are pasted in descending score order, then the remaining free
    pixels are filled with semantic ("stuff") segments.

    Args:
        det_bboxes  (torch.Tensor): Bounding boxes in shape (n, 5); the last
            column is used as the score.
        det_labels (torch.Tensor): Labels of bounding boxes in shape (n, ).
        mask_preds (torch.Tensor | list): Per-instance masks, indexed by
            instance. A ``list`` is treated as "no masks available".
        sem_seg (torch.Tensor): Semantic scores; the argmax over dim 0 gives
            the per-pixel semantic label.
        merge_cfg: Object exposing ``instance_score_thr``, ``iou_thr`` and
            ``stuff_max_area``.

    Returns:
        tuple: ``(panoptic_seg, segments_info)`` where ``panoptic_seg`` is an
            int32 numpy array of segment ids (0 = unassigned) and
            ``segments_info`` is a list of dicts describing each segment.
    """
    sem_seg = sem_seg.argmax(dim=0)
    scores = det_bboxes[:, -1]
    panoptic_seg = torch.zeros_like(sem_seg, dtype=torch.int32)

    instance_masks = None if isinstance(mask_preds, list) else mask_preds.to(
        dtype=torch.bool, device=panoptic_seg.device)

    next_id = 0
    segments_info = []

    # Paste instances from the most to the least confident one.
    for inst_id in torch.argsort(-scores):
        score = scores[inst_id].item()
        if score < merge_cfg.instance_score_thr:
            # Sorted by score, so every remaining instance is below too.
            break

        inst_mask = instance_masks[inst_id]  # H,W
        inst_area = inst_mask.sum().item()
        if inst_area == 0:
            continue

        taken = (inst_mask > 0) & (panoptic_seg > 0)
        taken_area = taken.sum().item()
        # Drop instances that are mostly covered by earlier segments.
        if taken_area * 1.0 / inst_area > merge_cfg.iou_thr:
            continue

        if taken_area > 0:
            # Keep only the part that is still free.
            inst_mask = inst_mask & (panoptic_seg == 0)

        next_id += 1
        panoptic_seg[inst_mask] = next_id
        segments_info.append(
            dict(
                id=next_id,
                isthing=True,
                score=score,
                category_id=det_labels[inst_id].item(),
                instance_id=inst_id.item()))

    # Fill whatever is still empty with semantic segments.
    for label in torch.unique(sem_seg).cpu().tolist():
        if label == 0:  # 0 is a special "thing" class
            continue
        region = (sem_seg == label) & (panoptic_seg == 0)
        region_area = region.sum().item()
        # Regions smaller than the configured area are discarded.
        if region_area < merge_cfg.stuff_max_area:
            continue

        next_id += 1
        panoptic_seg[region] = next_id
        segments_info.append(
            dict(
                id=next_id,
                isthing=False,
                category_id=label,
                area=region_area))

    return panoptic_seg.cpu().numpy(), segments_info


================================================
FILE: external/semantic_seg_head.py
================================================
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import kaiming_init
from mmcv.runner import auto_fp16, force_fp32
from mmdet.models.builder import HEADS, build_loss, build_neck
from mmdet.models.roi_heads.mask_heads import FusedSemanticHead


@HEADS.register_module()
class SemanticHead(FusedSemanticHead):
    """Semantic segmentation head that can be used in panoptic segmentation.

    Args:
        semantic_decoder (dict): Config dict of decoder.
            It usually is a neck, like semantic FPN.
        in_channels (int, optional): Input channels of the final 1x1
            classifier. Defaults to 256.
        num_classes (int, optional):  Number of semantic classes including
            the background. Defaults to 183.
        ignore_label (int, optional): Labels to be ignored. Defaults to 255.
        pred_stride (int, optional): Factor by which predictions are
            upsampled before the loss / decoding. Defaults to 4.
        loss_seg (dict, optional): Config dict of loss. Only its
            ``loss_weight`` is used by :meth:`loss`.
            Defaults to `dict(type='CrossEntropyLoss', use_sigmoid=False, \
            loss_weight=1.0)`.
        conv_cfg (dict, optional): Config of convolutional layers.
            Defaults to None.
        norm_cfg (dict, optional): Config of normalization layers.
            Defaults to None.
    """

    def __init__(self,
                 semantic_decoder,
                 in_channels=256,
                 num_classes=183,
                 ignore_label=255,
                 pred_stride=4,
                 loss_seg=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=False,
                     loss_weight=1.0),
                 conv_cfg=None,
                 norm_cfg=None):
        # Deliberately skips FusedSemanticHead.__init__ and initialises its
        # parent instead; this head builds its own layers below.
        super(FusedSemanticHead, self).__init__()

        # Plain configuration values.
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.ignore_label = ignore_label
        self.pred_stride = pred_stride
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.fp16_enabled = False

        # Learnable parts and the loss module.
        self.semantic_decoder = build_neck(semantic_decoder)
        self.conv_logits = nn.Conv2d(in_channels, num_classes, 1)
        self.loss_seg = build_loss(loss_seg)

    def init_weights(self):
        """Kaiming-initialise the final classifier convolution."""
        kaiming_init(self.conv_logits)

    @auto_fp16()
    def forward(self, feats):
        """Decode ``feats`` and return per-class logits."""
        decoded = self.semantic_decoder(feats)
        return self.conv_logits(decoded)

    @force_fp32(apply_to=('mask_pred', ))
    def loss(self, mask_pred, labels):
        """Cross-entropy loss on predictions upsampled by ``pred_stride``.

        Args:
            mask_pred (torch.Tensor): Logits returned by :meth:`forward`.
            labels (torch.Tensor): Ground truth; dimension 1 is squeezed
                away and the result is cast to ``long``.

        Returns:
            dict: ``{'loss_sem_seg': Tensor}``.
        """
        upsampled = F.interpolate(
            mask_pred,
            scale_factor=self.pred_stride,
            mode='bilinear',
            align_corners=False)
        target = labels.squeeze(1).long()
        # Pixels labelled ``ignore_label`` do not contribute to the mean.
        ce = F.cross_entropy(
            upsampled,
            target,
            reduction='mean',
            ignore_index=self.ignore_label)
        return dict(loss_sem_seg=self.loss_seg.loss_weight * ce)

    def get_semantic_seg(self, seg_preds, ori_shape, img_shape_withoutpad):
        """Obtain semantic segmentation map for panoptic segmentation.

        The prediction is upsampled by ``pred_stride``, cropped to
        ``img_shape_withoutpad`` and finally resized to ``ori_shape``.

        Args:
            seg_preds (torch.Tensor): Segmentation prediction (logits).
            ori_shape (tuple[int]): Target shape; only its first two entries
                (height, width) are used.
            img_shape_withoutpad (tuple[int]): Shape used to crop the
                upsampled prediction; the first two entries are read.

        Returns:
            torch.Tensor: Resized segmentation logits.
        """
        # Only supports a batch of one.
        valid_h = img_shape_withoutpad[0]
        valid_w = img_shape_withoutpad[1]
        full_res = F.interpolate(
            seg_preds,
            scale_factor=self.pred_stride,
            mode='bilinear',
            align_corners=False)
        cropped = full_res[:, :, 0:valid_h, 0:valid_w]
        return F.interpolate(
            cropped,
            size=ori_shape[0:2],
            mode='bilinear',
            align_corners=False)


================================================
FILE: external/semkitti_dvps.py
================================================
import os
from typing import Dict, List

import copy

import mmcv
import numpy as np
import random
import torch

from mmdet.datasets.builder import DATASETS
from mmdet.datasets.pipelines import Compose


class SeqObj:
    """Hashable wrapper around a frame-info dict.

    Two objects are equal when their ``'seq_id'`` and ``'img_id'`` match, and
    the hash is derived from the same two values, so instances can be used as
    dictionary keys or looked up by ``hash(obj)``.

    Args:
        the_dict (Dict): Frame information; must contain ``'seq_id'`` and
            ``'img_id'``.
    """
    # This divisor is orthogonal with panoptic class-instance divisor.
    DIVISOR = 1000000

    def __init__(self, the_dict: Dict):
        self.dict = the_dict
        assert 'seq_id' in self.dict and 'img_id' in self.dict

    def __hash__(self):
        """Combine sequence id and image id into a single integer."""
        return self.dict['seq_id'] * self.DIVISOR + self.dict['img_id']

    def __eq__(self, other):
        """Compare by ``(seq_id, img_id)``; defer for foreign types."""
        # Returning NotImplemented lets Python fall back to its default
        # comparison instead of raising AttributeError on ``other.dict``.
        if not isinstance(other, SeqObj):
            return NotImplemented
        return (self.dict['seq_id'] == other.dict['seq_id']
                and self.dict['img_id'] == other.dict['img_id'])

    def __getitem__(self, attr):
        """Dictionary-style access to the wrapped dict."""
        return self.dict[attr]


@DATASETS.register_module()
class KITTIDVPSDataset:
    """KITTI DVPS (depth-aware video panoptic segmentation) dataset.

    Frames are discovered in ``<data_root>/video_sequence/<split>``. Every
    frame is described by files named ``<seq>_<img>_leftImg8bit.png``,
    ``<seq>_<img>_gtFine_class.png``, ``<seq>_<img>_gtFine_instance.png``
    and a depth file whose name contains ``depth``.

    Args:
        pipeline (list[dict]): Processing pipeline passed to ``Compose``.
        data_root (str): Dataset root; ``~`` is expanded. Required.
        test_mode (bool): When True, annotation files are not required to
            exist and ``__getitem__`` uses the test preparation path.
        split (str): Sub-directory of ``video_sequence`` to read.
        ref_seq_index (List[int], optional): Candidate frame offsets for the
            reference frame. One offset is drawn at random per frame at
            construction time; frames whose reference does not exist are
            dropped. ``None`` means no reference frame.
        is_instance_only (bool): Forwarded to the pipeline through the
            result dicts.
    """
    CLASSES = (
        'car', 'bicycle', 'motorcycle', 'truck', 'other-vehicle', 'person', 'bicyclist', 'motorcyclist'
    )

    def __init__(self,
                 pipeline=None,
                 data_root=None,
                 test_mode=False,
                 split='train',
                 ref_seq_index: List[int] = None,
                 is_instance_only: bool = True,
                 ):
        assert data_root is not None
        data_root = os.path.expanduser(data_root)
        video_seq_dir = os.path.join(data_root, 'video_sequence', split)
        assert os.path.exists(video_seq_dir)
        assert 'leftImg8bit' not in video_seq_dir

        self.num_thing_classes = 8
        self.num_stuff_classes = 11
        self.thing_before_stuff = True

        # ref_seq_index is None means no ref img
        if ref_seq_index is None:
            ref_seq_index = []

        filenames = [str(name) for name in os.listdir(video_seq_dir)]
        depth_names = sorted(name for name in filenames if 'depth' in name)
        # No depth annotation: enumerate frames through the images instead.
        if not depth_names:
            depth_names = sorted(name for name in filenames if 'leftImg8bit' in name)

        images = []
        for item in depth_names:
            seq_id, img_id, _ = item.split(sep="_", maxsplit=2)
            item_full = os.path.join(video_seq_dir, item)
            images.append(SeqObj({
                'seq_id': int(seq_id),
                'img_id': int(img_id),
                'img': os.path.join(video_seq_dir, "{}_{}_{}.png".format(seq_id, img_id, 'leftImg8bit')),
                'depth': item_full,
                'ann_class': os.path.join(video_seq_dir, "{}_{}_{}.png".format(seq_id, img_id, 'gtFine_class')),
                'ann_inst': os.path.join(video_seq_dir, "{}_{}_{}.png".format(seq_id, img_id, 'gtFine_instance')),
                # This should be modified carefully for each dataset. Usually 255.
                'no_obj_class': 255
            }))
            assert os.path.exists(images[-1]['img'])
            if not test_mode:
                assert os.path.exists(images[-1]['depth'])
                assert os.path.exists(images[-1]['ann_class'])
                assert os.path.exists(images[-1]['ann_inst'])

        # Index frames by their (seq_id, img_id) hash to find references.
        reference_images = {hash(image): image for image in images}
        sequences = []
        for img_cur in images:
            is_seq = True
            seq_now = [img_cur.dict]
            if ref_seq_index:
                # Exactly one reference offset is drawn per frame.
                for index in random.choices(ref_seq_index, k=1):
                    query_obj = SeqObj({
                        'seq_id': img_cur.dict['seq_id'],
                        'img_id': img_cur.dict['img_id'] + index
                    })
                    if hash(query_obj) in reference_images:
                        seq_now.append(reference_images[hash(query_obj)].dict)
                    else:
                        # Reference frame missing: drop this sample.
                        is_seq = False
                        break
            if is_seq:
                sequences.append(seq_now)

        self.sequences = sequences
        self.ref_seq_index = ref_seq_index

        # mmdet
        self.pipeline = Compose(pipeline)
        self.test_mode = test_mode

        # misc
        self.flag = self._set_groups()
        self.is_instance_only = is_instance_only

        # For evaluation
        self.max_ins = 1000
        self.no_obj_id = 255

    def pre_pipelines(self, results):
        """Add the bookkeeping keys the pipeline expects to every frame dict.

        Args:
            results (list[dict]): Frame dicts of one sequence; modified in
                place.
        """
        for _results in results:
            _results['img_info'] = []
            _results['thing_lower'] = 0 if self.thing_before_stuff else self.num_stuff_classes
            _results['thing_upper'] = self.num_thing_classes \
                if self.thing_before_stuff else self.num_stuff_classes + self.num_thing_classes
            _results['is_instance_only'] = self.is_instance_only
            _results['ori_filename'] = os.path.basename(_results['img'])

    def prepare_train_img(self, idx):
        """Get training data and annotations after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training data and annotation after pipeline with new keys \
                introduced by pipeline.
        """
        # Deep copy so the pipeline cannot modify the stored sequence.
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        return self.pipeline(results)

    def prepare_test_img(self, idx):
        """Get test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            Output of the pipeline, fed with a single frame dict when no
            reference offsets are configured and with the whole sequence
            otherwise.
        """
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        # During test time, one image inference does not requires seq
        if not self.ref_seq_index:
            results = results[0]
        return self.pipeline(results)

    def _rand_another(self, idx):
        """Get another random index from the same group as the given index."""
        pool = np.where(self.flag == self.flag[idx])[0]
        return np.random.choice(pool)

    # Copy and Modify from mmdet
    def __getitem__(self, idx):
        """Get training/test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Test data if `test_mode` is True, otherwise training data
                with annotations. In training mode a sample for which the
                pipeline returns None is replaced by another random sample.
        """

        if self.test_mode:
            return self.prepare_test_img(idx)
        else:
            while True:
                cur_data = self.prepare_train_img(idx)
                if cur_data is None:
                    idx = self._rand_another(idx)
                    continue
                return cur_data

    def __len__(self):
        """Total number of samples of data."""
        return len(self.sequences)

    def _set_groups(self):
        """Put every sample into group 0."""
        return np.zeros((len(self)), dtype=np.int64)

    # The evaluate func
    def evaluate(
            self,
            results,
            **kwargs
    ):
        """Compute panoptic quality and depth error on the stored frames.

        Args:
            results (list[tuple]): One 5-tuple per sample:
                ``(bbox_results, mask_results, seg_results, depth,
                depth_final)``, where ``seg_results`` is
                ``(inst_map, seg_info)``. ``inst_map`` is modified in place
                (stuff segments are reset to instance id 0).
            **kwargs: Ignored.

        Returns:
            dict: ``abs_rel``, ``abs_rel_final``, ``PQ``, ``Stuff PQ`` and
                ``Things PQ`` (PQ values in percent, NaN counted as 0).
        """
        thing_lower = 0 if self.thing_before_stuff else self.num_stuff_classes
        thing_upper = self.num_thing_classes \
            if self.thing_before_stuff else self.num_stuff_classes + self.num_thing_classes
        pred_results_handled = []
        pred_depth = []
        pred_depth_final = []
        for item in results:
            bbox_results, mask_results, seg_results, depth, depth_final = item
            pred_depth.append(depth)
            pred_depth_final.append(depth_final)
            # in seg_info id starts from 1
            inst_map, seg_info = seg_results
            # Pixels not covered by any segment get the extra "no class" id.
            cat_map = np.zeros_like(inst_map) + self.num_thing_classes + self.num_stuff_classes
            for instance in seg_info:
                cat_cur = instance['category_id']
                if instance['isthing']:
                    cat_cur += thing_lower
                else:
                    if self.thing_before_stuff:
                        cat_cur = cat_cur - 1 + thing_upper
                    else:
                        # stuff starts from 1 in the model
                        cat_cur -= 1
                assert cat_cur < self.num_thing_classes + self.num_stuff_classes
                cat_map[inst_map == instance['id']] = cat_cur
                if not instance['isthing']:
                    inst_map[inst_map == instance['id']] = 0
            # Same category * 10000 + instance encoding as the ground truth.
            pred_results_handled.append(cat_map.astype(np.int32) * 10000 + inst_map.astype(np.int32))

        gt_panseg = []
        gt_depth = []
        for item in self.sequences:
            # Only for single
            item = item[0]
            # Only for single
            cat_id = mmcv.imread(item['ann_class'], flag='unchanged').astype(np.int32)
            inst_id = mmcv.imread(item['ann_inst'], flag='unchanged').astype(np.int32)
            ps_id = cat_id * 10000 + inst_id
            gt_panseg.append(ps_id)
            gt_depth_cur = mmcv.imread(item['depth'], flag='unchanged').astype(np.float32) / 256.
            gt_depth.append(gt_depth_cur)

        vpq_results = []
        for pred, gt in zip(pred_results_handled, gt_panseg):
            vpq_result = vpq_eval([pred, gt])
            vpq_results.append(vpq_result)

        # Sum the per-frame statistics and drop the trailing "no class" slot.
        iou_per_class = np.stack([result[0] for result in vpq_results]).sum(axis=0)[
                        :self.num_thing_classes + self.num_stuff_classes]
        tp_per_class = np.stack([result[1] for result in vpq_results]).sum(axis=0)[
                       :self.num_thing_classes + self.num_stuff_classes]
        fn_per_class = np.stack([result[2] for result in vpq_results]).sum(axis=0)[
                       :self.num_thing_classes + self.num_stuff_classes]
        fp_per_class = np.stack([result[3] for result in vpq_results]).sum(axis=0)[
                       :self.num_thing_classes + self.num_stuff_classes]

        abs_rels = []
        abs_rel_finals = []
        for pred, pred_final, gt in zip(pred_depth, pred_depth_final, gt_depth):
            # Only pixels with a positive ground-truth depth are scored.
            depth_mask = gt > 0.
            abs_rel_normal = np.mean(
                np.abs(
                    pred[depth_mask] -
                    gt[depth_mask]) /
                gt[depth_mask])
            abs_rel_final = np.mean(
                np.abs(
                    pred_final[depth_mask] -
                    gt[depth_mask]) /
                gt[depth_mask])
            abs_rels.append(abs_rel_normal)
            abs_rel_finals.append(abs_rel_final)
        abs_rel = np.stack(abs_rels).mean(axis=0)
        abs_rel_final = np.stack(abs_rel_finals).mean(axis=0)

        # calculate the PQs
        epsilon = 0.
        sq = iou_per_class / (tp_per_class + epsilon)
        rq = tp_per_class / (tp_per_class + 0.5 *
                             fn_per_class + 0.5 * fp_per_class + epsilon)
        print("tp per class")
        print(tp_per_class)
        print("fp per class")
        print(fp_per_class)
        print("fn per class")
        print(fn_per_class)

        pq = sq * rq
        print("PQ")
        print(pq[:thing_upper])
        print(pq[thing_upper:])
        print("SQ")
        print(sq)
        print("RQ")
        print(rq)
        # Thing classes occupy [thing_lower, thing_upper); all remaining
        # class slots are stuff.
        things_pq = pq[thing_lower:thing_upper]
        stuff_pq = np.concatenate([pq[:thing_lower], pq[thing_upper:]])

        return {
            "abs_rel": abs_rel,
            "abs_rel_final": abs_rel_final,
            "PQ": np.nan_to_num(pq).mean() * 100,
            "Stuff PQ": np.nan_to_num(stuff_pq).mean() * 100,
            "Things PQ": np.nan_to_num(things_pq).mean() * 100,
        }


def vpq_eval(element):
    """Accumulate panoptic-quality statistics for one prediction/GT pair.

    Both maps encode every pixel as ``category * 10000 + instance``. A
    predicted and a ground-truth segment of the same category match when
    their IoU exceeds 0.5; pixels where the prediction overlaps the void
    ground-truth segment are excluded from the union.

    Args:
        element (sequence): ``(pred_ids, gt_ids)``, two integer numpy arrays
            of identical shape.

    Returns:
        tuple[np.ndarray]: ``(iou_per_class, tp_per_class, fn_per_class,
            fp_per_class)``, each a float64 array with 20 entries (19
            classes plus one extra slot).
    """
    pred_ids, gt_ids = element
    max_ins = 10000
    ign_id = 255
    offset = 2 ** 30
    num_cat = 19 + 1

    iou_per_class = np.zeros(num_cat, dtype=np.float64)
    tp_per_class = np.zeros(num_cat, dtype=np.float64)
    fn_per_class = np.zeros(num_cat, dtype=np.float64)
    fp_per_class = np.zeros(num_cat, dtype=np.float64)

    def _ids_to_counts(id_array):
        """Map every distinct id in ``id_array`` to its pixel count."""
        ids, counts = np.unique(id_array, return_counts=True)
        # ``tolist`` yields plain Python ints. Keeping NumPy scalars as keys
        # would make ``key * offset`` below run in the array's dtype, which
        # can overflow for int32 inputs depending on NumPy's scalar
        # promotion rules.
        return dict(zip(ids.tolist(), counts.tolist()))

    pred_areas = _ids_to_counts(pred_ids)
    gt_areas = _ids_to_counts(gt_ids)

    void_id = ign_id * max_ins
    # Every ground-truth segment whose category is the ignore label.
    ign_ids = {gt_id for gt_id in gt_areas if gt_id // max_ins == ign_id}

    # Encode each (gt, pred) pixel pair as a single int64 to count overlaps.
    int_ids = gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64)
    int_areas = _ids_to_counts(int_ids)

    def prediction_void_overlap(pred_id):
        """Pixels of ``pred_id`` lying on the void ground-truth segment."""
        return int_areas.get(void_id * offset + pred_id, 0)

    def prediction_ignored_overlap(pred_id):
        """Pixels of ``pred_id`` lying on any ignored ground-truth segment."""
        return sum(
            int_areas.get(_ign_id * offset + pred_id, 0)
            for _ign_id in ign_ids)

    gt_matched = set()
    pred_matched = set()

    # True positives: same category and IoU above 0.5.
    for int_id, int_area in int_areas.items():
        gt_id, pred_id = divmod(int_id, offset)
        gt_cat = gt_id // max_ins
        pred_cat = pred_id // max_ins
        if gt_cat != pred_cat:
            continue
        union = (
                gt_areas[gt_id] + pred_areas[pred_id] - int_area -
                prediction_void_overlap(pred_id)
        )
        iou = int_area / union
        if iou > 0.5:
            tp_per_class[gt_cat] += 1
            iou_per_class[gt_cat] += iou
            gt_matched.add(gt_id)
            pred_matched.add(pred_id)

    # False negatives: unmatched ground-truth segments (ignored ones excluded).
    for gt_id in gt_areas:
        if gt_id in gt_matched:
            continue
        cat_id = gt_id // max_ins
        if cat_id == ign_id:
            continue
        fn_per_class[cat_id] += 1

    # False positives: unmatched predictions, unless more than half of their
    # area lies on ignored ground truth.
    for pred_id, pred_area in pred_areas.items():
        if pred_id in pred_matched:
            continue
        if (prediction_ignored_overlap(pred_id) / pred_area) > 0.5:
            continue
        cat = pred_id // max_ins
        fp_per_class[cat] += 1

    return iou_per_class, tp_per_class, fn_per_class, fp_per_class


if __name__ == '__main__':
    # Manual smoke test: build the KITTI DVPS validation split with a
    # training-style pipeline and print every sample it yields.
    #
    # The imports below are used only for their import-time side effects
    # (presumably registering the pipeline transforms referenced by name in
    # the config that follows -- confirm against those modules).
    import dataset.dvps_pipelines.loading
    import dataset.dvps_pipelines.transforms
    import dataset.pipelines.formatting

    img_norm_cfg = {
        'mean': [123.675, 116.28, 103.53],
        'std': [58.395, 57.12, 57.375],
        'to_rgb': False,
    }

    demo_pipeline = [
        {'type': 'LoadMultiImagesDirect'},
        {'type': 'LoadMultiAnnotationsDirect', 'with_depth': True, 'divisor': 0},
        {
            'type': 'SeqResizeWithDepth',
            'img_scale': (1024, 2048),
            'ratio_range': [1.0, 2.0],
            'keep_ratio': True,
        },
        {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
        {'type': 'SeqRandomCropWithDepth', 'crop_size': (1024, 2048), 'share_params': True},
        {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
        {'type': 'SeqPadWithDepth', 'size_divisor': 32},
        {
            'type': 'VideoCollect',
            'keys': ['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_depth',
                     'gt_instance_ids'],
        },
        {'type': 'ConcatVideoReferences'},
        {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
    ]

    data = KITTIDVPSDataset(
        pipeline=demo_pipeline,
        data_root=os.path.expanduser('~/datasets/kitti-dvps'),
        split='val',
        ref_seq_index=[-1, 1],
    )

    # Print arrays/tensors compactly (shape and dtype only) to keep the
    # output readable.
    np.set_string_function(lambda arr: '<{} ; {}>'.format(arr.shape, arr.dtype))
    torch.set_printoptions(profile='short')
    for item in data:
        print(item)


================================================
FILE: external/test.py
================================================
import os.path as osp
import time

import mmcv
import torch
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info
from mmdet.apis.test import collect_results_cpu, collect_results_gpu
from mmdet.core import encode_mask_results
from .utils import encode_panoptic


def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3):
    """Run inference on a single GPU, optionally visualising predictions.

    Args:
        model (nn.Module): Model to evaluate. It is called as
            ``model(return_loss=False, rescale=True, **data)``; for
            visualisation ``model.module.show_result`` is used.
        data_loader: Iterable of batches whose ``dataset`` attribute
            supports ``len()`` (used to size the progress bar).
        show (bool): Enables the visualisation branch and is forwarded to
            ``show_result``.
        out_dir (str | None): When set, visualisations are written to
            ``<out_dir>/<ori_filename>``.
        show_score_thr (float): Forwarded to ``show_result`` as
            ``score_thr``.

    Returns:
        list: Per-sample results of all batches, in iteration order. For
            tuple results: 2-tuples get their mask part encoded, 5-tuples
            are passed through as they are, and every other tuple is
            unpacked as a 3-tuple whose mask and panoptic parts are
            encoded.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))

    for data in data_loader:
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
        batch_size = len(result)

        if show or out_dir:
            first_img = data['img'][0]
            if batch_size == 1 and isinstance(first_img, torch.Tensor):
                img_tensor = first_img
            else:
                img_tensor = first_img.data[0]
            img_metas = data['img_metas'][0].data[0]
            imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
            assert len(imgs) == len(img_metas)

            for sample_idx, (img, img_meta) in enumerate(zip(imgs, img_metas)):
                # Crop to the valid image area, then resize to the original
                # image size.
                h, w, _ = img_meta['img_shape']
                cropped = img[:h, :w, :]
                ori_h, ori_w = img_meta['ori_shape'][:-1]
                img_show = mmcv.imresize(cropped, (ori_w, ori_h))

                out_file = None
                if out_dir:
                    out_file = osp.join(out_dir, img_meta['ori_filename'])

                model.module.show_result(
                    img_show,
                    result[sample_idx],
                    show=show,
                    out_file=out_file,
                    score_thr=show_score_thr)

        # encode mask results; the layout is inferred from the first sample
        if isinstance(result[0], tuple):
            num_fields = len(result[0])
            if num_fields == 2:
                result = [(bboxes, encode_mask_results(masks))
                          for bboxes, masks in result]
            elif num_fields == 5:
                # Supporting depth here: nothing is encoded for this layout.
                result = [(bboxes, masks, segs, depth, depth_final)
                          for bboxes, masks, segs, depth, depth_final
                          in result]
            else:
                result = [(bboxes, encode_mask_results(masks),
                           encode_panoptic(segs))
                          for bboxes, masks, segs in result]
        results.extend(result)

        for _ in range(batch_size):
            prog_bar.update()
    return results


def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Run inference on multiple GPUs and merge the per-rank results.

    Results can be gathered in two ways. With ``gpu_collect=True`` they are
    encoded to gpu tensors and exchanged through gpu communication;
    otherwise every rank saves its results under ``tmpdir`` and the rank 0
    worker collects them.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect
            results.

    Returns:
        list: The prediction results.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    is_main_rank = rank == 0
    if is_main_rank:
        prog_bar = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent deadlock problem in some cases.

    for data in data_loader:
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
            # encode mask results; the layout is inferred from the first
            # sample of the batch
            if isinstance(result[0], tuple):
                num_fields = len(result[0])
                if num_fields == 2:
                    result = [(bboxes, encode_mask_results(masks))
                              for bboxes, masks in result]
                elif num_fields == 5:
                    # Supporting depth here: nothing is encoded for this
                    # layout.
                    result = [(bboxes, masks, segs, depth, depth_final)
                              for bboxes, masks, segs, depth, depth_final
                              in result]
                else:
                    result = [(bboxes, encode_mask_results(masks),
                               encode_panoptic(segs))
                              for bboxes, masks, segs in result]
        results.extend(result)

        if is_main_rank:
            # Only rank 0 owns the bar; it advances by this rank's batch
            # size multiplied by the world size.
            for _ in range(len(result) * world_size):
                prog_bar.update()

    # collect results from all ranks
    if gpu_collect:
        return collect_results_gpu(results, len(dataset))
    return collect_results_cpu(results, len(dataset), tmpdir)


================================================
FILE: external/train.py
================================================
import warnings

import torch
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
                         Fp16OptimizerHook, OptimizerHook, build_optimizer,
                         build_runner)
from mmcv.utils import build_from_cfg
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.utils import get_root_logger

from external.evalhooks import DistEvalHook, EvalHook


def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    """Assemble data loaders, runner and hooks from ``cfg`` and run training.

    Args:
        model (nn.Module): Model to train. It is moved to GPU and wrapped in
            ``MMDistributedDataParallel`` (distributed) or
            ``MMDataParallel`` (otherwise).
        dataset: A dataset or a list/tuple of datasets; one training data
            loader is built per entry.
        cfg: Config object. Read here: ``log_level``, ``data``, ``gpu_ids``,
            ``seed``, ``optimizer``, ``optimizer_config``, ``lr_config``,
            ``checkpoint_config``, ``log_config``, ``work_dir``,
            ``resume_from``, ``load_from``, ``workflow`` and the optional
            ``runner``, ``total_epochs``, ``fp16``, ``momentum_config``,
            ``evaluation``, ``custom_hooks``, ``find_unused_parameters``.
            Note that ``cfg`` is modified in place (e.g.
            ``data.samples_per_gpu``, ``runner``).
        distributed (bool): Selects the distributed model wrapper, data
            loaders and evaluation hook.
        validate (bool): Whether to register an evaluation hook built from
            ``cfg.data.val``.
        timestamp: Stored on the runner as ``runner.timestamp``.
        meta: Forwarded to the runner as ``meta``.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' not in cfg.data:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        else:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        # The deprecated key wins whenever it is present.
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = []
    for ds in dataset:
        loader = build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed)
        data_loaders.append(loader)

    # put model on gpus
    if not distributed:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
    else:
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        unused_params_flag = cfg.get('find_unused_parameters', False)
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=unused_params_flag)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' in cfg:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs
    else:
        # Legacy configs: synthesise an epoch-based runner section.
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)

    runner_defaults = dict(
        model=model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    runner = build_runner(cfg.runner, default_args=runner_defaults)

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting: pick the optimizer hook (or leave the raw config)
    optimizer_config = cfg.optimizer_config
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed and isinstance(runner, EpochBasedRunner):
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_batch_size = cfg.data.val.pop('samples_per_gpu', 1)
        if val_batch_size > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_batch_size,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        if distributed:
            eval_hook_cls = DistEvalHook
        else:
            eval_hook_cls = EvalHook
        runner.register_hook(eval_hook_cls(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            # Work on a copy so that popping 'priority' leaves cfg intact.
            hook_kwargs = hook_cfg.copy()
            priority = hook_kwargs.pop('priority', 'NORMAL')
            runner.register_hook(
                build_from_cfg(hook_kwargs, HOOKS), priority=priority)

    # resuming takes precedence over plain checkpoint loading
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)


================================================
FILE: external/utils.py
================================================
import io

from panopticapi.utils import id2rgb
from PIL import Image


def encode_panoptic(panoptic_results):
    """Serialise a panoptic result to PNG bytes.

    Args:
        panoptic_results (tuple): ``(panoptic_img, segments_info)`` pair.

    Returns:
        tuple: ``(png_bytes, segments_info)`` where ``png_bytes`` is the
            in-memory PNG encoding of ``id2rgb(panoptic_img)`` and
            ``segments_info`` is returned untouched.
    """
    pan_map, seg_infos = panoptic_results
    rgb_image = Image.fromarray(id2rgb(pan_map))
    with io.BytesIO() as buffer:
        rgb_image.save(buffer, format='PNG')
        png_bytes = buffer.getvalue()
    return png_bytes, seg_infos


================================================
FILE: external/vipseg_dvps.py
================================================
import os
import random
from typing import Dict, List

import copy

import mmcv
import numpy as np
import torch

from mmdet.datasets.builder import DATASETS
from mmdet.datasets.pipelines import Compose
from mmdet.utils import get_root_logger

# Master category table: ids run from 0 to 123; `isthing` marks the
# thing (1) vs. stuff (0) categories.
CLASSES = [
    {"id": 0, "name": "wall", "isthing": 0, "color": [120, 120, 120]},
    {"id": 1, "name": "ceiling", "isthing": 0, "color": [180, 120, 120]},
    {"id": 2, "name": "door", "isthing": 1, "color": [6, 230, 230]},
    {"id": 3, "name": "stair", "isthing": 0, "color": [80, 50, 50]},
    {"id": 4, "name": "ladder", "isthing": 1, "color": [4, 200, 3]},
    {"id": 5, "name": "escalator", "isthing": 0, "color": [120, 120, 80]},
    {"id": 6, "name": "Playground_slide", "isthing": 0, "color": [140, 140, 140]},
    {"id": 7, "name": "handrail_or_fence", "isthing": 0, "color": [204, 5, 255]},
    {"id": 8, "name": "window", "isthing": 1, "color": [230, 230, 230]},
    {"id": 9, "name": "rail", "isthing": 0, "color": [4, 250, 7]},
    {"id": 10, "name": "goal", "isthing": 1, "color": [224, 5, 255]},
    {"id": 11, "name": "pillar", "isthing": 0, "color": [235, 255, 7]},
    {"id": 12, "name": "pole", "isthing": 0, "color": [150, 5, 61]},
    {"id": 13, "name": "floor", "isthing": 0, "color": [120, 120, 70]},
    {"id": 14, "name": "ground", "isthing": 0, "color": [8, 255, 51]},
    {"id": 15, "name": "grass", "isthing": 0, "color": [255, 6, 82]},
    {"id": 16, "name": "sand", "isthing": 0, "color": [143, 255, 140]},
    {"id": 17, "name": "athletic_field", "isthing": 0, "color": [204, 255, 4]},
    {"id": 18, "name": "road", "isthing": 0, "color": [255, 51, 7]},
    {"id": 19, "name": "path", "isthing": 0, "color": [204, 70, 3]},
    {"id": 20, "name": "crosswalk", "isthing": 0, "color": [0, 102, 200]},
    {"id": 21, "name": "building", "isthing": 0, "color": [61, 230, 250]},
    {"id": 22, "name": "house", "isthing": 0, "color": [255, 6, 51]},
    {"id": 23, "name": "bridge", "isthing": 0, "color": [11, 102, 255]},
    {"id": 24, "name": "tower", "isthing": 0, "color": [255, 7, 71]},
    {"id": 25, "name": "windmill", "isthing": 0, "color": [255, 9, 224]},
    {"id": 26, "name": "well_or_well_lid", "isthing": 0, "color": [9, 7, 230]},
    {"id": 27, "name": "other_construction", "isthing": 0, "color": [220, 220, 220]},
    {"id": 28, "name": "sky", "isthing": 0, "color": [255, 9, 92]},
    {"id": 29, "name": "mountain", "isthing": 0, "color": [112, 9, 255]},
    {"id": 30, "name": "stone", "isthing": 0, "color": [8, 255, 214]},
    {"id": 31, "name": "wood", "isthing": 0, "color": [7, 255, 224]},
    {"id": 32, "name": "ice", "isthing": 0, "color": [255, 184, 6]},
    {"id": 33, "name": "snowfield", "isthing": 0, "color": [10, 255, 71]},
    {"id": 34, "name": "grandstand", "isthing": 0, "color": [255, 41, 10]},
    {"id": 35, "name": "sea", "isthing": 0, "color": [7, 255, 255]},
    {"id": 36, "name": "river", "isthing": 0, "color": [224, 255, 8]},
    {"id": 37, "name": "lake", "isthing": 0, "color": [102, 8, 255]},
    {"id": 38, "name": "waterfall", "isthing": 0, "color": [255, 61, 6]},
    {"id": 39, "name": "water", "isthing": 0, "color": [255, 194, 7]},
    {"id": 40, "name": "billboard_or_Bulletin_Board", "isthing": 0, "color": [255, 122, 8]},
    {"id": 41, "name": "sculpture", "isthing": 1, "color": [0, 255, 20]},
    {"id": 42, "name": "pipeline", "isthing": 0, "color": [255, 8, 41]},
    {"id": 43, "name": "flag", "isthing": 1, "color": [255, 5, 153]},
    {"id": 44, "name": "parasol_or_umbrella", "isthing": 1, "color": [6, 51, 255]},
    {"id": 45, "name": "cushion_or_carpet", "isthing": 0, "color": [235, 12, 255]},
    {"id": 46, "name": "tent", "isthing": 1, "color": [160, 150, 20]},
    {"id": 47, "name": "roadblock", "isthing": 1, "color": [0, 163, 255]},
    {"id": 48, "name": "car", "isthing": 1, "color": [140, 140, 140]},
    {"id": 49, "name": "bus", "isthing": 1, "color": [250, 10, 15]},
    {"id": 50, "name": "truck", "isthing": 1, "color": [20, 255, 0]},
    {"id": 51, "name": "bicycle", "isthing": 1, "color": [31, 255, 0]},
    {"id": 52, "name": "motorcycle", "isthing": 1, "color": [255, 31, 0]},
    {"id": 53, "name": "wheeled_machine", "isthing": 0, "color": [255, 224, 0]},
    {"id": 54, "name": "ship_or_boat", "isthing": 1, "color": [153, 255, 0]},
    {"id": 55, "name": "raft", "isthing": 1, "color": [0, 0, 255]},
    {"id": 56, "name": "airplane", "isthing": 1, "color": [255, 71, 0]},
    {"id": 57, "name": "tyre", "isthing": 0, "color": [0, 235, 255]},
    {"id": 58, "name": "traffic_light", "isthing": 0, "color": [0, 173, 255]},
    {"id": 59, "name": "lamp", "isthing": 0, "color": [31, 0, 255]},
    {"id": 60, "name": "person", "isthing": 1, "color": [11, 200, 200]},
    {"id": 61, "name": "cat", "isthing": 1, "color": [255, 82, 0]},
    {"id": 62, "name": "dog", "isthing": 1, "color": [0, 255, 245]},
    {"id": 63, "name": "horse", "isthing": 1, "color": [0, 61, 255]},
    {"id": 64, "name": "cattle", "isthing": 1, "color": [0, 255, 112]},
    {"id": 65, "name": "other_animal", "isthing": 1, "color": [0, 255, 133]},
    {"id": 66, "name": "tree", "isthing": 0, "color": [255, 0, 0]},
    {"id": 67, "name": "flower", "isthing": 0, "color": [255, 163, 0]},
    {"id": 68, "name": "other_plant", "isthing": 0, "color": [255, 102, 0]},
    {"id": 69, "name": "toy", "isthing": 0, "color": [194, 255, 0]},
    {"id": 70, "name": "ball_net", "isthing": 0, "color": [0, 143, 255]},
    {"id": 71, "name": "backboard", "isthing": 0, "color": [51, 255, 0]},
    {"id": 72, "name": "skateboard", "isthing": 1, "color": [0, 82, 255]},
    {"id": 73, "name": "bat", "isthing": 0, "color": [0, 255, 41]},
    {"id": 74, "name": "ball", "isthing": 1, "color": [0, 255, 173]},
    {"id": 75, "name": "cupboard_or_showcase_or_storage_rack", "isthing": 0, "color": [10, 0, 255]},
    {"id": 76, "name": "box", "isthing": 1, "color": [173, 255, 0]},
    {"id": 77, "name": "traveling_case_or_trolley_case", "isthing": 1, "color": [0, 255, 153]},
    {"id": 78, "name": "basket", "isthing": 1, "color": [255, 92, 0]},
    {"id": 79, "name": "bag_or_package", "isthing": 1, "color": [255, 0, 255]},
    {"id": 80, "name": "trash_can", "isthing": 0, "color": [255, 0, 245]},
    {"id": 81, "name": "cage", "isthing": 0, "color": [255, 0, 102]},
    {"id": 82, "name": "plate", "isthing": 1, "color": [255, 173, 0]},
    {"id": 83, "name": "tub_or_bowl_or_pot", "isthing": 1, "color": [255, 0, 20]},
    {"id": 84, "name": "bottle_or_cup", "isthing": 1, "color": [255, 184, 184]},
    {"id": 85, "name": "barrel", "isthing": 1, "color": [0, 31, 255]},
    {"id": 86, "name": "fishbowl", "isthing": 1, "color": [0, 255, 61]},
    {"id": 87, "name": "bed", "isthing": 1, "color": [0, 71, 255]},
    {"id": 88, "name": "pillow", "isthing": 1, "color": [255, 0, 204]},
    {"id": 89, "name": "table_or_desk", "isthing": 1, "color": [0, 255, 194]},
    {"id": 90, "name": "chair_or_seat", "isthing": 1, "color": [0, 255, 82]},
    {"id": 91, "name": "bench", "isthing": 1, "color": [0, 10, 255]},
    {"id": 92, "name": "sofa", "isthing": 1, "color": [0, 112, 255]},
    {"id": 93, "name": "shelf", "isthing": 0, "color": [51, 0, 255]},
    {"id": 94, "name": "bathtub", "isthing": 0, "color": [0, 194, 255]},
    {"id": 95, "name": "gun", "isthing": 1, "color": [0, 122, 255]},
    {"id": 96, "name": "commode", "isthing": 1, "color": [0, 255, 163]},
    {"id": 97, "name": "roaster", "isthing": 1, "color": [255, 153, 0]},
    {"id": 98, "name": "other_machine", "isthing": 0, "color": [0, 255, 10]},
    {"id": 99, "name": "refrigerator", "isthing": 1, "color": [255, 112, 0]},
    {"id": 100, "name": "washing_machine", "isthing": 1, "color": [143, 255, 0]},
    {"id": 101, "name": "Microwave_oven", "isthing": 1, "color": [82, 0, 255]},
    {"id": 102, "name": "fan", "isthing": 1, "color": [163, 255, 0]},
    {"id": 103, "name": "curtain", "isthing": 0, "color": [255, 235, 0]},
    {"id": 104, "name": "textiles", "isthing": 0, "color": [8, 184, 170]},
    {"id": 105, "name": "clothes", "isthing": 0, "color": [133, 0, 255]},
    {"id": 106, "name": "painting_or_poster", "isthing": 1, "color": [0, 255, 92]},
    {"id": 107, "name": "mirror", "isthing": 1, "color": [184, 0, 255]},
    {"id": 108, "name": "flower_pot_or_vase", "isthing": 1, "color": [255, 0, 31]},
    {"id": 109, "name": "clock", "isthing": 1, "color": [0, 184, 255]},
    {"id": 110, "name": "book", "isthing": 0, "color": [0, 214, 255]},
    {"id": 111, "name": "tool", "isthing": 0, "color": [255, 0, 112]},
    {"id": 112, "name": "blackboard", "isthing": 0, "color": [92, 255, 0]},
    {"id": 113, "name": "tissue", "isthing": 0, "color": [0, 224, 255]},
    {"id": 114, "name": "screen_or_television", "isthing": 1, "color": [112, 224, 255]},
    {"id": 115, "name": "computer", "isthing": 1, "color": [70, 184, 160]},
    {"id": 116, "name": "printer", "isthing": 1, "color": [163, 0, 255]},
    {"id": 117, "name": "Mobile_phone", "isthing": 1, "color": [153, 0, 255]},
    {"id": 118, "name": "keyboard", "isthing": 1, "color": [71, 255, 0]},
    {"id": 119, "name": "other_electronic_product", "isthing": 0, "color": [255, 0, 163]},
    {"id": 120, "name": "fruit", "isthing": 0, "color": [255, 204, 0]},
    {"id": 121, "name": "food", "isthing": 0, "color": [255, 0, 143]},
    {"id": 122, "name": "instrument", "isthing": 1, "color": [0, 255, 235]},
    {"id": 123, "name": "train", "isthing": 1, "color": [133, 255, 0]}
]

# The thing / stuff tables are derived from CLASSES (keeping its order)
# instead of being maintained by hand, so the three tables cannot drift
# apart. Entries are deep-copied so the derived lists stay independent
# objects: mutating one table never leaks into another.
CLASSES_THING = [copy.deepcopy(itm) for itm in CLASSES if itm['isthing'] == 1]

CLASSES_STUFF = [copy.deepcopy(itm) for itm in CLASSES if itm['isthing'] == 0]

# stuff -> thing
# Constants consumed by vip2hb:
#   NO_OBJ        source id that is mapped to the ignore value
#   NO_OBJ_HB     ignore value written to the converted map
#   DIVISOR_PAN   source thing ids are class * DIVISOR_PAN + instance
#   DIVISOR_NEW   converted thing ids are class * DIVISOR_NEW + instance
#   NUM_STUFF     offset added to thing class indices (stuff comes first)
#   THING_B_STUFF must stay False; vip2hb asserts on it
NO_OBJ = 0
NO_OBJ_HB = 255
DIVISOR_PAN = 100
DIVISOR_NEW = 1000
NUM_THING = 58
NUM_STUFF = 66
THING_B_STUFF = False


def vip2hb(pan_map):
    """Remap a VIPSeg panoptic id map to the stuff-first id layout.

    Every distinct value of ``pan_map`` is translated as follows:

    * ``NO_OBJ`` and ``200`` become ``NO_OBJ_HB``;
    * values above 128 are split into ``class = value // DIVISOR_PAN`` and
      ``instance = value % DIVISOR_PAN``; the class is looked up among the
      thing classes, shifted by ``NUM_STUFF`` and re-packed as
      ``class * DIVISOR_NEW + instance``;
    * all remaining values are looked up among the stuff classes.

    Lookup keys are the ``CLASSES_*`` ids plus one.

    Args:
        pan_map (np.ndarray): Source panoptic id map.

    Returns:
        np.ndarray: Converted map, created with ``np.ones_like(pan_map)``
            (so it has the shape and dtype of the input).

    Raises:
        KeyError: If a value does not correspond to a known class.
    """
    assert not THING_B_STUFF, "VIPSeg only supports stuff -> thing"

    thing_lut = {}
    for new_idx, entry in enumerate(CLASSES_THING):
        thing_lut[entry['id'] + 1] = new_idx
    stuff_lut = {}
    for new_idx, entry in enumerate(CLASSES_STUFF):
        stuff_lut[entry['id'] + 1] = new_idx

    # Start from -1 everywhere so unconverted pixels can be detected below.
    converted = - np.ones_like(pan_map)
    for raw_id in np.unique(pan_map):
        if raw_id == NO_OBJ or raw_id == 200:
            new_value = NO_OBJ_HB
        elif raw_id > 128:
            # since stuff -> thing, thing classes follow the stuff classes
            thing_cls = thing_lut[raw_id // DIVISOR_PAN] + NUM_STUFF
            new_value = thing_cls * DIVISOR_NEW + raw_id % DIVISOR_PAN
        else:
            new_value = stuff_lut[raw_id]
        converted[pan_map == raw_id] = new_value

    assert -1. not in np.unique(converted)
    return converted


class SeqObj:
    """Hashable wrapper around a per-frame dict, identified by its
    ``seq_id`` and ``img_id`` entries.

    Two instances compare equal when both ids match, regardless of any
    other keys stored in the wrapped dicts. Comparing with an object that
    is not a ``SeqObj`` yields "not equal" instead of raising.
    """
    # This divisor is orthogonal with panoptic class-instance divisor.
    DIVISOR = 1000000

    def __init__(self, the_dict: Dict):
        """Wrap ``the_dict`` (kept by reference, not copied).

        Raises:
            AssertionError: If ``'seq_id'`` or ``'img_id'`` is missing.
        """
        self.dict = the_dict
        assert 'seq_id' in self.dict and 'img_id' in self.dict

    def __hash__(self):
        """Return ``seq_id * DIVISOR + img_id``."""
        return self.dict['seq_id'] * self.DIVISOR + self.dict['img_id']

    def __eq__(self, other):
        """Compare by ``(seq_id, img_id)``."""
        if not isinstance(other, SeqObj):
            # Let Python fall back to its default handling rather than
            # failing with AttributeError on `other.dict`.
            return NotImplemented
        return (self.dict['seq_id'] == other.dict['seq_id']
                and self.dict['img_id'] == other.dict['img_id'])

    def __getitem__(self, attr):
        """Return the wrapped dict's value for key ``attr``."""
        return self.dict[attr]


@DATASETS.register_module()
class VIPSegDVPSDataset:
    """VIPSeg clip dataset pairing frames with their panoptic masks.

    The constructor scans ``<data_root>/images`` and ``<data_root>/panomasks``
    for the video folders named in ``<data_root>/<split>.txt``, pairs each
    ``.jpg`` frame with the ``.png`` mask of the same stem, and groups frames
    into clips (a key frame plus, optionally, one reference frame of the same
    video). Indexing the dataset runs the configured pipeline on one clip.
    """
    # NOTE: without a trailing comma the parentheses are only grouping, so this
    # is the plain string 'dummy' rather than a one-element tuple.
    CLASSES = (
        'dummy'
    )

    def __init__(self,
                 pipeline=None,
                 data_root=None,
                 test_mode=False,
                 split='train',
                 ref_seq_index: List[int] = None,
                 is_instance_only: bool = True,
                 ):
        """Index the split and build the list of clips.

        Args:
            pipeline (list[dict]): Transform configs passed to ``Compose``.
            data_root (str): Dataset root; ``~`` is expanded. Must contain
                ``images/``, ``panomasks/`` and ``<split>.txt``.
            test_mode (bool): If True, ``__getitem__`` uses
                ``prepare_test_img``.
            split (str): Name of the split file (without ``.txt``).
            ref_seq_index (list[int] | None): Candidate frame offsets for the
                reference frame. ``None`` or empty means no reference frame.
            is_instance_only (bool): Forwarded to the pipeline through the
                ``'is_instance_only'`` key of each frame dict.
        """
        logger = get_root_logger()

        assert data_root is not None
        data_root = os.path.expanduser(data_root)
        img_root = os.path.join(data_root, 'images')
        seg_root = os.path.join(data_root, 'panomasks')
        assert os.path.exists(img_root)
        assert os.path.exists(seg_root)

        # read split file
        # each entry is prefixed with the image / mask root to form a folder path
        split_file = os.path.join(data_root, split + '.txt')
        video_folders = mmcv.list_from_file(split_file, prefix=img_root + '/')
        ann_folders = mmcv.list_from_file(split_file, prefix=seg_root + '/')
        logger.info("VIPSegDVPSDataset : There are totally {} videos in {} split.".format(len(video_folders), split))

        # 58 things and 66 stuff, totally 124 classes
        self.num_thing_classes = 58
        self.num_stuff_classes = 66
        assert len(CLASSES_THING) == self.num_thing_classes
        assert len(CLASSES_STUFF) == self.num_stuff_classes
        # a bare `CLASSES` inside a method resolves to the module-level name,
        # not to the class attribute above (presumably the full class table
        # defined earlier in this file)
        assert len(CLASSES) == self.num_thing_classes + self.num_stuff_classes
        self.thing_before_stuff = False

        # ref_seq_index is None means no ref img
        if ref_seq_index is None:
            ref_seq_index = []

        images = []
        # remember that both img_id and seq_id start from 0
        _tmp_seq_id = -1
        for vid_folder, ann_folder in zip(video_folders, ann_folders):
            assert os.path.basename(vid_folder) == os.path.basename(ann_folder)
            _tmp_seq_id += 1
            _tmp_img_id = -1
            # frames and masks are matched by position after sorting their file names
            imgs_cur = sorted(list(map(lambda x: str(x), mmcv.scandir(vid_folder, recursive=False, suffix='.jpg'))))
            pans_cur = sorted(list(map(lambda x: str(x), mmcv.scandir(ann_folder, recursive=False, suffix='.png'))))
            for img_cur, pan_cur in zip(imgs_cur, pans_cur):
                # the part of the file name before the first '.' must agree
                assert img_cur.split('.')[0] == pan_cur.split('.')[0]
                _tmp_img_id += 1
                seq_id = _tmp_seq_id
                img_id = _tmp_img_id
                item_full = os.path.join(vid_folder, img_cur)
                inst_map = os.path.join(ann_folder, pan_cur)
                images.append(SeqObj({
                    'seq_id': int(seq_id),
                    'img_id': int(img_id),
                    'img': item_full,
                    'ann': inst_map,
                    'no_obj_class': 255
                }))
                assert os.path.exists(images[-1]['img'])
                assert os.path.exists(images[-1]['ann'])

        # Warning from Haobo: the following codes are dangerous
        # because they rely on a consistent seed among different
        # processes. Please contact me before using it.
        # Lookup table from hash(SeqObj) to the frame, used to find reference frames.
        reference_images = {hash(image): image for image in images}
        sequences = []
        for img_cur in images:
            is_seq = True
            # a clip always starts with the key frame itself
            seq_now = [img_cur.dict]
            if ref_seq_index:
                # draw one offset at random and look up that frame in the same video
                for index in random.choices(ref_seq_index, k=1):
                    query_obj = SeqObj({
                        'seq_id': img_cur.dict['seq_id'],
                        'img_id': img_cur.dict['img_id'] + index
                    })
                    if hash(query_obj) in reference_images:
                        seq_now.append(reference_images[hash(query_obj)].dict)
                    else:
                        # the reference frame does not exist, so the clip is dropped
                        is_seq = False
                        break
            if is_seq:
                sequences.append(seq_now)

        self.sequences = sequences
        self.ref_seq_index = ref_seq_index
        logger.info("VIPSegDVPSDataset : There are totally {} clips in {} split for training.".format(
            len(self.sequences), split))

        # mmdet
        self.pipeline = Compose(pipeline)
        self.test_mode = test_mode

        # misc
        self.flag = self._set_groups()
        self.is_instance_only = is_instance_only

        # For evaluation
        self.max_ins = 1000
        self.no_obj_id = 255

    def pre_pipelines(self, results):
        """Add the keys the pipeline expects to every frame dict, in place.

        Args:
            results (list[dict]): Frame dicts of one clip.
        """
        for _results in results:
            _results['img_info'] = []
            # label range occupied by thing classes: [thing_lower, thing_upper)
            _results['thing_lower'] = 0 if self.thing_before_stuff else self.num_stuff_classes
            _results['thing_upper'] = self.num_thing_classes \
                if self.thing_before_stuff else self.num_stuff_classes + self.num_thing_classes
            _results['is_instance_only'] = self.is_instance_only
            _results['ori_filename'] = os.path.basename(_results['img'])
            _results['filename'] = _results['img']
            # converter for raw panoptic ids; presumably applied by the
            # annotation-loading transform - confirm against the pipeline
            _results['pre_hook'] = vip2hb

    def prepare_train_img(self, idx):
        """Get training data and annotations after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Output of the pipeline for a deep copy of clip ``idx``.
        """
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        return self.pipeline(results)

    def prepare_test_img(self, idx):
        """Get test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Output of the pipeline. Without reference offsets the
                pipeline receives the single frame dict instead of a list.
        """
        results = copy.deepcopy(self.sequences[idx])
        self.pre_pipelines(results)
        # During test time, single-image inference does not require a sequence
        if not self.ref_seq_index:
            results = results[0]
        return self.pipeline(results)

    def _rand_another(self, idx):
        """Get another random index from the same group as the given index."""
        pool = np.where(self.flag == self.flag[idx])[0]
        return np.random.choice(pool)

    # Copy and Modify from mmdet
    def __getitem__(self, idx):
        """Get training/test data after pipeline.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Test data if ``test_mode`` is set, otherwise training data.
                In training, a sample for which the pipeline returns ``None``
                is replaced by another randomly chosen sample.
        """

        if self.test_mode:
            return self.prepare_test_img(idx)
        else:
            while True:
                cur_data = self.prepare_train_img(idx)
                if cur_data is None:
                    idx = self._rand_another(idx)
                    continue
                return cur_data

    def __len__(self):
        """Total number of samples of data."""
        return len(self.sequences)

    def _set_groups(self):
        """Return the group flag array: every clip is put in group 0."""
        return np.zeros((len(self)), dtype=np.int64)

    # The evaluate func
    def evaluate(
            self,
            results,
            **kwargs
    ):
        """Evaluation is not implemented for this dataset.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError


if __name__ == '__main__':
    # Manual smoke test: build the train split with a full augmentation
    # pipeline and print every sample.
    # The modules below are imported for their side effects only (presumably
    # they register the pipeline transforms referenced by name further down).
    import dataset.dvps_pipelines.loading
    import dataset.dvps_pipelines.transforms
    import dataset.pipelines.transforms
    import dataset.pipelines.formatting
    import dataset.dvps_pipelines.tricks

    img_norm_cfg = dict(
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=False,
    )

    # Inference-style pipeline; defined for reference, not used below.
    test_pipeline = [
        dict(type='LoadMultiImagesDirect'),
        dict(type='SeqPadWithDepth', size_divisor=32),
        dict(type='SeqNormalize', **img_norm_cfg),
        dict(type='VideoCollect', keys=['img']),
        dict(type='ConcatVideoReferences'),
        dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
    ]

    _color_policy = [
        dict(type='ColorTransform', prob=0.5, level=3),
        dict(type='EqualizeTransform', prob=0.5),
        dict(type='BrightnessTransform', prob=0.5, level=3),
        dict(type='ContrastTransform', prob=0.5, level=3),
    ]
    _identity_policy = [
        dict(type='EqualizeTransform', prob=0),
    ]
    _auto_aug_polices = [_color_policy, _identity_policy]

    _train_keys = ['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']
    train_pipeline = [
        dict(type='LoadMultiImagesDirect'),
        dict(type='LoadMultiAnnotationsDirect', with_depth=False, vipseg=True),
        dict(type='SeqAutoAug', policies=_auto_aug_polices),
        dict(type='SeqResizeWithDepth', img_scale=(720, 100000), ratio_range=[1., 2.], keep_ratio=True),
        dict(type='SeqFlipWithDepth', flip_ratio=0.5),
        dict(type='SeqRandomCropWithDepth', crop_size=(736, 736), share_params=True),
        dict(type='SeqPadWithDepth', size_divisor=32),
        dict(type='SeqNormalize', **img_norm_cfg),
        dict(type='VideoCollect', keys=_train_keys),
        dict(type='ConcatVideoReferences'),
        dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
    ]

    data = VIPSegDVPSDataset(
        pipeline=train_pipeline,
        data_root="data/VIPSeg",
        test_mode=False,
        split='train',
        ref_seq_index=[-1, 1],
        is_instance_only=False,
    )
    # Print arrays / tensors compactly (shape and dtype instead of contents).
    np.set_string_function(lambda x: '<{} ; {}>'.format(x.shape, x.dtype))
    torch.set_printoptions(profile='short')
    for sample in data:
        print(sample)


================================================
FILE: knet/__init__.py
================================================


================================================
FILE: knet/cross_entropy_loss.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.models.builder import LOSSES
from mmdet.models.losses.utils import weight_reduce_loss


def cross_entropy(pred,
                  label,
                  weight=None,
                  reduction='mean',
                  avg_factor=None,
                  class_weight=None,
                  ignore_index=-100):
    """Softmax cross-entropy with optional sample weights and reduction.

    Args:
        pred (torch.Tensor): The prediction with shape (N, C), C is the number
            of classes.
        label (torch.Tensor): The learning label of the prediction.
        weight (torch.Tensor, optional): Sample-wise loss weight; cast to
            float before use.
        reduction (str, optional): The method used to reduce the loss.
        avg_factor (int, optional): Average factor that is used to average
            the loss. Defaults to None.
        class_weight (list[float], optional): The weight for each class.
        ignore_index (int, optional): Label value forwarded to
            ``F.cross_entropy`` as ``ignore_index``. Defaults to -100.

    Returns:
        torch.Tensor: The calculated loss
    """
    # unreduced loss; reduction is delegated to weight_reduce_loss below
    per_sample = F.cross_entropy(
        pred,
        label,
        weight=class_weight,
        reduction='none',
        ignore_index=ignore_index)

    sample_weight = None if weight is None else weight.float()
    return weight_reduce_loss(
        per_sample,
        weight=sample_weight,
        reduction=reduction,
        avg_factor=avg_factor)


def _expand_onehot_labels(labels, label_weights, label_channels):
    """Turn class-index labels into one-hot rows and broadcast the weights.

    Args:
        labels (torch.Tensor): Class indices. Entries outside
            ``[0, label_channels)`` produce an all-zero row.
        label_weights (torch.Tensor | None): Per-sample weights.
        label_channels (int): Number of columns of the one-hot output.

    Returns:
        tuple: ``(bin_labels, bin_label_weights)``. ``bin_labels`` has shape
            ``(labels.size(0), label_channels)`` and the dtype of ``labels``;
            ``bin_label_weights`` is ``None`` when no weights are given,
            otherwise the weights expanded to
            ``(label_weights.size(0), label_channels)``.
    """
    onehot = labels.new_full((labels.size(0), label_channels), 0)
    in_range = (labels >= 0) & (labels < label_channels)
    rows = torch.nonzero(in_range, as_tuple=False).squeeze()
    if rows.numel() > 0:
        onehot[rows, labels[rows]] = 1

    expanded_weights = None
    if label_weights is not None:
        num_rows = label_weights.size(0)
        expanded_weights = label_weights.view(-1, 1).expand(
            num_rows, label_channels)

    return onehot, expanded_weights


def binary_cross_entropy(pred,
                         label,
                         weight=None,
                         reduction='mean',
                         avg_factor=None,
                         class_weight=None):
    """Sigmoid (binary) cross-entropy on logits.

    When ``pred`` and ``label`` differ in number of dimensions, ``label`` is
    treated as class indices and expanded to one-hot form (and ``weight`` is
    expanded alongside) with ``pred.size(-1)`` channels.

    Args:
        pred (torch.Tensor): The prediction with shape (N, 1).
        label (torch.Tensor): The learning label of the prediction.
        weight (torch.Tensor, optional): Sample-wise loss weight.
        reduction (str, optional): The method used to reduce the loss.
            Options are "none", "mean" and "sum".
        avg_factor (int, optional): Average factor that is used to average
            the loss. Defaults to None.
        class_weight (list[float], optional): The weight for each class;
            passed to the underlying loss as ``pos_weight``.

    Returns:
        torch.Tensor: The calculated loss
    """
    needs_onehot = pred.dim() != label.dim()
    if needs_onehot:
        label, weight = _expand_onehot_labels(label, weight, pred.size(-1))

    sample_weight = weight.float() if weight is not None else None
    # element-wise loss first, then weighting + reduction in one helper call
    elementwise = F.binary_cross_entropy_with_logits(
        pred,
        label.float(),
        pos_weight=class_weight,
        reduction='none')
    return weight_reduce_loss(
        elementwise,
        sample_weight,
        reduction=reduction,
        avg_factor=avg_factor)


def mask_cross_entropy(pred,
                       target,
                       label,
                       reduction='mean',
                       avg_factor=None,
                       class_weight=None):
    """Binary cross-entropy between the per-class mask logits and targets.

    For every sample the mask channel given by ``label`` is selected from
    ``pred`` and compared with ``target``; the mean over all elements is
    returned.

    Args:
        pred (torch.Tensor): The prediction with shape (N, C, *), C is the
            number of classes. The trailing * indicates arbitrary shape.
        target (torch.Tensor): The learning label of the prediction.
        label (torch.Tensor): Class index of each sample, used to pick the
            channel of ``pred`` that is scored when the mask prediction is
            not class-agnostic.
        reduction (str, optional): Only ``'mean'`` is supported.
        avg_factor (int, optional): Must be None.
        class_weight (list[float], optional): Passed to the underlying loss
            as ``weight``.

    Returns:
        torch.Tensor: The calculated loss, with shape (1,).

    Example:
        >>> N, C = 3, 11
        >>> H, W = 2, 2
        >>> pred = torch.randn(N, C, H, W) * 1000
        >>> target = torch.rand(N, H, W)
        >>> label = torch.randint(0, C, size=(N,))
        >>> loss = mask_cross_entropy(pred, target, label)
        >>> assert loss.shape == (1,)
    """
    # TODO: handle these two reserved arguments
    assert reduction == 'mean' and avg_factor is None
    roi_index = torch.arange(
        0, pred.size()[0], dtype=torch.long, device=pred.device)
    # pick, for each sample, the channel of its own class
    selected = pred[roi_index, label].squeeze(1)
    mean_loss = F.binary_cross_entropy_with_logits(
        selected, target, weight=class_weight, reduction='mean')
    return mean_loss[None]


@LOSSES.register_module(force=True)
class CrossEntropyLoss(nn.Module):
    """Configurable cross-entropy loss (softmax, sigmoid or mask variant)."""

    def __init__(self,
                 use_sigmoid=False,
                 use_mask=False,
                 reduction='mean',
                 class_weight=None,
                 loss_weight=1.0):
        """CrossEntropyLoss.

        Args:
            use_sigmoid (bool, optional): Whether the prediction uses sigmoid
                or softmax. Defaults to False.
            use_mask (bool, optional): Whether to use mask cross entropy loss.
                Defaults to False. Cannot be combined with ``use_sigmoid``.
            reduction (str, optional): Default reduction. Defaults to 'mean'.
                Options are "none", "mean" and "sum".
            class_weight (list[float], optional): Weight of each class.
                Defaults to None.
            loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
        """
        super().__init__()
        assert (use_sigmoid is False) or (use_mask is False)
        self.use_sigmoid = use_sigmoid
        self.use_mask = use_mask
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.class_weight = class_weight

        # sigmoid takes precedence, then mask, otherwise plain softmax CE
        if self.use_sigmoid:
            criterion = binary_cross_entropy
        elif self.use_mask:
            criterion = mask_cross_entropy
        else:
            criterion = cross_entropy
        self.cls_criterion = criterion

    def forward(self,
                cls_score,
                label,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        """Forward function.

        Args:
            cls_score (torch.Tensor): The prediction.
            label (torch.Tensor): The learning label of the prediction.
            weight (torch.Tensor, optional): Sample-wise loss weight.
            avg_factor (int, optional): Average factor that is used to average
                the loss. Defaults to None.
            reduction_override (str, optional): Reduction used instead of the
                one configured at construction. Options are "none", "mean"
                and "sum".
            **kwargs: Forwarded to the selected criterion.

        Returns:
            torch.Tensor: The calculated loss, scaled by ``loss_weight``.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override or self.reduction

        class_weight = None
        if self.class_weight is not None:
            # materialise the weights with the dtype / device of the scores
            class_weight = cls_score.new_tensor(
                self.class_weight, device=cls_score.device)

        raw_loss = self.cls_criterion(
            cls_score,
            label,
            weight,
            class_weight=class_weight,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return self.loss_weight * raw_loss


================================================
FILE: knet/det/dice_loss.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.models.builder import LOSSES, build_loss
from mmdet.models.losses.utils import weighted_loss


@weighted_loss
def dice_loss(input, target, eps=1e-3, numerator_eps=0):
    """Per-sample Dice loss on flattened predictions and targets.

    Both tensors are flattened to ``(N, -1)``. For every sample the value is
    ``1 - (2 * sum(p * t) + numerator_eps) /
    ((sum(p * p) + eps) + (sum(t * t) + eps))``.

    Args:
        input (torch.Tensor): Predictions, first dimension is the sample axis.
        target (torch.Tensor): Targets with the same number of samples; cast
            to float.
        eps (float): Added to each of the two denominator terms.
        numerator_eps (float): Added to the numerator.

    Returns:
        torch.Tensor: Loss per sample, before the weighting / reduction
            applied by the ``weighted_loss`` decorator.
    """
    flat_pred = input.reshape(input.size()[0], -1)
    flat_gt = target.reshape(target.size()[0], -1).float()

    overlap = torch.sum(flat_pred * flat_gt, 1)
    pred_energy = torch.sum(flat_pred * flat_pred, 1) + eps
    gt_energy = torch.sum(flat_gt * flat_gt, 1) + eps
    dice = (2 * overlap + numerator_eps) / (pred_energy + gt_energy)
    return 1 - dice

#
# @LOSSES.register_module()
# class DiceLoss(nn.Module):
#
#     def __init__(self,
#                  eps=1e-3,
#                  numerator_eps=0.0,
#                  use_sigmoid=True,
#                  reduction='mean',
#                  loss_weight=1.0):
#         super(DiceLoss, self).__init__()
#         self.eps = eps
#         self.reduction = reduction
#         self.loss_weight = loss_weight
#         self.use_sigmoid = use_sigmoid
#         self.numerator_eps = numerator_eps
#
#     def forward(self,
#                 pred,
#                 target,
#                 weight=None,
#                 avg_factor=None,
#                 reduction_override=None,
#                 **kwargs):
#         if weight is not None and not torch.any(weight > 0):
#             return (pred * weight).sum()  # 0
#         assert reduction_override in (None, 'none', 'mean', 'sum')
#         reduction = (
#             reduction_override if reduction_override else self.reduction)
#         pred = pred.sigmoid()
#         loss = self.loss_weight * dice_loss(
#             pred,
#             target,
#             weight,
#             eps=self.eps,
#             numerator_eps=self.numerator_eps,
#             reduction=reduction,
#             avg_factor=avg_factor,
#             **kwargs)
#         return loss


================================================
FILE: knet/det/kernel_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (ConvModule, bias_init_with_prob, normal_init)
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
from mmdet.models.builder import HEADS, build_loss, build_neck
from mmdet.models.losses import accuracy
from mmdet.utils import get_root_logger


@HEADS.register_module()
class ConvKernelHead(nn.Module):

    def __init__(self,
                 num_proposals=100,
                 in_channels=256,
                 out_channels=256,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_seg_convs=1,
                 num_loc_convs=1,
                 att_dropout=False,
                 localization_fpn=None,
                 conv_kernel_size=1,
                 norm_cfg=dict(type='GN', num_groups=32),
                 semantic_fpn=True,
                 train_cfg=None,
                 num_classes=80,
                 xavier_init_kernel=False,
                 kernel_init_std=0.01,
                 use_binary=False,
                 proposal_feats_with_obj=False,
                 loss_mask=None,
                 loss_seg=None,
                 loss_cls=None,
                 loss_dice=None,
                 loss_rank=None,
                 feat_downsample_stride=1,
                 feat_refine_stride=1,
                 feat_refine=True,
                 with_embed=False,
                 feat_embed_only=False,
                 conv_normal_init=False,
                 mask_out_stride=4,
                 hard_target=False,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cat_stuff_mask=False,
                 **kwargs):
        """Store the configuration, build sub-modules and create the layers.

        All arguments are stored on the instance under the same name. The
        ones whose effect is visible in this class:

        Args:
            num_proposals (int): Number of initial kernels (output channels
                of ``init_kernels``).
            in_channels (int): Input channels of the feature convs.
            out_channels (int): Output channels of the feature convs.
            num_seg_convs (int): Number of 1x1 convs on the semantic branch.
            num_loc_convs (int): Number of 1x1 convs on the localization
                branch.
            localization_fpn (dict): Config passed to ``build_neck``.
            conv_kernel_size (int): Kernel size of ``init_kernels``.
            norm_cfg (dict): Norm config of the feature convs.
            semantic_fpn (bool): Whether the semantic branch (``conv_seg``)
                is used.
            train_cfg: Training config; when truthy, its ``assigner`` is
                built and a ``MaskPseudoSampler`` is created.
            num_classes (int): Output channels of ``conv_seg``.
            xavier_init_kernel (bool): Use Xavier-uniform instead of normal
                initialization for ``init_kernels``.
            kernel_init_std (float): Std of the normal initialization.
            use_binary (bool): Use hard 0/1 masks when pooling object
                features.
            proposal_feats_with_obj (bool): Add mask-pooled object features
                to the initial kernels.
            loss_mask, loss_seg, loss_cls, loss_dice, loss_rank (dict | None):
                Loss configs; each is built when given, else left as None.
            feat_downsample_stride (int): Upsampling factor applied to the
                predictions during training; values > 1 also enable the
                refine convs together with ``feat_refine``.
            feat_refine_stride (int): Stride of the refine convs.
            feat_refine (bool): Whether the refine convs are created.
            conv_normal_init (bool): Initialize the feature convs with a
                normal distribution in ``init_weights``.
            hard_target (bool): Binarize the ground-truth masks in training.
            num_thing_classes (int): Index from which ``conv_seg`` channels
                are treated as stuff.
            cat_stuff_mask (bool): Append the stuff masks / kernels to the
                proposals.
            **kwargs: Accepted and ignored.
        """
        super().__init__()

        # proposal / kernel configuration
        self.num_proposals = num_proposals
        self.conv_kernel_size = conv_kernel_size
        self.xavier_init_kernel = xavier_init_kernel
        self.kernel_init_std = kernel_init_std
        self.proposal_feats_with_obj = proposal_feats_with_obj
        self.use_binary = use_binary
        self.cat_stuff_mask = cat_stuff_mask

        # channel / layer configuration
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_heads = num_heads
        self.num_cls_fcs = num_cls_fcs
        self.num_loc_convs = num_loc_convs
        self.num_seg_convs = num_seg_convs
        self.norm_cfg = norm_cfg
        self.att_dropout = att_dropout
        self.semantic_fpn = semantic_fpn
        self.conv_normal_init = conv_normal_init
        self.with_embed = with_embed
        self.feat_embed_only = feat_embed_only

        # strides
        self.feat_downsample_stride = feat_downsample_stride
        self.feat_refine_stride = feat_refine_stride
        self.feat_refine = feat_refine
        self.mask_out_stride = mask_out_stride
        self.mask_assign_stride = mask_assign_stride

        # classes / labels
        self.num_classes = num_classes
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg
        self.hard_target = hard_target

        # training
        self.train_cfg = train_cfg
        self.sampling = False

        # sub-modules: the neck first, then the losses, then the layers
        self.localization_fpn = build_neck(localization_fpn)
        loss_cfgs = (
            ('loss_mask', loss_mask),
            ('loss_dice', loss_dice),
            ('loss_seg', loss_seg),
            ('loss_cls', loss_cls),
            ('loss_rank', loss_rank),
        )
        for attr_name, loss_cfg in loss_cfgs:
            built = loss_cfg if loss_cfg is None else build_loss(loss_cfg)
            setattr(self, attr_name, built)

        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # use PseudoSampler when sampling is False
            take_cfg_sampler = self.sampling and hasattr(
                self.train_cfg, 'sampler')
            if take_cfg_sampler:
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='MaskPseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self._init_layers()

    def _init_layers(self):
        """Create the initial-kernel conv, the semantic conv and feature convs.

        Layers created, in this order:

        * ``init_kernels``: bias-free conv whose weights are the initial
          kernels (one output channel per proposal).
        * ``conv_seg``: 1x1 semantic classifier, only with ``semantic_fpn``.
        * ``ins_downsample`` / ``seg_downsample``: 3x3 refine convs, only when
          ``feat_downsample_stride > 1`` and ``feat_refine``.
        * ``loc_convs`` / ``seg_convs``: stacks of 1x1 convs.
        """

        def feature_conv(kernel_size, **conv_kwargs):
            # all feature convs share channels and norm config
            return ConvModule(
                self.in_channels,
                self.out_channels,
                kernel_size,
                norm_cfg=self.norm_cfg,
                **conv_kwargs)

        self.init_kernels = nn.Conv2d(
            self.out_channels,
            self.num_proposals,
            self.conv_kernel_size,
            padding=int(self.conv_kernel_size // 2),
            bias=False)  # (N, C)

        if self.semantic_fpn:
            self.conv_seg = nn.Conv2d(self.out_channels, self.num_classes, 1)

        if self.feat_downsample_stride > 1 and self.feat_refine:
            self.ins_downsample = feature_conv(
                3, stride=self.feat_refine_stride, padding=1)
            self.seg_downsample = feature_conv(
                3, stride=self.feat_refine_stride, padding=1)

        self.loc_convs = nn.ModuleList(
            [feature_conv(1) for _ in range(self.num_loc_convs)])
        self.seg_convs = nn.ModuleList(
            [feature_conv(1) for _ in range(self.num_seg_convs)])

    def init_weights(self):
        """Initialize the neck, the feature convs, ``conv_seg`` and the kernels.

        * The neck initializes itself.
        * With ``feat_downsample_stride > 1`` and ``conv_normal_init``, every
          ``nn.Conv2d`` inside ``loc_convs`` / ``seg_convs`` gets a normal
          init with std 0.01.
        * ``conv_seg`` gets a normal init with std 0.01; when the segmentation
          loss uses sigmoid, its bias is set from a prior probability of 0.01.
        * ``init_kernels`` uses Xavier-uniform or a normal init with
          ``kernel_init_std``, depending on ``xavier_init_kernel``.
        """
        self.localization_fpn.init_weights()
        logger = get_root_logger()

        if self.feat_downsample_stride > 1 and self.conv_normal_init:
            logger.info('Initialize convs in KPN head by normal std 0.01')
            for branch in (self.loc_convs, self.seg_convs):
                for layer in branch.modules():
                    if isinstance(layer, nn.Conv2d):
                        normal_init(layer, std=0.01)

        if self.semantic_fpn:
            if self.loss_seg.use_sigmoid:
                normal_init(
                    self.conv_seg, std=0.01, bias=bias_init_with_prob(0.01))
            else:
                normal_init(self.conv_seg, mean=0, std=0.01)

        if self.xavier_init_kernel:
            logger.info('Initialize kernels by xavier uniform')
            nn.init.xavier_uniform_(self.init_kernels.weight)
            return

        logger.info(
            f'Initialize kernels by normal std: {self.kernel_init_std}')
        normal_init(self.init_kernels, mean=0, std=self.kernel_init_std)

    def _decode_init_proposals(self, img, img_metas):
        """Predict the initial masks and build the initial proposal kernels.

        Args:
            img: Input passed to ``self.localization_fpn``.
            img_metas (list): One entry per image; only its length is used.

        Returns:
            tuple: ``(proposal_feats, x_feats, mask_preds, cls_scores,
                seg_preds)``. ``proposal_feats`` are the ``init_kernels``
                weights expanded over the batch (plus mask-pooled object
                features when ``proposal_feats_with_obj``), ``x_feats`` is
                the localization feature (summed with the semantic feature
                when available), ``mask_preds`` the output of
                ``init_kernels``, ``cls_scores`` is always ``None`` and
                ``seg_preds`` the output of ``conv_seg`` or ``None``. In eval
                mode with ``cat_stuff_mask``, the stuff channels of
                ``seg_preds`` / ``conv_seg`` are appended to ``mask_preds`` /
                ``proposal_feats``.
        """
        num_imgs = len(img_metas)
        fpn_out = self.localization_fpn(img)
        # a list output carries separate [localization, semantic] features
        split_branches = isinstance(fpn_out, list)
        refine = self.feat_downsample_stride > 1 and self.feat_refine

        # thing branch
        loc_feats = fpn_out[0] if split_branches else fpn_out
        for layer in self.loc_convs:
            loc_feats = layer(loc_feats)
        if refine:
            loc_feats = self.ins_downsample(loc_feats)

        # init kernel prediction
        mask_preds = self.init_kernels(loc_feats)

        # stuff branch
        semantic_feats = None
        if self.semantic_fpn:
            semantic_feats = fpn_out[1] if split_branches else fpn_out
            for layer in self.seg_convs:
                semantic_feats = layer(semantic_feats)
            if refine:
                semantic_feats = self.seg_downsample(semantic_feats)

        seg_preds = None
        if semantic_feats is not None:
            seg_preds = self.conv_seg(semantic_feats)

        kernels = self.init_kernels.weight.clone()
        proposal_feats = kernels[None].expand(num_imgs, *kernels.size())

        if semantic_feats is None:
            x_feats = loc_feats
        else:
            x_feats = semantic_feats + loc_feats

        cls_scores = None

        if self.proposal_feats_with_obj:  # important use
            probs = mask_preds.sigmoid()
            foreground = probs > 0.5
            if self.use_binary:
                pool_weights = foreground.float()
            else:
                pool_weights = foreground.float() * probs
            # pool the features under each (thresholded) mask
            obj_feats = torch.einsum('bnhw, bchw->bnc', pool_weights, x_feats)
            proposal_feats = proposal_feats + obj_feats.view(
                num_imgs, self.num_proposals, self.out_channels, 1, 1)

        if self.cat_stuff_mask and not self.training:
            stuff_start = self.num_thing_classes
            mask_preds = torch.cat(
                [mask_preds, seg_preds[:, stuff_start:]], dim=1)
            stuff_kernels = self.conv_seg.weight[stuff_start:].clone()
            stuff_kernels = stuff_kernels[None].expand(
                num_imgs, *stuff_kernels.size())
            # (b, N_{st}+N_{th}, c)
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return proposal_feats, x_feats, mask_preds, cls_scores, seg_preds

    def forward_train(self,
                      img,
                      img_metas,
                      gt_masks,
                      gt_labels,
                      gt_sem_seg=None,
                      gt_sem_cls=None):
        """Forward function in training stage.

        Decodes the initial proposals, upsamples the predictions by
        ``feat_downsample_stride`` when it is > 1, assigns and samples
        targets per image, and computes the losses.

        Args:
            img: Input forwarded to ``_decode_init_proposals``.
            img_metas (list): Meta information, one entry per image.
            gt_masks (list): Ground-truth masks, one entry per image;
                binarized (``bool().float()``) when ``hard_target`` is set.
            gt_labels (list): Ground-truth labels, one entry per image.
            gt_sem_seg: Forwarded to ``get_targets``. Defaults to None.
            gt_sem_cls: Forwarded to ``get_targets``. Defaults to None.

        Returns:
            tuple: ``(losses, proposal_feats, x_feats, mask_preds,
                cls_scores)``. With ``cat_stuff_mask`` in training mode, the
                stuff channels of the semantic prediction and of
                ``conv_seg.weight`` are appended to ``mask_preds`` and
                ``proposal_feats`` respectively.
        """
        num_imgs = len(img_metas)
        results = self._decode_init_proposals(img, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = results
        if self.feat_downsample_stride > 1:
            scaled_mask_preds = F.interpolate(
                mask_preds,
                scale_factor=self.feat_downsample_stride,
                mode='bilinear',
                align_corners=False)
            # Bind the name on every path: without a semantic branch
            # (seg_preds is None) it used to stay undefined and the loss call
            # below raised UnboundLocalError.
            # NOTE(review): confirm `loss` tolerates seg_preds=None.
            scaled_seg_preds = None
            if seg_preds is not None:
                scaled_seg_preds = F.interpolate(
                    seg_preds,
                    scale_factor=self.feat_downsample_stride,
                    mode='bilinear',
                    align_corners=False)
        else:
            scaled_mask_preds = mask_preds  # thing
            scaled_seg_preds = seg_preds   # stuff

        if self.hard_target:
            gt_masks = [x.bool().float() for x in gt_masks]

        sampling_results = []
        if cls_scores is None:
            detached_cls_scores = [None] * num_imgs
        else:
            detached_cls_scores = cls_scores.detach()

        # assignment works on detached predictions, one image at a time
        for i in range(num_imgs):
            assign_result = self.assigner.assign(scaled_mask_preds[i].detach(),
                                                 detached_cls_scores[i],
                                                 gt_masks[i], gt_labels[i],
                                                 img_metas[i])
            sampling_result = self.sampler.sample(assign_result,
                                                  scaled_mask_preds[i],
                                                  gt_masks[i])
            sampling_results.append(sampling_result)

        mask_targets = self.get_targets(
            sampling_results,
            gt_masks,
            self.train_cfg,
            True,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls)

        losses = self.loss(scaled_mask_preds, cls_scores, scaled_seg_preds,
                           proposal_feats, *mask_targets)

        if self.cat_stuff_mask and self.training:
            mask_preds = torch.cat(
                [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
            stuff_kernels = self.conv_seg.weight[self.
                                                 num_thing_classes:].clone()
            stuff_kernels = stuff_kernels[None].expand(num_imgs,
                                                       *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return losses, proposal_feats, x_feats, mask_preds, cls_scores

    def loss(self,
             mask_pred,
             cls_scores,
             seg_preds,
             proposal_feats,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             seg_targets,
             reduction_override=None,
             **kwargs):
        """Compute the losses of the initial mask / class / semantic outputs.

        Args:
            mask_pred (Tensor): Mask logits. The first two dims are
                flattened into ``num_preds`` and the last two are (H, W).
            cls_scores (Tensor | None): Class scores sharing the first two
                dims of ``mask_pred``. The classification loss and accuracy
                are skipped when None.
            seg_preds (Tensor | None): Semantic logits with the class
                channel on dim 1. The segmentation loss is skipped when
                None.
            proposal_feats: Not used by this method.
            labels (Tensor): Flat label per prediction. Values in
                ``[0, self.num_classes)`` mark positives.
            label_weights (Tensor): Weights forwarded to ``self.loss_cls``.
            mask_targets (Tensor): Mask targets indexed like ``labels``.
            mask_weights: Not used by this method.
            seg_targets (Tensor): Semantic targets, flattened before being
                passed to ``self.loss_seg``.
            reduction_override: Forwarded to ``self.loss_cls``.

        Returns:
            dict: Loss terms keyed by name. The set of keys does not depend
            on whether the batch contains positive samples.
        """
        losses = dict()
        bg_class_ind = self.num_classes
        # note in sparse rcnn num_gt == num_pos
        pos_inds = (labels >= 0) & (labels < bg_class_ind)
        num_preds = mask_pred.shape[0] * mask_pred.shape[1]

        if cls_scores is not None:
            num_pos = pos_inds.sum().float()
            avg_factor = reduce_mean(num_pos)
            assert mask_pred.shape[0] == cls_scores.shape[0]
            assert mask_pred.shape[1] == cls_scores.shape[1]
            flat_cls_scores = cls_scores.view(num_preds, -1)
            losses['loss_rpn_cls'] = self.loss_cls(
                flat_cls_scores,
                labels,
                label_weights,
                avg_factor=avg_factor,
                reduction_override=reduction_override)
            losses['rpn_pos_acc'] = accuracy(flat_cls_scores[pos_inds],
                                             labels[pos_inds])

        bool_pos_inds = pos_inds.type(torch.bool)
        # 0~self.num_classes-1 are FG, self.num_classes is BG;
        # mask losses are only computed on FG samples.
        H, W = mask_pred.shape[-2:]
        if pos_inds.any():
            pos_mask_pred = mask_pred.reshape(num_preds, H, W)[bool_pos_inds]
            pos_mask_targets = mask_targets[bool_pos_inds]
            losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred,
                                                     pos_mask_targets)
            losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred,
                                                     pos_mask_targets)

            if self.loss_rank is not None:
                batch_size = mask_pred.size(0)
                # Per-pixel target holding the index of the positive
                # prediction covering it; uncovered pixels are ignored.
                rank_target = mask_targets.new_full((batch_size, H, W),
                                                    self.ignore_label,
                                                    dtype=torch.long)
                rank_inds = pos_inds.view(batch_size,
                                          -1).nonzero(as_tuple=False)
                batch_mask_targets = mask_targets.view(batch_size, -1, H,
                                                       W).bool()
                for i in range(batch_size):
                    curr_inds = (rank_inds[:, 0] == i)
                    curr_rank = rank_inds[:, 1][curr_inds]
                    for j in curr_rank:
                        rank_target[i][batch_mask_targets[i][j]] = j
                losses['loss_rpn_rank'] = self.loss_rank(
                    mask_pred, rank_target, ignore_index=self.ignore_label)

        else:
            # Zero-valued losses that still depend on ``mask_pred`` keep the
            # graph connected when there are no positives.
            losses['loss_rpn_mask'] = mask_pred.sum() * 0
            losses['loss_rpn_dice'] = mask_pred.sum() * 0
            if self.loss_rank is not None:
                # Must use the same key as the positive branch, otherwise the
                # loss dict changes shape between iterations / ranks.
                losses['loss_rpn_rank'] = mask_pred.sum() * 0

        if seg_preds is not None:
            cls_channel = seg_preds.shape[1]
            # (N, C, H, W) -> (N * H * W, C)
            flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute(
                0, 2, 1).reshape(-1, cls_channel)
            flatten_seg_target = seg_targets.view(-1)
            if self.loss_seg.use_sigmoid:
                # focal loss, normalised by the number of labelled pixels
                dense_pos = (flatten_seg_target >= 0) & (
                    flatten_seg_target < bg_class_ind)
                num_dense_pos = dense_pos.sum().float().clamp(min=1.0)
                losses['loss_rpn_seg'] = self.loss_seg(
                    flatten_seg,
                    flatten_seg_target,
                    avg_factor=num_dense_pos)
            else:
                # ce loss, background label is ignored
                losses['loss_rpn_seg'] = self.loss_seg(
                    flatten_seg,
                    flatten_seg_target,
                    ignore_index=self.num_classes)

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):
        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg
        H, W = pos_mask.shape[-2:]
        # original implementation uses new_zeros since BG are set to be 0
        # now use empty & fill because BG cat_id = num_classes,
        # FG cat_id = [0, num_classes-1]
        labels = pos_mask.new_full((num_samples, ),
                                   self.num_classes,
                                   dtype=torch.long)
        label_weights = pos_mask.new_zeros(num_samples)
        mask_targets = pos_mask.new_zeros(num_samples, H, W)
        mask_weights = pos_mask.new_zeros(num_samples, H, W)
        seg_targets = pos_mask.new_full((H, W),
                                        self.num_classes,
                                        dtype=torch.long)

        if gt_sem_cls is not None and gt_sem_seg is not None:
            gt_sem_seg = gt_sem_seg.bool()
            for sem_mask, sem_cls in zip(gt_sem_seg, gt_sem_cls):
                seg_targets[sem_mask] = sem_cls.long()

        if num_pos > 0:
            labels[pos_inds] = pos_gt_labels
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            mask_targets[pos_inds, ...] = pos_gt_mask
            mask_weights[pos_inds, ...] = 1
            for i in range(num_pos):
                seg_targets[pos_gt_mask[i].bool()] = pos_gt_labels[i]

        if num_neg > 0:
            label_weights[neg_inds] = 1.0

        return labels, label_weights, mask_targets, mask_weights, seg_targets

    def get_targets(self,
                    sampling_results,
                    gt_mask,
                    rpn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None):
        """Collect the training targets of every image in the batch.

        Args:
            sampling_results (list): One sampling result per image, each
                exposing ``pos_inds``, ``neg_inds``, ``pos_masks``,
                ``neg_masks``, ``pos_gt_masks`` and ``pos_gt_labels``.
            gt_mask: Not used by this method.
            rpn_train_cfg: Passed to ``_get_target_single`` as ``cfg``.
            concat (bool): If True, the per-image label/mask targets are
                concatenated along dim 0 and the semantic targets stacked.
            gt_sem_seg (list | None): Per-image semantic masks, or None.
            gt_sem_cls (list | None): Per-image semantic classes, or None.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights,
            seg_targets)``; tensors when ``concat`` is True, otherwise lists
            with one entry per image.
        """
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]

        # The placeholders must be as long as the batch: ``multi_apply`` is
        # map-based and stops at its shortest argument, so a fixed-size list
        # would silently drop images from larger batches.
        num_imgs = len(sampling_results)
        if gt_sem_seg is None:
            gt_sem_seg = [None] * num_imgs
            gt_sem_cls = [None] * num_imgs
        elif gt_sem_cls is None:
            gt_sem_cls = [None] * num_imgs

        results = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_mask_list,
            pos_gt_labels_list,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rpn_train_cfg)
        (labels, label_weights, mask_targets, mask_weights,
         seg_targets) = results
        if concat:
            labels = torch.cat(labels, 0)
            label_weights = torch.cat(label_weights, 0)
            mask_targets = torch.cat(mask_targets, 0)
            mask_weights = torch.cat(mask_weights, 0)
            seg_targets = torch.stack(seg_targets, 0)
        return labels, label_weights, mask_targets, mask_weights, seg_targets

    def simple_test_rpn(self, img, img_metas):
        """Test-time entry point; returns the decoded initial proposals."""
        proposals = self._decode_init_proposals(img, img_metas)
        return proposals

    def forward_dummy(self, img, img_metas):
        """Plain forward pass without losses.

        Intended for FLOPs counting; it simply decodes the initial proposals.
        """
        outputs = self._decode_init_proposals(img, img_metas)
        return outputs


================================================
FILE: knet/det/kernel_iter_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import build_assigner, build_sampler
from mmdet.models.builder import HEADS, build_head
from mmdet.models.roi_heads import BaseRoIHead

from .mask_pseudo_sampler import MaskPseudoSampler


@HEADS.register_module()
class KernelIterHead(BaseRoIHead):

    def __init__(self,
                 num_stages=6,
                 recursive=False,
                 assign_stages=5,
                 stage_loss_weights=(1, 1, 1, 1, 1, 1),
                 proposal_feature_channel=256,
                 merge_cls_scores=False,
                 do_panoptic=False,
                 post_assign=False,
                 hard_target=False,
                 merge_joint=False,
                 num_proposals=100,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 mask_head=dict(
                     type='KernelUpdateHead',
                     num_classes=80,
                     num_fcs=2,
                     num_heads=8,
                     num_cls_fcs=1,
                     num_reg_fcs=3,
                     feedforward_channels=2048,
                     hidden_channels=256,
                     dropout=0.0,
                     roi_feat_size=7,
                     ffn_act_cfg=dict(type='ReLU', inplace=True)),
                 mask_out_stride=4,
                 train_cfg=None,
                 test_cfg=None,
                 **kwargs):
        """Store the configuration and build the per-stage mask heads.

        Args:
            num_stages (int): Number of refinement stages; one mask head is
                built per stage.
            recursive (bool): If True, every stage reuses the first head.
            assign_stages (int): Stages with index below this value compute
                a fresh assignment during training; later stages reuse the
                most recent one.
            stage_loss_weights (tuple): One multiplier per stage applied to
                that stage's losses. Its length must equal ``num_stages``.
            proposal_feature_channel (int): Stored on the instance; not read
                elsewhere in this class.
            merge_cls_scores (bool): Stored on the instance; not read
                elsewhere in this class.
            do_panoptic (bool): If True, ``simple_test`` returns panoptic
                results via ``get_panoptic``.
            post_assign (bool): If True, training assigns targets with the
                current stage's predictions instead of the previous ones.
            hard_target (bool): If True, ground-truth masks are binarised
                before being used in training.
            merge_joint (bool): Selects ``merge_stuff_thing_stuff_joint``
                over ``merge_stuff_thing`` in ``get_panoptic``.
            num_proposals (int): Number of leading predictions treated as
                thing proposals.
            num_thing_classes (int): Number of thing classes.
            num_stuff_classes (int): Number of stuff classes.
            mask_assign_stride (int): Stored on the instance; not read
                elsewhere in this class.
            ignore_label (int): Stored on the instance; not read elsewhere
                in this class.
            thing_label_in_seg (int): Stored on the instance; not read
                elsewhere in this class.
            mask_head (dict | list[dict]): Config of the mask head(s).
            mask_out_stride (int): Stored on the instance; not read
                elsewhere in this class.
            train_cfg: Per-stage training configs (indexed by stage), or
                None at test time.
            test_cfg: Test-time config.
        """
        # NOTE(review): the default ``mask_head`` dict carries keys such as
        # ``num_fcs``, ``num_reg_fcs``, ``hidden_channels`` and
        # ``roi_feat_size`` that do not appear in the KernelUpdateHead
        # signature defined later in this dump -- confirm the default is
        # actually buildable, or that configs always override it.
        assert mask_head is not None
        assert len(stage_loss_weights) == num_stages
        # NOTE(review): attributes are set before ``super().__init__``;
        # presumably the base constructor calls the ``init_*`` hooks below,
        # which read ``num_stages`` / ``recursive`` -- keep this order.
        self.num_stages = num_stages
        self.stage_loss_weights = stage_loss_weights
        self.proposal_feature_channel = proposal_feature_channel
        self.merge_cls_scores = merge_cls_scores
        self.recursive = recursive
        self.post_assign = post_assign
        self.mask_out_stride = mask_out_stride
        self.hard_target = hard_target
        self.assign_stages = assign_stages
        self.do_panoptic = do_panoptic
        self.merge_joint = merge_joint
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        # Total class count covers both things and stuff.
        self.num_classes = self.num_thing_classes + self.num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.num_proposals = num_proposals
        self.ignore_label = ignore_label
        super(KernelIterHead, self).__init__(
            mask_head=mask_head, train_cfg=train_cfg, test_cfg=test_cfg, **kwargs)
        # train_cfg would be None when run the test.py
        if train_cfg is not None:
            # Every stage must use the pseudo sampler.
            for stage in range(num_stages):
                assert isinstance(
                    self.mask_sampler[stage], MaskPseudoSampler), \
                    'Sparse Mask only support `MaskPseudoSampler`'

    def init_bbox_head(self, mask_roi_extractor, mask_head):
        """No-op hook: this head builds no box branch.

        Args:
            mask_roi_extractor (dict): Ignored.
            mask_head (dict): Ignored.
        """
        return None

    def init_assigner_sampler(self):
        """Create one assigner and one sampler per training stage.

        Both lists stay empty when ``self.train_cfg`` is None.
        """
        self.mask_assigner = []
        self.mask_sampler = []
        if self.train_cfg is None:
            return

        for stage_idx, stage_cfg in enumerate(self.train_cfg):
            assigner = build_assigner(stage_cfg.assigner)
            self.mask_assigner.append(assigner)
            # The sampler receives this head as context, so expose the stage
            # being built before constructing it.
            self.current_stage = stage_idx
            sampler = build_sampler(stage_cfg.sampler, context=self)
            self.mask_sampler.append(sampler)

    def init_weights(self):
        """Initialise the weights of the mask head of every stage."""
        for stage in range(self.num_stages):
            stage_head = self.mask_head[stage]
            stage_head.init_weights()

    def init_mask_head(self, mask_roi_extractor, mask_head):
        """Build the mask head of every stage.

        Args:
            mask_roi_extractor (dict): Not used by this method.
            mask_head (dict | list[dict]): A single config, replicated for
                all stages, or one config per stage.
        """
        self.mask_head = nn.ModuleList()
        if isinstance(mask_head, list):
            head_cfgs = mask_head
        else:
            head_cfgs = [mask_head] * self.num_stages
        assert len(head_cfgs) == self.num_stages

        for head_cfg in head_cfgs:
            self.mask_head.append(build_head(head_cfg))

        if self.recursive:
            # Share the first head across all stages.
            for stage in range(self.num_stages):
                self.mask_head[stage] = self.mask_head[0]

    def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas):
        mask_head = self.mask_head[stage]
        cls_score, mask_preds, object_feats = mask_head(
            x, object_feats, mask_preds, img_metas=img_metas)
        if mask_head.mask_upsample_stride > 1 and (stage == self.num_stages - 1
                                                   or self.training):
            scaled_mask_preds = F.interpolate(
                mask_preds,
                scale_factor=mask_head.mask_upsample_stride,
                align_corners=False,
                mode='bilinear')
        else:
            scaled_mask_preds = mask_preds
        mask_results = dict(
            cls_score=cls_score,
            mask_preds=mask_preds,
            scaled_mask_preds=scaled_mask_preds,
            object_feats=object_feats)

        return mask_results

    def forward_train(self,
                      x,
                      proposal_feats,
                      mask_preds,
                      cls_score,
                      img_metas,
                      gt_masks,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      imgs_whwh=None,
                      gt_bboxes=None,
                      gt_sem_seg=None,
                      gt_sem_cls=None):
        """Run all stages in training mode and collect their losses.

        Args:
            x: Features passed to every stage's mask head.
            proposal_feats: Initial object features fed to the first stage.
            mask_preds: Initial mask predictions fed to the first stage.
            cls_score: Initial class scores used for the first assignment,
                or None.
            img_metas (list): One meta entry per image.
            gt_masks (list): Per-image ground-truth masks.
            gt_labels (list): Per-image ground-truth labels.
            gt_bboxes_ignore: Not used by this method.
            imgs_whwh: Forwarded to each head's ``loss``.
            gt_bboxes: Not used by this method.
            gt_sem_seg: Forwarded to each head's ``get_targets``.
            gt_sem_cls: Forwarded to each head's ``get_targets``.

        Returns:
            dict: Losses of all stages, keyed ``s{stage}_{name}`` and scaled
            by ``self.stage_loss_weights[stage]``.
        """

        num_imgs = len(img_metas)
        # Predictions used for assignment are detached and brought to the
        # same resolution as the stages' upsampled masks.
        if self.mask_head[0].mask_upsample_stride > 1:
            prev_mask_preds = F.interpolate(
                mask_preds.detach(),
                scale_factor=self.mask_head[0].mask_upsample_stride,
                mode='bilinear',
                align_corners=False)
        else:
            prev_mask_preds = mask_preds.detach()

        if cls_score is not None:
            prev_cls_score = cls_score.detach()
        else:
            # Placeholder so that per-image indexing below still works.
            prev_cls_score = [None] * num_imgs

        if self.hard_target:
            # Binarise the ground-truth masks.
            gt_masks = [x.bool().float() for x in gt_masks]
        else:
            gt_masks = gt_masks

        object_feats = proposal_feats
        all_stage_loss = {}
        all_stage_mask_results = []
        assign_results = []
        for stage in range(self.num_stages):
            mask_results = self._mask_forward(stage, x, object_feats,
                                              mask_preds, img_metas)
            all_stage_mask_results.append(mask_results)
            mask_preds = mask_results['mask_preds']
            scaled_mask_preds = mask_results['scaled_mask_preds']
            cls_score = mask_results['cls_score']
            object_feats = mask_results['object_feats']

            if self.post_assign:
                # Assign with this stage's own outputs.
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

            sampling_results = []
            # Stages at or beyond ``assign_stages`` keep the assignment of
            # the last stage that computed one.
            if stage < self.assign_stages:
                assign_results = []
            for i in range(num_imgs):
                if stage < self.assign_stages:
                    # Only the leading ``num_proposals`` predictions and the
                    # thing-class scores take part in the assignment.
                    mask_for_assign = prev_mask_preds[i][:self.num_proposals]
                    if prev_cls_score[i] is not None:
                        cls_for_assign = prev_cls_score[
                            i][:self.num_proposals, :self.num_thing_classes]
                    else:
                        cls_for_assign = None
                    assign_result = self.mask_assigner[stage].assign(
                        mask_for_assign, cls_for_assign, gt_masks[i],
                        gt_labels[i], img_metas[i])
                    assign_results.append(assign_result)
                sampling_result = self.mask_sampler[stage].sample(
                    assign_results[i], scaled_mask_preds[i], gt_masks[i])
                sampling_results.append(sampling_result)
            mask_targets = self.mask_head[stage].get_targets(
                sampling_results,
                gt_masks,
                gt_labels,
                self.train_cfg[stage],
                True,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls)

            single_stage_loss = self.mask_head[stage].loss(
                object_feats,
                cls_score,
                scaled_mask_preds,
                *mask_targets,
                imgs_whwh=imgs_whwh)
            for key, value in single_stage_loss.items():
                all_stage_loss[f's{stage}_{key}'] = value * \
                                    self.stage_loss_weights[stage]

            if not self.post_assign:
                # The next stage assigns with this stage's outputs.
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

        return all_stage_loss

    def simple_test(self,
                    x,
                    proposal_feats,
                    mask_preds,
                    cls_score,
                    img_metas,
                    imgs_whwh=None,
                    rescale=False):
        """Run all stages at test time and decode per-image results.

        The class scores of the last stage are activated with a sigmoid, or
        with a softmax whose last channel is dropped, depending on the last
        head's ``loss_cls.use_sigmoid``. With ``self.do_panoptic`` every
        image is decoded by ``get_panoptic``; otherwise the top
        ``self.test_cfg.max_per_img`` (prediction, class) pairs are handed to
        the last head's ``get_seg_masks``.

        Returns:
            list: One result per image.
        """
        num_imgs = len(img_metas)

        # Iteratively refine the kernels and masks.
        object_feats = proposal_feats
        for stage in range(self.num_stages):
            stage_out = self._mask_forward(stage, x, object_feats,
                                           mask_preds, img_metas)
            object_feats = stage_out['object_feats']
            cls_score = stage_out['cls_score']
            mask_preds = stage_out['mask_preds']
            scaled_mask_preds = stage_out['scaled_mask_preds']

        last_head = self.mask_head[-1]
        num_classes = last_head.num_classes

        if last_head.loss_cls.use_sigmoid:
            cls_score = cls_score.sigmoid()
        else:
            cls_score = cls_score.softmax(-1)[..., :-1]

        if self.do_panoptic:
            return [
                self.get_panoptic(cls_score[img_id],
                                  scaled_mask_preds[img_id], self.test_cfg,
                                  img_metas[img_id])
                for img_id in range(num_imgs)
            ]

        results = []
        for img_id in range(num_imgs):
            flat_scores = cls_score[img_id].flatten(0, 1)
            scores_per_img, topk_indices = flat_scores.topk(
                self.test_cfg.max_per_img, sorted=True)
            # Flat index -> (prediction index, class index).
            mask_indices = topk_indices // num_classes
            labels_per_img = topk_indices % num_classes
            masks_per_img = scaled_mask_preds[img_id][mask_indices]
            results.append(
                last_head.get_seg_masks(masks_per_img, labels_per_img,
                                        scores_per_img, self.test_cfg,
                                        img_metas[img_id]))
        return results

    def simple_test_mask_preds(self,
                    x,
                    proposal_feats,
                    mask_preds,
                    cls_score,
                    img_metas,
                    imgs_whwh=None,
                    rescale=False):
        """Run all stages at test time and return the raw last-stage outputs.

        Returns:
            tuple: ``(object_feats, cls_score, mask_preds,
            scaled_mask_preds)`` of the last stage, with ``cls_score``
            activated by a sigmoid, or by a softmax whose last channel is
            dropped, depending on the last head's ``loss_cls.use_sigmoid``.
        """
        num_imgs = len(img_metas)

        object_feats = proposal_feats
        for stage in range(self.num_stages):
            stage_out = self._mask_forward(stage, x, object_feats,
                                           mask_preds, img_metas)
            object_feats = stage_out['object_feats']
            cls_score = stage_out['cls_score']
            mask_preds = stage_out['mask_preds']
            scaled_mask_preds = stage_out['scaled_mask_preds']

        use_sigmoid = self.mask_head[-1].loss_cls.use_sigmoid
        if use_sigmoid:
            activated = cls_score.sigmoid()
        else:
            activated = cls_score.softmax(-1)[..., :-1]
        return object_feats, activated, mask_preds, scaled_mask_preds



    def aug_test(self, features, proposal_list, img_metas, rescale=False):
        """Test-time augmentation is not supported; always raises."""
        message = 'SparseMask does not support `aug_test`'
        raise NotImplementedError(message)

    def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
        """Dummy forward pass used when computing FLOPs.

        The initial masks are the batched product of the proposal features
        and the flattened feature map. Every stage is run on these same
        initial inputs; stage outputs are not chained.

        Returns:
            list[dict]: The ``_mask_forward`` result of every stage.
        """
        num_imgs = len(img_metas)
        num_proposals = proposal_feats.size(1)
        channels, height, width = x.shape[-3:]
        flat_feats = x.view(num_imgs, channels, -1)
        mask_preds = proposal_feats.bmm(flat_feats).view(
            num_imgs, num_proposals, height, width)
        return [
            self._mask_forward(stage, x, proposal_feats, mask_preds,
                               img_metas)
            for stage in range(self.num_stages)
        ]

    def get_panoptic(self, cls_scores, mask_preds, test_cfg, img_meta):
        """Decode the thing, stuff and merged panoptic results of one image.

        The first ``self.num_proposals`` predictions are treated as things
        and the remaining ones as stuff. Top-k selection of things uses
        ``self.test_cfg.max_per_img``, while mask thresholding and merging
        read the ``test_cfg`` argument.

        Returns:
            tuple: ``(bbox_result, segm_result, panoptic_result)``.
        """
        last_head = self.mask_head[-1]
        num_things = self.num_thing_classes
        num_props = self.num_proposals

        # ---- things: top-k over (proposal, thing class) pairs ----
        thing_cls = cls_scores[:num_props][:, :num_things]
        thing_mask_preds = mask_preds[:num_props]
        thing_scores, topk_indices = thing_cls.flatten(0, 1).topk(
            self.test_cfg.max_per_img, sorted=True)
        mask_indices = topk_indices // num_things
        thing_labels = topk_indices % num_things
        # resize mask predictions back
        thing_masks = last_head.rescale_masks(thing_mask_preds[mask_indices],
                                              img_meta)
        if not self.merge_joint:
            thing_masks = thing_masks > test_cfg.mask_thr
        bbox_result, segm_result = last_head.segm2result(
            thing_masks, thing_labels, thing_scores)

        # ---- stuff: the i-th stuff prediction is scored by class i ----
        stuff_cls = cls_scores[num_props:][:, num_things:]
        stuff_scores, stuff_inds = torch.sort(
            stuff_cls.diag(), descending=True)
        stuff_masks = last_head.rescale_masks(
            mask_preds[num_props:][stuff_inds], img_meta)
        if not self.merge_joint:
            stuff_masks = stuff_masks > test_cfg.mask_thr

        if self.merge_joint:
            stuff_labels = stuff_inds + num_things
            merge_fn = self.merge_stuff_thing_stuff_joint
        else:
            stuff_labels = stuff_inds + 1
            merge_fn = self.merge_stuff_thing
        panoptic_result = merge_fn(thing_masks, thing_labels, thing_scores,
                                   stuff_masks, stuff_labels, stuff_scores,
                                   test_cfg.merge_stuff_thing)
        return bbox_result, segm_result, panoptic_result

    def split_thing_stuff(self, mask_preds, det_labels, cls_scores):
        """Split predictions into a thing part and a stuff part.

        The first ``self.num_proposals`` entries are things; the rest are
        stuff, whose labels are shifted by ``1 - self.num_thing_classes``.

        Returns:
            tuple: ``(thing_masks, thing_labels, thing_scores, stuff_masks,
            stuff_labels, stuff_scores)``.
        """
        split = self.num_proposals
        thing_part = (mask_preds[:split], det_labels[:split],
                      cls_scores[:split])
        stuff_labels = det_labels[split:] - self.num_thing_classes + 1
        stuff_part = (mask_preds[split:], stuff_labels, cls_scores[split:])
        return thing_part + stuff_part

    def merge_stuff_thing(self,
                          thing_masks,
                          thing_labels,
                          thing_scores,
                          stuff_masks,
                          stuff_labels,
                          stuff_scores,
                          merge_cfg=None):
        """Paste thing and stuff masks into one panoptic map.

        Things are pasted first, from the highest score downwards, until a
        score falls below ``merge_cfg.instance_score_thr``. An instance is
        skipped if too much of it (ratio above ``merge_cfg.iou_thr``) is
        already occupied; otherwise it only claims free pixels. Stuff classes
        then fill the remaining free pixels in score order, each class once,
        and are skipped when the free area is below
        ``merge_cfg.stuff_max_area``.

        Returns:
            tuple: ``(panoptic_seg, segments_info)`` where ``panoptic_seg``
            is an int32 numpy array of segment ids (0 means unassigned) and
            ``segments_info`` is a list of dicts, one per segment.
        """
        height, width = thing_masks.shape[-2:]
        panoptic_seg = thing_masks.new_zeros((height, width),
                                             dtype=torch.int32)
        device = panoptic_seg.device
        thing_masks = thing_masks.to(dtype=torch.bool, device=device)
        stuff_masks = stuff_masks.to(dtype=torch.bool, device=device)

        segments_info = []
        current_segment_id = 0

        # Things: add instances one by one in descending score order.
        for inst_id in torch.argsort(-thing_scores):
            score = thing_scores[inst_id].item()
            if score < merge_cfg.instance_score_thr:
                break

            inst_mask = thing_masks[inst_id]  # H,W
            full_area = inst_mask.sum().item()
            if full_area == 0:
                continue

            occupied = (inst_mask > 0) & (panoptic_seg > 0)
            occupied_area = occupied.sum().item()
            if occupied_area * 1.0 / full_area > merge_cfg.iou_thr:
                continue

            if occupied_area > 0:
                # Keep only the pixels nobody has claimed yet.
                inst_mask = inst_mask & (panoptic_seg == 0)
            if inst_mask.sum().item() == 0:
                continue

            current_segment_id += 1
            panoptic_seg[inst_mask.bool()] = current_segment_id
            segments_info.append({
                'id': current_segment_id,
                'isthing': True,
                'score': score,
                'category_id': thing_labels[inst_id].item(),
                'instance_id': inst_id.item(),
            })

        # Stuff: fill the remaining empty area, best-scoring class first.
        score_order = torch.argsort(-stuff_scores)
        seen_labels = set()
        for label_tensor in stuff_labels[score_order]:
            semantic_label = label_tensor.item()
            if semantic_label in seen_labels:
                continue
            seen_labels.add(semantic_label)

            same_class = stuff_labels == semantic_label
            class_mask = stuff_masks[same_class].sum(0).bool()
            free_mask = class_mask & (panoptic_seg == 0)
            free_area = free_mask.sum().item()
            if free_area < merge_cfg.stuff_max_area:
                continue

            current_segment_id += 1
            panoptic_seg[free_mask] = current_segment_id
            segments_info.append({
                'id': current_segment_id,
                'isthing': False,
                'category_id': semantic_label,
                'area': free_area,
            })
        return panoptic_seg.cpu().numpy(), segments_info

    def merge_stuff_thing_stuff_joint(self,
                                      thing_masks,
                                      thing_labels,
                                      thing_scores,
                                      stuff_masks,
                                      stuff_labels,
                                      stuff_scores,
                                      merge_cfg=None):
        """Merge things and stuff jointly by per-pixel argmax.

        Every pixel goes to the mask with the largest ``score * mask`` value.
        Masks are then visited in descending score order: things scoring
        below ``merge_cfg.instance_score_thr`` are dropped, and so is any
        mask whose won area is less than ``merge_cfg.overlap_thr`` of its
        own ``>= 0.5`` area.

        Returns:
            tuple: ``(panoptic_seg, segments_info)`` where ``panoptic_seg``
            is an int32 numpy array of segment ids (0 means unassigned) and
            ``segments_info`` is a list of dicts, one per segment.
        """
        height, width = thing_masks.shape[-2:]
        panoptic_seg = thing_masks.new_zeros((height, width),
                                             dtype=torch.int32)

        all_masks = torch.cat([thing_masks, stuff_masks], dim=0)
        all_scores = torch.cat([thing_scores, stuff_scores], dim=0)
        all_labels = torch.cat([thing_labels, stuff_labels], dim=0)

        # Index of the winning mask at every pixel.
        weighted_masks = all_scores.view(-1, 1, 1) * all_masks
        winner_ids = weighted_masks.argmax(0)

        segments_info = []
        current_segment_id = 0
        for k in torch.argsort(-all_scores):
            pred_class = all_labels[k].item()
            isthing = pred_class < self.num_thing_classes
            if isthing and all_scores[k] < merge_cfg.instance_score_thr:
                continue

            mask = winner_ids == k
            mask_area = mask.sum().item()
            original_area = (all_masks[k] >= 0.5).sum().item()
            if mask_area <= 0 or original_area <= 0:
                continue
            if mask_area / original_area < merge_cfg.overlap_thr:
                continue

            current_segment_id += 1
            panoptic_seg[mask] = current_segment_id

            if isthing:
                segment = {
                    'id': current_segment_id,
                    'isthing': isthing,
                    'score': all_scores[k].item(),
                    'category_id': pred_class,
                    'instance_id': k.item(),
                }
            else:
                segment = {
                    'id': current_segment_id,
                    'isthing': isthing,
                    'category_id': pred_class - self.num_thing_classes + 1,
                    'area': mask_area,
                }
            segments_info.append(segment)

        return panoptic_seg.cpu().numpy(), segments_info


================================================
FILE: knet/det/kernel_update_head.py
================================================
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (ConvModule, bias_init_with_prob,
                      build_activation_layer, build_norm_layer)
from mmcv.runner import force_fp32
from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.dense_heads.atss_head import reduce_mean
from mmdet.models.losses import accuracy
from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention, build_transformer_layer
from mmdet.utils import get_root_logger


@HEADS.register_module()
class KernelUpdateHead(nn.Module):

    def __init__(self,
                 num_classes=80,
                 num_ffn_fcs=2,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_mask_fcs=3,
                 feedforward_channels=2048,
                 in_channels=256,
                 out_channels=256,
                 dropout=0.0,
                 mask_thr=0.5,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_act_cfg=dict(type='ReLU', inplace=True),
                 conv_kernel_size=3,
                 feat_transform_cfg=None,
                 hard_mask_thr=0.5,
                 kernel_init=False,
                 with_ffn=True,
                 mask_out_stride=4,
                 relative_coors=False,
                 relative_coors_off=False,
                 feat_gather_stride=1,
                 mask_transform_stride=1,
                 mask_upsample_stride=1,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 kernel_updator_cfg=dict(
                     type='DynamicConv',
                     in_channels=256,
                     feat_channels=64,
                     out_channels=256,
                     input_feat_shape=1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN')),
                 loss_rank=None,
                 loss_mask=dict(
                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
                 loss_dice=dict(type='DiceLoss', loss_weight=3.0),
                 loss_cls=dict(
                     type='FocalLoss',
                     use_sigmoid=True,
                     gamma=2.0,
                     alpha=0.25,
                     loss_weight=2.0)):
        """Build the layers and losses of one kernel update stage.

        Args:
            num_classes (int): Output size of ``fc_cls`` (one extra logit is
                added when ``loss_cls`` is not sigmoid based). Also used as
                the background label index in ``loss`` and
                ``_get_target_single``.
            num_ffn_fcs (int): Number of FC layers passed to the FFN.
            num_heads (int): Number of heads of the kernel self-attention.
            num_cls_fcs (int): Number of Linear-LN-activation groups in the
                classification branch.
            num_mask_fcs (int): Number of Linear-LN-activation groups in the
                mask-kernel branch.
            feedforward_channels (int): Hidden width passed to the FFN.
            in_channels (int): Channel width of the kernels and of every
                Linear/LN layer built here.
            out_channels (int): Output width of ``fc_mask``.
            dropout (float): Dropout passed to the attention and the FFN.
            mask_thr (float): Stored on the instance.
            act_cfg (dict): Activation config for the cls/mask branches.
            ffn_act_cfg (dict): Activation config for the FFN.
            conv_kernel_size (int): Spatial size K of each kernel; the
                attention embedding width is ``in_channels * K**2``.
            feat_transform_cfg (dict | None): Extra keyword arguments for the
                optional ``ConvModule`` applied to the input features. The
                ``'kernel_size'`` entry (default 1) is read from it. The dict
                passed by the caller is left untouched.
            hard_mask_thr (float): Threshold used in ``forward`` to binarize
                the sigmoid masks before feature gathering.
            kernel_init (bool): If True, ``init_weights`` re-initializes
                ``fc_mask.weight`` from a normal distribution (std 0.01).
            with_ffn (bool): Whether to build and apply the FFN.
            mask_out_stride, relative_coors, relative_coors_off,
            mask_upsample_stride, mask_assign_stride, ignore_label,
            thing_label_in_seg: Stored on the instance.
            feat_gather_stride (int): Stride (and ``stride // 2`` padding) of
                the optional feature transform conv.
            mask_transform_stride (int): When 2, ``forward`` upsamples the
                predicted masks by a factor of 2.
            num_thing_classes (int): Number of thing classes.
            num_stuff_classes (int): Number of stuff classes.
            kernel_updator_cfg (dict): Config handed to
                ``build_transformer_layer`` for the kernel update module.
            loss_rank (dict | None): Optional rank loss config.
            loss_mask (dict): Mask loss config.
            loss_dice (dict): Dice loss config.
            loss_cls (dict): Classification loss config.
        """
        super(KernelUpdateHead, self).__init__()
        self.num_classes = num_classes
        self.loss_cls = build_loss(loss_cls)
        self.loss_mask = build_loss(loss_mask)
        self.loss_dice = build_loss(loss_dice)
        if loss_rank is not None:
            self.loss_rank = build_loss(loss_rank)
        else:
            self.loss_rank = loss_rank

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.mask_thr = mask_thr
        self.fp16_enabled = False
        self.dropout = dropout

        self.num_heads = num_heads
        self.hard_mask_thr = hard_mask_thr
        self.kernel_init = kernel_init
        self.with_ffn = with_ffn
        self.mask_out_stride = mask_out_stride
        self.relative_coors = relative_coors
        self.relative_coors_off = relative_coors_off
        self.conv_kernel_size = conv_kernel_size
        self.feat_gather_stride = feat_gather_stride
        self.mask_transform_stride = mask_transform_stride
        self.mask_upsample_stride = mask_upsample_stride

        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg

        # Self-attention runs over flattened kernels of width C * K * K.
        self.attention = MultiheadAttention(
            in_channels * conv_kernel_size**2, num_heads, dropout)
        self.attention_norm = build_norm_layer(
            dict(type='LN'), in_channels * conv_kernel_size**2)[1]

        self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)

        if feat_transform_cfg is not None:
            # Pop from a shallow copy: popping from the caller's dict would
            # silently drop 'kernel_size' if the same config is reused to
            # build another head.
            feat_transform_cfg = dict(feat_transform_cfg)
            kernel_size = feat_transform_cfg.pop('kernel_size', 1)
            self.feat_transform = ConvModule(
                in_channels,
                in_channels,
                kernel_size,
                stride=feat_gather_stride,
                padding=int(feat_gather_stride // 2),
                **feat_transform_cfg)
        else:
            self.feat_transform = None

        if self.with_ffn:
            self.ffn = FFN(
                in_channels,
                feedforward_channels,
                num_ffn_fcs,
                act_cfg=ffn_act_cfg,
                dropout=dropout)
            self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

        # Classification branch: num_cls_fcs x (Linear -> LN -> activation).
        self.cls_fcs = nn.ModuleList()
        for _ in range(num_cls_fcs):
            self.cls_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            self.cls_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.cls_fcs.append(build_activation_layer(act_cfg))

        # Softmax-style losses need one extra logit for the background class.
        if self.loss_cls.use_sigmoid:
            self.fc_cls = nn.Linear(in_channels, self.num_classes)
        else:
            self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)

        # Mask-kernel branch: num_mask_fcs x (Linear -> LN -> activation).
        self.mask_fcs = nn.ModuleList()
        for _ in range(num_mask_fcs):
            self.mask_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            self.mask_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.mask_fcs.append(build_activation_layer(act_cfg))

        self.fc_mask = nn.Linear(in_channels, out_channels)

    def init_weights(self):
        """Initialize the head's parameters.

        Every parameter with more than one dimension is Xavier-uniform
        initialized. With a sigmoid classification loss, the bias of
        ``fc_cls`` is set to the value matching a prior probability of 0.01.
        When ``kernel_init`` is enabled, ``fc_mask.weight`` is then redrawn
        from a normal distribution with std 0.01.
        """
        for param in self.parameters():
            # One-dimensional parameters (e.g. the weight and bias of the
            # layer norms) keep their default initialization.
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

        if self.loss_cls.use_sigmoid:
            nn.init.constant_(self.fc_cls.bias, bias_init_with_prob(0.01))

        if not self.kernel_init:
            return

        get_root_logger().info(
            'mask kernel in mask head is normal initialized by std 0.01')
        nn.init.normal_(self.fc_mask.weight, mean=0, std=0.01)

    def forward(self,
                x,
                proposal_feat,
                mask_preds,
                prev_cls_score=None,
                mask_shape=None,
                img_metas=None):
        """Update the kernels with mask-gathered features and predict again.

        In the shape comments below ``B`` is the batch size (the local
        variable ``N``), ``N`` is the number of proposals (the local variable
        ``num_proposals``), ``C`` is ``in_channels`` and ``K`` is
        ``conv_kernel_size``.

        Args:
            x (Tensor): Feature map consumed as ``[B, C, H, W]`` by the
                einsum below.
            proposal_feat (Tensor): Current kernels; the first two dims are
                (batch, proposals) and the rest is reshaped to
                ``[in_channels, -1]``.
            mask_preds (Tensor): Mask logits of the previous stage, consumed
                as ``[B, N, h, w]``; resized to the feature resolution when
                it differs.
            prev_cls_score: Not used by this method.
            mask_shape (tuple | None): If given and its first entry differs
                from ``H``, the new mask predictions are resized to it.
            img_metas: Not used by this method.

        Returns:
            tuple: ``(cls_score, new_mask_preds, obj_feat)`` where
            ``cls_score`` is ``[B, N, -1]``, ``new_mask_preds`` is the new
            mask logits and ``obj_feat`` holds the updated kernels reshaped
            to ``[B, N, C, K, K]``.
        """

        N, num_proposals = proposal_feat.shape[:2]
        if self.feat_transform is not None:
            x = self.feat_transform(x)
        C, H, W = x.shape[-3:]

        # Bring the previous masks to the feature resolution if needed.
        mask_h, mask_w = mask_preds.shape[-2:]
        if mask_h != H or mask_w != W:
            gather_mask = F.interpolate(
                mask_preds, (H, W), align_corners=False, mode='bilinear')
        else:
            gather_mask = mask_preds

        # Binarize the masks: only pixels above hard_mask_thr contribute to
        # the gathered features.
        sigmoid_masks = gather_mask.sigmoid()
        nonzero_inds = sigmoid_masks > self.hard_mask_thr
        sigmoid_masks = nonzero_inds.float()

        # einsum is faster than bmm by 30%
        # Sum the features inside each binary mask -> [B, N, C].
        x_feat = torch.einsum('bnhw,bchw->bnc', sigmoid_masks, x)

        # obj_feat in shape [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C]
        proposal_feat = proposal_feat.reshape(N, num_proposals,
                                              self.in_channels,
                                              -1).permute(0, 1, 3, 2)
        obj_feat = self.kernel_update_conv(x_feat, proposal_feat)

        # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C]
        obj_feat = obj_feat.reshape(N, num_proposals,
                                    -1).permute(1, 0, 2)
        obj_feat = self.attention_norm(self.attention(obj_feat))
        # [N, B, K*K*C] -> [B, N, K*K*C]
        obj_feat = obj_feat.permute(1, 0, 2)

        # obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
        obj_feat = obj_feat.reshape(N, num_proposals, -1, self.in_channels)

        # FFN
        if self.with_ffn:
            obj_feat = self.ffn_norm(self.ffn(obj_feat))

        # The classification feature sums over the K*K kernel positions; the
        # mask branch keeps them.
        cls_feat = obj_feat.sum(-2)
        mask_feat = obj_feat

        for cls_layer in self.cls_fcs:
            cls_feat = cls_layer(cls_feat)
        for reg_layer in self.mask_fcs:
            mask_feat = reg_layer(mask_feat)

        cls_score = self.fc_cls(cls_feat).view(N, num_proposals, -1)
        # [B, N, K*K, C] -> [B, N, C, K*K]
        mask_feat = self.fc_mask(mask_feat).permute(0, 1, 3, 2)


        # Optionally predict masks at half resolution; H and W are rebound to
        # the reduced size in that case.
        if (self.mask_transform_stride == 2
                and self.feat_gather_stride == 1):
            mask_x = F.interpolate(
                x, scale_factor=0.5, mode='bilinear', align_corners=False)
            H, W = mask_x.shape[-2:]
        else:
            mask_x = x
        # group conv is 5x faster than unfold and uses about 1/5 memory
        # Group conv vs. unfold vs. concat batch, 2.9ms :13.5ms :3.8ms
        # Group conv vs. unfold vs. concat batch, 278 : 1420 : 369
        # fold_x = F.unfold(
        #     mask_x,
        #     self.conv_kernel_size,
        #     padding=int(self.conv_kernel_size // 2))
        # mask_feat = mask_feat.reshape(N, num_proposals, -1)
        # new_mask_preds = torch.einsum('bnc,bcl->bnl', mask_feat, fold_x)
        # [B, N, C, K*K] -> [B*N, C, K, K]
        mask_feat = mask_feat.reshape(N, num_proposals, C,
                                      self.conv_kernel_size,
                                      self.conv_kernel_size)
        # [B, C, H, W] -> [1, B*C, H, W]
        # Convolve each image with its own set of kernels, one image at a
        # time.
        new_mask_preds = []
        for i in range(N):
            new_mask_preds.append(
                F.conv2d(
                    mask_x[i:i + 1],
                    mask_feat[i],
                    padding=int(self.conv_kernel_size // 2)))

        new_mask_preds = torch.cat(new_mask_preds, dim=0)
        new_mask_preds = new_mask_preds.reshape(N, num_proposals, H, W)
        if self.mask_transform_stride == 2:
            new_mask_preds = F.interpolate(
                new_mask_preds,
                scale_factor=2,
                mode='bilinear',
                align_corners=False)

        # NOTE(review): only the height is compared, and H is the size before
        # the optional 2x upsampling above - confirm this is intended.
        if mask_shape is not None and mask_shape[0] != H:
            new_mask_preds = F.interpolate(
                new_mask_preds,
                mask_shape,
                align_corners=False,
                mode='bilinear')

        # Return the kernels in [B, N, C, K, K] layout.
        return cls_score, new_mask_preds, obj_feat.permute(0, 1, 3, 2).reshape(
            N, num_proposals, self.in_channels, self.conv_kernel_size,
            self.conv_kernel_size)

    @force_fp32(apply_to=('cls_score', 'mask_pred'))
    def loss(self,
             object_feats,
             cls_score,
             mask_pred,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             imgs_whwh=None,
             reduction_override=None,
             **kwargs):
        """Compute the classification, mask, dice and optional rank losses.

        Args:
            object_feats: Not used by this method.
            cls_score (Tensor | None): Classification logits whose first two
                dims are (batch, proposals). Skipped when None or empty.
            mask_pred (Tensor | None): Mask logits whose first two dims are
                (batch, proposals) and last two dims are (H, W). Skipped
                when None.
            labels (Tensor): Flattened labels; values in
                ``[0, num_classes)`` mark positive samples.
            label_weights (Tensor): Weights passed to the classification
                loss.
            mask_targets (Tensor): Flattened mask targets, indexed with the
                positive-sample mask.
            mask_weights: Not used by this method.
            imgs_whwh: Not used by this method.
            reduction_override: Forwarded to the classification loss.

        Returns:
            dict[str, Tensor]: ``loss_cls`` and ``pos_acc`` (when
            ``cls_score`` is usable), ``loss_mask`` and ``loss_dice`` (when
            ``mask_pred`` is given), plus ``loss_rank`` when a rank loss is
            configured.
        """

        losses = dict()
        bg_class_ind = self.num_classes
        # note in Sparse R-CNN num_gt == num_pos
        pos_inds = (labels >= 0) & (labels < bg_class_ind)
        num_pos = pos_inds.sum().float()
        avg_factor = reduce_mean(num_pos).clamp_(min=1.0)

        # The two predictions must agree on (batch, proposals). Check this
        # only when both are present, so the ``is not None`` guards below
        # can actually take effect.
        if cls_score is not None and mask_pred is not None:
            assert mask_pred.shape[0] == cls_score.shape[0]
            assert mask_pred.shape[1] == cls_score.shape[1]

        if cls_score is not None:
            if cls_score.numel() > 0:
                num_cls_preds = cls_score.shape[0] * cls_score.shape[1]
                flat_cls_score = cls_score.view(num_cls_preds, -1)
                losses['loss_cls'] = self.loss_cls(
                    flat_cls_score,
                    labels,
                    label_weights,
                    avg_factor=avg_factor,
                    reduction_override=reduction_override)
                losses['pos_acc'] = accuracy(
                    flat_cls_score[pos_inds], labels[pos_inds])
        if mask_pred is not None:
            num_preds = mask_pred.shape[0] * mask_pred.shape[1]
            bool_pos_inds = pos_inds.type(torch.bool)
            # 0~self.num_classes-1 are FG, self.num_classes is BG
            # mask losses are computed for the positive (FG) samples only.
            H, W = mask_pred.shape[-2:]
            if pos_inds.any():
                pos_mask_pred = mask_pred.reshape(num_preds, H,
                                                  W)[bool_pos_inds]
                pos_mask_targets = mask_targets[bool_pos_inds]
                losses['loss_mask'] = self.loss_mask(pos_mask_pred,
                                                     pos_mask_targets)
                losses['loss_dice'] = self.loss_dice(pos_mask_pred,
                                                     pos_mask_targets)

                if self.loss_rank is not None:
                    batch_size = mask_pred.size(0)
                    # Per-pixel target holding the index of the positive
                    # proposal whose target mask covers the pixel; uncovered
                    # pixels keep ignore_label.
                    rank_target = mask_targets.new_full((batch_size, H, W),
                                                        self.ignore_label,
                                                        dtype=torch.long)
                    rank_inds = pos_inds.view(batch_size,
                                              -1).nonzero(as_tuple=False)
                    batch_mask_targets = mask_targets.view(
                        batch_size, -1, H, W).bool()
                    for i in range(batch_size):
                        curr_inds = (rank_inds[:, 0] == i)
                        curr_rank = rank_inds[:, 1][curr_inds]
                        for j in curr_rank:
                            rank_target[i][batch_mask_targets[i][j]] = j
                    losses['loss_rank'] = self.loss_rank(
                        mask_pred, rank_target, ignore_index=self.ignore_label)
            else:
                # No positives: emit zero losses that stay connected to the
                # graph.
                losses['loss_mask'] = mask_pred.sum() * 0
                losses['loss_dice'] = mask_pred.sum() * 0
                if self.loss_rank is not None:
                    losses['loss_rank'] = mask_pred.sum() * 0

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):

        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg
        H, W = pos_mask.shape[-2:]
        # original implementation uses new_zeros since BG are set to be 0
        # now use empty & fill because BG cat_id = num_classes,
        # FG cat_id = [0, num_classes-1]
        labels = pos_mask.new_full((num_samples, ),
                                   self.num_classes,
                                   dtype=torch.long)
        label_weights = pos_mask.new_zeros((num_samples, self.num_classes))
        mask_targets = pos_mask.new_zeros(num_samples, H, W)
        mask_weights = pos_mask.new_zeros(num_samples, H, W)
        if num_pos > 0:
            labels[pos_inds] = pos_gt_labels
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            pos_mask_targets = pos_gt_mask
            mask_targets[pos_inds, ...] = pos_mask_targets
            mask_weights[pos_inds, ...] = 1

        if num_neg > 0:
            label_weights[neg_inds] = 1.0

        if gt_sem_cls is not None and gt_sem_seg is not None:
            sem_labels = pos_mask.new_full((self.num_stuff_classes, ),
                                           self.num_classes,
                                           dtype=torch.long)
            sem_targets = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            sem_weights = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            sem_stuff_weights = torch.eye(
                self.num_stuff_classes, device=pos_mask.device)
            sem_thing_weights = pos_mask.new_zeros(
                (self.num_stuff_classes, self.num_thing_classes))
            sem_label_weights = torch.cat(
                [sem_thing_weights, sem_stuff_weights], dim=-1)
            if len(gt_sem_cls > 0):
                sem_inds = gt_sem_cls - self.num_thing_classes
                sem_inds = sem_inds.long()
                sem_labels[sem_inds] = gt_sem_cls.long()
                sem_targets[sem_inds] = gt_sem_seg
                sem_weights[sem_inds] = 1

            label_weights[:, self.num_thing_classes:] = 0
            labels = torch.cat([labels, sem_labels])
            label_weights = torch.cat([label_weights, sem_label_weights])
            mask_targets = torch.cat([mask_targets, sem_targets])
            mask_weights = torch.cat([mask_weights, sem_weights])

        return labels, label_weights, mask_targets, mask_weights

    def get_targets(self,
                    sampling_results,
                    gt_mask,
                    gt_labels,
                    rcnn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None):
        """Build the training targets for a batch of sampling results.

        Args:
            sampling_results (list): Per-image sampling results providing
                ``pos_inds``, ``neg_inds``, ``pos_masks``, ``neg_masks``,
                ``pos_gt_masks`` and ``pos_gt_labels``.
            gt_mask: Not used by this method.
            gt_labels: Not used by this method.
            rcnn_train_cfg: Training config forwarded to
                ``_get_target_single`` as ``cfg``.
            concat (bool): Whether to concatenate the per-image targets
                along dim 0.
            gt_sem_seg (list[Tensor] | None): Per-image stuff masks. When
                None, no stuff targets are built (``gt_sem_cls`` is ignored
                as well).
            gt_sem_cls (list[Tensor] | None): Per-image stuff labels.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights)``,
            each a Tensor when ``concat`` is True, otherwise a per-image
            list.
        """
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
        if gt_sem_seg is None:
            # One placeholder per image. multi_apply maps over the zipped
            # argument lists, so a shorter placeholder list would silently
            # drop the targets of the remaining images.
            num_imgs = len(sampling_results)
            gt_sem_seg = [None] * num_imgs
            gt_sem_cls = [None] * num_imgs

        labels, label_weights, mask_targets, mask_weights = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_mask_list,
            pos_gt_labels_list,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rcnn_train_cfg)
        if concat:
            labels = torch.cat(labels, 0)
            label_weights = torch.cat(label_weights, 0)
            mask_targets = torch.cat(mask_targets, 0)
            mask_weights = torch.cat(mask_weights, 0)
        return labels, label_weights, mask_targets, mask_weights

    def rescale_masks(self, masks_per_img, img_meta):
        h, w, _ = img_meta['img_shape']
        masks_per_img = F.interpolate(
            masks_per_img.unsqueeze(0).sigmoid(),
            size=img_meta['batch_input_shape'],
            mode='bilinear',
            align_corners=False)

        masks_per_img = masks_per_img[:, :, :h, :w]
        ori_shape = img_meta['ori_shape']
        seg_masks = F.interpolate(
            masks_per_img,
            size=ori_shape[:2],
            mode='bilinear',
            align_corners=False).squeeze(0)
        return seg_masks

    def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
                      test_cfg, img_meta):
        # resize mask predictions back
        seg_masks = self.rescale_masks(masks_per_img, img_meta)
        seg_masks = seg_masks > test_cfg.mask_thr
        bbox_result, segm_result = self.segm2result(seg_masks, labels_per_img,
                                                    scores_per_img)
        return bbox_result, segm_result

    def segm2result(self, mask_preds, det_labels, cls_scores):
        num_classes = self.num_classes
        bbox_result = None
        segm_result = [[] for _ in range(num_classes)]
        mask_preds = mask_preds.cpu().numpy()
        det_labels = det_labels.cpu().numpy()
        cls_scores = cls_scores.cpu().numpy()
        num_ins = mask_preds.shape[0]
        # fake bboxes
        bboxes = np.zeros((num_ins, 5), dtype=np.float32)
        bboxes[:, -1] = cls_scores
        bbox_result = [bboxes[det_labels == i, :] for i in range(num_classes)]
        for idx in range(num_ins):
            segm_result[det_labels[idx]].append(mask_preds[idx])
        return bbox_result, segm_result


================================================
FILE: knet/det/knet.py
================================================
import torch
import torch.nn.functional as F
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import TwoStageDetector

from .utils import sem2ins_masks, sem2ins_masks_cityscapes, sem2ins_masks_kitti_step


@DETECTORS.register_module()
class KNet(TwoStageDetector):

    def __init__(self,
                 *args,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cityscapes=False,
                 kitti_step=False,
                 **kwargs):
        """Set up the detector and store the panoptic label settings.

        Positional and remaining keyword arguments are forwarded to
        ``TwoStageDetector``. An RPN head is mandatory.

        Args:
            num_thing_classes (int): Number of thing classes.
            num_stuff_classes (int): Number of stuff classes.
            mask_assign_stride (int): Downsampling factor applied to the
                ground-truth masks in ``forward_train``.
            ignore_label (int): Label written into the padded area of the
                semantic ground truth.
            thing_label_in_seg (int): Forwarded to ``sem2ins_masks``.
            cityscapes (bool): Use the Cityscapes semantic-to-instance
                conversion in ``forward_train``.
            kitti_step (bool): Use the KITTI-STEP conversion in
                ``forward_train`` and pad the test results in
                ``simple_test``.
        """
        super().__init__(*args, **kwargs)
        assert self.with_rpn, 'KNet does not support external proposals'

        # Label layout of the panoptic ground truth.
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label

        self.mask_assign_stride = mask_assign_stride

        # Dataset switches selecting the semantic-to-instance conversion.
        self.cityscapes = cityscapes
        self.kitti_step = kitti_step

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None,
                      gt_semantic_seg=None,
                      **kwargs):
        """Forward function of K-Net in train stage.

        The ground-truth instance masks (and, if given, the stuff masks
        derived from ``gt_semantic_seg``) are padded to the batch input size
        and downsampled by ``mask_assign_stride`` before being handed to the
        RPN head and the RoI head.

        Args:
            img (Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): list of image info dict. This method
                reads 'batch_input_shape' (from the first image) and
                'img_shape' (per image).
            gt_bboxes (list[Tensor]): Ground truth bboxes; only forwarded to
                the RoI head.
            gt_labels (list[Tensor]): class indices corresponding to each
                instance mask.
            gt_bboxes_ignore (None | list[Tensor]): only forwarded to the
                RoI head.
            gt_masks (list): Per-image ground-truth instance masks. Required.
            proposals: Must be None; external proposals are not supported.
            gt_semantic_seg (Tensor | None): Batched semantic ground truth,
                indexed as [image, :, row, col]. Its padded area is
                overwritten IN PLACE with ``ignore_label``.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        # Skip TwoStageDetector.forward_train and call the implementation of
        # the next class in the MRO.
        super(TwoStageDetector, self).forward_train(img, img_metas)
        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks_tensor = []
        gt_sem_seg = []
        gt_sem_cls = []
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_H = pad_H // self.mask_assign_stride
        assign_W = pad_W // self.mask_assign_stride

        for i, gt_mask in enumerate(gt_masks):
            mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
            # Zero-pad the instance masks on the right/bottom up to the batch
            # input size.
            if gt_mask.width != pad_W or gt_mask.height != pad_H:
                pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height)
                mask_tensor = F.pad(mask_tensor, pad_wh, value=0)

            if gt_semantic_seg is not None:
                # gt_semantic seg is padded by zero when forming a batch
                # need to convert them from 0 to ignore
                gt_semantic_seg[
                    i, :, img_metas[i]['img_shape'][0]:, :] = self.ignore_label
                gt_semantic_seg[
                    i, :, :, img_metas[i]['img_shape'][1]:] = self.ignore_label
                # Split the semantic map into per-class masks and labels with
                # the dataset-specific helper.
                if self.cityscapes:
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes)
                elif self.kitti_step:
                    # NOTE(review): label_shift and thing_label_in_seg are
                    # hard-coded here instead of read from the instance
                    # attributes - confirm they match the KITTI-STEP config.
                    sem_labels, sem_seg = sem2ins_masks_kitti_step(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=2,
                        thing_label_in_seg=(11, 13))
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                if sem_seg.shape[0] == 0:
                    # NOTE(review): the placeholder is sized by the number of
                    # INSTANCE masks (mask_tensor.size(0)), not by the (zero)
                    # number of stuff masks, so it can disagree in length
                    # with sem_labels - confirm downstream code tolerates
                    # this.
                    gt_sem_seg.append(
                        mask_tensor.new_zeros(
                            (mask_tensor.size(0), assign_H, assign_W)))
                else:
                    gt_sem_seg.append(
                        F.interpolate(
                            sem_seg[None], (assign_H, assign_W),
                            mode='bilinear',
                            align_corners=False)[0])
                gt_sem_cls.append(sem_labels)
            else:
                # No semantic ground truth: the heads receive None.
                gt_sem_seg = None
                gt_sem_cls = None

            # Downsample the instance masks to the assignment resolution; an
            # image without instances gets an empty (0, H, W) tensor.
            if mask_tensor.shape[0] == 0:
                gt_masks_tensor.append(
                    mask_tensor.new_zeros(
                        (mask_tensor.size(0), assign_H, assign_W)))
            else:
                gt_masks_tensor.append(
                    F.interpolate(
                        mask_tensor[None], (assign_H, assign_W),
                        mode='bilinear',
                        align_corners=False)[0])

        gt_masks = gt_masks_tensor
        x = self.extract_feat(img)
        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)
        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        losses = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

        # Merge the RPN losses into the RoI head losses.
        losses.update(rpn_losses)
        return losses

    def simple_test(self, img, img_metas, rescale=False):
        """Test function without test time augmentation.

        Args:
            img (torch.Tensor): Batched input images.
            img_metas (list[dict]): List of image information.
            rescale (bool): Whether to rescale the results.
                Defaults to False.

        Returns:
            list: The results returned by ``roi_head.simple_test``. In
            KITTI-STEP mode the first entry is extended with two trailing
            ``None`` values.
        """
        feats = self.extract_feat(img)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = self.rpn_head.simple_test_rpn(feats, img_metas)

        results = self.roi_head.simple_test(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            imgs_whwh=None,
            rescale=rescale)

        if not self.kitti_step:
            return results

        # Pad the first result tuple with two placeholders.
        first = results[0]
        results[0] = (*first, None, None)
        return results

    def forward_dummy(self, img):
        """Used for computing network flops.

        See `mmdetection/tools/get_flops.py`

        Runs the backbone, the RPN head and the mask prediction of the RoI
        head with dummy image metas and returns the RoI head outputs.
        """
        # backbone
        feats = self.extract_feat(img)

        # rpn
        # NOTE(review): the dummy img_shape is (3, H, W), while
        # KernelUpdateHead.rescale_masks unpacks 'img_shape' as (h, w, _) -
        # confirm nothing on this path relies on that ordering.
        spatial_size = img.shape[-2:]
        dummy_img_metas = [
            dict(img_shape=(3, *spatial_size)) for _ in range(len(img))
        ]
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = self.rpn_head.simple_test_rpn(feats, dummy_img_metas)

        # roi_head
        return self.roi_head.simple_test_mask_preds(
            x_feats, proposal_feats, mask_preds, cls_scores, dummy_img_metas)


================================================
FILE: knet/det/mask_hungarian_assigner.py
================================================
import numpy as np
import torch

from mmdet.core import AssignResult, BaseAssigner, reduce_mean
from mmdet.core.bbox.builder import BBOX_ASSIGNERS
from mmdet.core.bbox.match_costs.builder import MATCH_COST, build_match_cost

try:
    from scipy.optimize import linear_sum_assignment
except ImportError:
    linear_sum_assignment = None


@MATCH_COST.register_module()
class DiceCost(object):
    """DiceCost.

     Args:
         weight (int | float, optional): loss_weight
         pred_act (bool): Whether to activate the prediction
            before calculating cost
         act_mode (str): Activation used when `pred_act` is True:
            'sigmoid' applies a clamped sigmoid, any other value applies
            a softmax over dim 0.
         eps (float): Constant added to the squared-sum terms in the
            denominator of the dice coefficient.

     Examples:
         >>> import torch
         >>> self = DiceCost()
         >>> mask_preds = torch.ones(1, 2, 2)
         >>> gt_masks = torch.ones(1, 2, 2)
         >>> self(mask_preds, gt_masks)
         tensor([[-0.9998]])
    """

    def __init__(self,
                 weight=1.,
                 pred_act=False,
                 act_mode='sigmoid',
                 eps=1e-3):
        self.weight = weight
        self.pred_act = pred_act
        self.act_mode = act_mode
        self.eps = eps

    def dice_loss(cls, input, target, eps=1e-3):
        input = input.reshape(input.size()[0], -1)
        target = target.reshape(target.size()[0], -1).float()
        # einsum saves 10x memory
        # a = torch.sum(input[:, None] * target[None, ...], -1)
        a = torch.einsum('nh,mh->nm', input, target)
        b = torch.sum(input * input, 1) + eps
        c = torch.sum(target * target, 1) + eps
        d = (2 * a) / (b[:, None] + c[None, ...])
        # 1 is a constance that will not affect the matching, so ommitted
        return -d

    def __call__(self, mask_preds, gt_masks):
        """
        Args:
            bbox_pred (Tensor): Predicted boxes with normalized coordinates
                (cx, cy, w, h), which are all in range [0, 1]. Shape
                [num_query, 4].
            gt_bboxes (Tensor): Ground truth boxes with normalized
                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].

        Returns:
            torch.Tensor: bbox_cost value with weight
        """
        if self.pred_act and self.act_mode == 'sigmoid':
            mask_preds = mask_preds.sigmoid().clamp(min=0.001, max=1.0)
        elif self.pred_act:
            mask_preds = mask_preds.softmax(dim=0)
        # print("mask pred:", mask_preds)
        dice_cost = self.dice_loss(mask_preds, gt_masks, self.eps)
        return dice_cost * self.weight


@MATCH_COST.register_module()
class MaskCost(object):
    """MaskCost.

    Args:
        weight (int | float, optional): loss_weight
    """

    def __init__(self, weight=1., pred_act=False, act_mode='sigmoid'):
        self.weight = weight
        self.pred_act = pred_act
        self.act_mode = act_mode

    def __call__(self, cls_pred, target):
        """
        Args:
            cls_pred (Tensor): Predicted classification logits, shape
                [num_query, num_class].
            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).

        Returns:
            torch.Tensor: cls_cost value with weight
        """
        if self.pred_act and self.act_mode == 'sigmoid':
            cls_pred = cls_pred.sigmoid().clamp(min=0.01, max=1.0)
        elif self.pred_act:
            cls_pred = cls_pred.softmax(dim=0)
        num_proposals = cls_pred.shape[0]
        num_gts, H, W = target.shape
        # flatten_cls_pred = cls_pred.view(num_proposals, -1)
        # eingum is ~10 times faster than matmul
        pos_cost = torch.einsum('nhw,mhw->nm', cls_pred, target)
        neg_cost = torch.einsum('nhw,mhw->nm', 1 - cls_pred, 1 - target)
        # flatten_target = target.view(num_gts, -1).t()
        # pos_cost = flatten_cls_pred.matmul(flatten_target)
        # neg_cost = (1 - flatten_cls_pred).matmul(1 - flatten_target)
        cls_cost = -(pos_cost + neg_cost) / (H * W)
        return cls_cost * self.weight


@BBOX_ASSIGNERS.register_module()
class MaskHungarianAssigner(BaseAssigner):
    """Computes one-to-one matching between predictions and ground truth.

    This class computes an assignment between the targets and the predictions
    based on the costs. The total cost is the sum of up to four components:
    a classification cost, a mask cost, a dice cost and an optional boundary
    cost. The targets don't include the no_object, so generally there are
    more predictions than targets. After the matching, the un-matched
    predictions are treated as backgrounds. Thus each query prediction will
    be assigned with `0` or a positive integer indicating the ground truth
    index:

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        cls_cost (dict): Config of the classification match cost. It is
            skipped when its ``weight`` is 0 or when ``cls_pred`` is None.
        mask_cost (dict): Config of the mask match cost. Skipped when its
            ``weight`` is 0.
        dice_cost (dict): Config of the dice match cost. Skipped when its
            ``weight`` is 0.
            NOTE(review): the default is an empty dict, which presumably
            cannot be built by ``build_match_cost``; callers are expected to
            pass a real config - confirm.
        boundary_cost (dict, optional): Config of an extra boundary match
            cost. ``None`` disables it. Defaults to None.
        topk (int, optional): Number of Hungarian matching rounds. With
            ``topk > 1`` the matching is repeated and the prediction rows
            matched in a round get a cost of 1e10 for the following rounds,
            so every gt can be matched to several predictions.
            Defaults to 1.
    """

    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 mask_cost=dict(type='SigmoidCost', weight=1.0),
                 dice_cost=dict(),
                 boundary_cost=None,
                 topk=1):
        self.cls_cost = build_match_cost(cls_cost)
        self.mask_cost = build_match_cost(mask_cost)
        self.dice_cost = build_match_cost(dice_cost)
        if boundary_cost is not None:
            self.boundary_cost = build_match_cost(boundary_cost)
        else:
            self.boundary_cost = None
        self.topk = topk

    def assign(self,
               bbox_pred,
               cls_pred,
               gt_bboxes,
               gt_labels,
               img_meta=None,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Computes one-to-one matching based on the weighted costs.

        This method assign each query prediction to a ground truth or
        background. The `assigned_gt_inds` with -1 means don't care,
        0 means negative sample, and positive number is the index (1-based)
        of assigned gt.
        The assignment is done in the following steps, the order matters.

        1. assign every prediction to -1
        2. compute the weighted costs
        3. do Hungarian matching on CPU based on the costs
        4. assign all to 0 (background) first, then for each matched pair
           between predictions and gts, treat this prediction as foreground
           and assign the corresponding gt index (plus 1) to it.

        The parameter names are inherited from the bbox assigner interface;
        here ``bbox_pred`` / ``gt_bboxes`` are forwarded unchanged to the
        mask, dice and boundary costs.

        Args:
            bbox_pred (Tensor): Predictions to be matched; the first
                dimension is the number of queries. Presumably mask
                predictions of shape [num_query, H, W] - confirm against
                the configured costs.
            cls_pred (Tensor | None): Predicted classification scores passed
                to the classification cost. ``None`` disables that cost.
            gt_bboxes (Tensor): Ground truth targets; the first dimension is
                the number of gts. Presumably masks of shape
                [num_gt, H, W] - confirm against the configured costs.
            gt_labels (Tensor): Label of each ground truth, shape (num_gt,).
            img_meta (dict, optional): Meta information for current image.
                Unused here. Default None.
            gt_bboxes_ignore (Tensor, optional): Must be None; ignored
                ground truths are not supported. Default None.
            eps (int | float, optional): Unused here; kept for interface
                compatibility. Default 1e-7.

        Returns:
            :obj:`AssignResult`: The assigned result.

        Raises:
            ImportError: If scipy's ``linear_sum_assignment`` is unavailable.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)

        # 1. assign -1 by default
        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
                                              -1,
                                              dtype=torch.long)
        assigned_labels = bbox_pred.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)
        if num_gts == 0 or num_bboxes == 0:
            # No ground truth or boxes, return empty assignment
            if num_gts == 0:
                # No ground truth, assign all to background
                assigned_gt_inds[:] = 0
            return AssignResult(
                num_gts, assigned_gt_inds, None, labels=assigned_labels)

        # 2. compute the weighted costs; a component whose weight is zero
        # (or that is not configured) contributes a plain 0.
        if self.cls_cost.weight != 0 and cls_pred is not None:
            cls_cost = self.cls_cost(cls_pred, gt_labels)
        else:
            cls_cost = 0
        if self.mask_cost.weight != 0:
            reg_cost = self.mask_cost(bbox_pred, gt_bboxes)
        else:
            reg_cost = 0
        if self.dice_cost.weight != 0:
            dice_cost = self.dice_cost(bbox_pred, gt_bboxes)
        else:
            dice_cost = 0
        if self.boundary_cost is not None and self.boundary_cost.weight != 0:
            b_cost = self.boundary_cost(bbox_pred, gt_bboxes)
        else:
            b_cost = 0
        cost = cls_cost + reg_cost + dice_cost + b_cost

        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu()
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        if self.topk == 1:
            matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
        else:
            topk_matched_row_inds = []
            topk_matched_col_inds = []
            for _ in range(self.topk):
                matched_row_inds, matched_col_inds = linear_sum_assignment(
                    cost)
                topk_matched_row_inds.append(matched_row_inds)
                topk_matched_col_inds.append(matched_col_inds)
                # Make already matched predictions prohibitively expensive
                # so that the next round selects different ones.
                cost[matched_row_inds] = 1e10
            matched_row_inds = np.concatenate(topk_matched_row_inds)
            matched_col_inds = np.concatenate(topk_matched_col_inds)

        matched_row_inds = torch.from_numpy(matched_row_inds).to(
            bbox_pred.device)
        matched_col_inds = torch.from_numpy(matched_col_inds).to(
            bbox_pred.device)

        # 4. assign backgrounds and foregrounds
        # assign all indices to backgrounds first
        assigned_gt_inds[:] = 0
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]

        return AssignResult(
            num_gts, assigned_gt_inds, None, labels=assigned_labels)


================================================
FILE: knet/det/mask_pseudo_sampler.py
================================================
import torch

from mmdet.core.bbox import BaseSampler, SamplingResult
from mmdet.core.bbox.builder import BBOX_SAMPLERS


class MaskSamplingResult(SamplingResult):
    """Sampling result that stores masks instead of bounding boxes.

    The positive / negative predictions are selected from ``masks`` with the
    given index tensors, and the ground truth mask (plus label and, when
    available, ``pids``) matched to every positive prediction is gathered
    from the assign result.

    Args:
        pos_inds (Tensor): Indices of the positive predictions.
        neg_inds (Tensor): Indices of the negative predictions.
        masks (Tensor): All predicted masks, indexed along dim 0.
        gt_masks (Tensor): Ground truth masks, indexed along dim 0.
        assign_result (:obj:`AssignResult`): Provides ``gt_inds`` (1-based
            for positives), ``labels`` and the optional ``'pids'`` extra
            property.
        gt_flags (Tensor): Per-prediction flags, indexed with ``pos_inds``
            to build ``pos_is_gt``.
    """

    def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result,
                 gt_flags):
        self.pos_inds = pos_inds
        self.neg_inds = neg_inds
        self.pos_masks = masks[pos_inds]
        self.neg_masks = masks[neg_inds]
        self.pos_is_gt = gt_flags[pos_inds]

        self.num_gts = gt_masks.shape[0]
        # gt_inds is 1-based for positives; shift it to index gt_masks.
        self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1

        if gt_masks.numel() > 0:
            self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :]
        else:
            # hack for index error case
            assert self.pos_assigned_gt_inds.numel() == 0
            self.pos_gt_masks = torch.empty_like(gt_masks)

        gt_labels = assign_result.labels
        self.pos_gt_labels = (None
                              if gt_labels is None else gt_labels[pos_inds])

        extra_props = assign_result._extra_properties
        if 'pids' in extra_props:
            self.pos_gt_pids = extra_props['pids'][pos_inds]
        else:
            self.pos_gt_pids = None

    @property
    def masks(self):
        """torch.Tensor: positive masks followed by negative masks."""
        return torch.cat((self.pos_masks, self.neg_masks))

    def __nice__(self):
        """Return a printable summary with mask tensors shown as shapes."""
        summary = dict(self.info)
        for mask_key in ('pos_masks', 'neg_masks'):
            summary[mask_key] = summary[mask_key].shape
        entries = [
            f"'{key}': {value!r}" for key, value in sorted(summary.items())
        ]
        return '{\n' + '    ' + ',\n    '.join(entries) + '\n}'

    @property
    def info(self):
        """dict: the main attributes of this sampling result."""
        return dict(
            pos_inds=self.pos_inds,
            neg_inds=self.neg_inds,
            pos_masks=self.pos_masks,
            neg_masks=self.neg_masks,
            pos_is_gt=self.pos_is_gt,
            num_gts=self.num_gts,
            pos_assigned_gt_inds=self.pos_assigned_gt_inds,
        )


class MaskSamplingResultWithScore(SamplingResult):
    """Mask sampling result that additionally keeps per-prediction scores.

    Besides the positive / negative masks, ``scores`` is split with the same
    index tensors into ``pos_scores`` and ``neg_scores``. The ground truth
    mask (plus label and, when available, ``pids``) matched to every
    positive prediction is gathered from the assign result.

    Args:
        pos_inds (Tensor): Indices of the positive predictions.
        neg_inds (Tensor): Indices of the negative predictions.
        masks (Tensor): All predicted masks, indexed along dim 0.
        scores (Tensor): Scores of all predictions, indexed along dim 0.
        gt_masks (Tensor): Ground truth masks, indexed along dim 0.
        assign_result (:obj:`AssignResult`): Provides ``gt_inds`` (1-based
            for positives), ``labels`` and the optional ``'pids'`` extra
            property.
        gt_flags (Tensor): Per-prediction flags, indexed with ``pos_inds``
            to build ``pos_is_gt``.
    """

    def __init__(self, pos_inds, neg_inds, masks, scores, gt_masks, assign_result,
                 gt_flags):
        self.pos_inds = pos_inds
        self.neg_inds = neg_inds
        self.pos_masks = masks[pos_inds]
        self.neg_masks = masks[neg_inds]

        self.pos_scores = scores[pos_inds]
        self.neg_scores = scores[neg_inds]

        self.pos_is_gt = gt_flags[pos_inds]

        self.num_gts = gt_masks.shape[0]
        # gt_inds is 1-based for positives; shift it to index gt_masks.
        self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1

        if gt_masks.numel() > 0:
            self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :]
        else:
            # hack for index error case
            assert self.pos_assigned_gt_inds.numel() == 0
            self.pos_gt_masks = torch.empty_like(gt_masks)

        gt_labels = assign_result.labels
        self.pos_gt_labels = (None
                              if gt_labels is None else gt_labels[pos_inds])

        extra_props = assign_result._extra_properties
        if 'pids' in extra_props:
            self.pos_gt_pids = extra_props['pids'][pos_inds]
        else:
            self.pos_gt_pids = None

    @property
    def masks(self):
        """torch.Tensor: positive masks followed by negative masks."""
        return torch.cat((self.pos_masks, self.neg_masks))

    def __nice__(self):
        """Return a printable summary with mask tensors shown as shapes."""
        summary = dict(self.info)
        for mask_key in ('pos_masks', 'neg_masks'):
            summary[mask_key] = summary[mask_key].shape
        entries = [
            f"'{key}': {value!r}" for key, value in sorted(summary.items())
        ]
        return '{\n' + '    ' + ',\n    '.join(entries) + '\n}'

    @property
    def info(self):
        """dict: the main attributes of this sampling result.

        The scores are not part of the returned dictionary.
        """
        return dict(
            pos_inds=self.pos_inds,
            neg_inds=self.neg_inds,
            pos_masks=self.pos_masks,
            neg_masks=self.neg_masks,
            pos_is_gt=self.pos_is_gt,
            num_gts=self.num_gts,
            pos_assigned_gt_inds=self.pos_assigned_gt_inds,
        )

@BBOX_SAMPLERS.register_module()
class MaskPseudoSampler(BaseSampler):
    """Pseudo sampler for masks: keeps every assigned prediction as-is."""

    def __init__(self, **kwargs):
        """Accept and ignore any config; the parent init is not called."""

    def _sample_pos(self, **kwargs):
        """Not supported: no positive sub-sampling is performed."""
        raise NotImplementedError

    def _sample_neg(self, **kwargs):
        """Not supported: no negative sub-sampling is performed."""
        raise NotImplementedError

    def sample(self, assign_result, masks, gt_masks, **kwargs):
        """Split all predictions into positives and negatives.

        Predictions whose assigned gt index is greater than 0 are positives,
        those whose index equals 0 are negatives.

        Args:
            assign_result (:obj:`AssignResult`): Assigned results
            masks (torch.Tensor): Predicted masks
            gt_masks (torch.Tensor): Ground truth masks

        Returns:
            :obj:`MaskSamplingResult`: sampler results
        """
        is_foreground = assign_result.gt_inds > 0
        is_background = assign_result.gt_inds == 0
        pos_inds = torch.nonzero(is_foreground, as_tuple=False)
        pos_inds = pos_inds.squeeze(-1).unique()
        neg_inds = torch.nonzero(is_background, as_tuple=False)
        neg_inds = neg_inds.squeeze(-1).unique()
        # None of the predictions is a ground truth, so every flag is zero.
        gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8)
        return MaskSamplingResult(pos_inds, neg_inds, masks, gt_masks,
                                  assign_result, gt_flags)


@BBOX_SAMPLERS.register_module()
class MaskScorePseudoSampler(BaseSampler):
    """Pseudo sampler for masks that also forwards prediction scores."""

    def __init__(self, **kwargs):
        """Accept and ignore any config; the parent init is not called."""

    def _sample_pos(self, **kwargs):
        """Not supported: no positive sub-sampling is performed."""
        raise NotImplementedError

    def _sample_neg(self, **kwargs):
        """Not supported: no negative sub-sampling is performed."""
        raise NotImplementedError

    def sample(self, assign_result, masks, score, gt_masks, **kwargs):
        """Split all predictions into positives and negatives.

        Predictions whose assigned gt index is greater than 0 are positives,
        those whose index equals 0 are negatives.

        Args:
            assign_result (:obj:`AssignResult`): Assigned results
            masks (torch.Tensor): Predicted masks
            score (torch.Tensor): Scores of the predictions, indexed along
                dim 0 like ``masks``
            gt_masks (torch.Tensor): Ground truth masks

        Returns:
            :obj:`MaskSamplingResultWithScore`: sampler results
        """
        is_foreground = assign_result.gt_inds > 0
        is_background = assign_result.gt_inds == 0
        pos_inds = torch.nonzero(is_foreground, as_tuple=False)
        pos_inds = pos_inds.squeeze(-1).unique()
        neg_inds = torch.nonzero(is_background, as_tuple=False)
        neg_inds = neg_inds.squeeze(-1).unique()
        # None of the predictions is a ground truth, so every flag is zero.
        gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8)
        return MaskSamplingResultWithScore(pos_inds, neg_inds, masks, score,
                                           gt_masks, assign_result, gt_flags)

================================================
FILE: knet/det/msdeformattn_decoder.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (Conv2d, ConvModule, caffe2_xavier_init,
                      normal_init, xavier_init)
from mmdet.models.builder import NECKS

from mmcv.cnn.bricks.transformer import (build_positional_encoding,
                                         build_transformer_layer_sequence)
from mmcv.runner import BaseModule, ModuleList

from mmdet.core.anchor import MlvlPointGenerator
from mmdet.models.utils.transformer import MultiScaleDeformableAttention


@NECKS.register_module()
class MSDeformAttnPixelDecoder(BaseModule):
    """Pixel decoder with multi-scale deformable attention.

    The last ``num_levels`` input feature maps (``num_levels`` is read from
    the encoder's attention config) are projected to ``feat_channels`` and
    refined jointly by the transformer encoder. The remaining input levels
    are merged in an FPN-like top-down pathway, and a final 1x1 convolution
    produces the mask feature from the last (top-down) output.

    Args:
        in_channels (list[int] | tuple[int]): Number of channels in the
            input feature maps.
        strides (list[int] | tuple[int]): Output strides of feature from
            backbone.
        feat_channels (int): Number of channels for feature.
        out_channels (int): Number of channels for output.
        num_outs (int): Number of output scales.
        return_one_list (bool): Stored on the instance; it is not read
            anywhere else in this class.
        norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization.
            Defaults to dict(type='GN', num_groups=32).
        act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation.
            Defaults to dict(type='ReLU').
        encoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer
            encoder. Defaults to `DetrTransformerEncoder`.
        positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
            transformer encoder position encoding. Defaults to
            dict(type='SinePositionalEncoding', num_feats=128,
            normalize=True).
        init_cfg (:obj:`mmcv.ConfigDict` | dict): Initialization config dict.
    """

    def __init__(self,
                 in_channels=[256, 512, 1024, 2048],
                 strides=[4, 8, 16, 32],
                 feat_channels=256,
                 out_channels=256,
                 num_outs=3,
                 return_one_list=True,
                 norm_cfg=dict(type='GN', num_groups=32),
                 act_cfg=dict(type='ReLU'),
                 encoder=dict(
                     type='DetrTransformerEncoder',
                     num_layers=6,
                     transformerlayers=dict(
                         type='BaseTransformerLayer',
                         attn_cfgs=dict(
                             type='MultiScaleDeformableAttention',
                             embed_dims=256,
                             num_heads=8,
                             num_levels=3,
                             num_points=4,
                             im2col_step=64,
                             dropout=0.0,
                             batch_first=False,
                             norm_cfg=None,
                             init_cfg=None),
                         feedforward_channels=1024,
                         ffn_dropout=0.0,
                         operation_order=('self_attn', 'norm', 'ffn', 'norm')),
                     init_cfg=None),
                 positional_encoding=dict(
                     type='SinePositionalEncoding',
                     num_feats=128,
                     normalize=True),
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        self.strides = strides
        self.num_input_levels = len(in_channels)
        self.return_one_list = return_one_list
        # NOTE(review): attribute-style access is used here, which a plain
        # ``dict`` (including the default value above) does not support;
        # ``encoder`` is presumably always passed as an mmcv ConfigDict -
        # confirm.
        self.num_encoder_levels = \
            encoder.transformerlayers.attn_cfgs.num_levels
        assert self.num_encoder_levels >= 1, \
            'num_levels in attn_cfgs must be at least one'
        # 1x1 projections for the levels fed to the encoder; index 0 of
        # ``input_convs`` belongs to the last input level.
        input_conv_list = []
        # from top to down (low to high resolution)
        for i in range(self.num_input_levels - 1,
                       self.num_input_levels - self.num_encoder_levels - 1,
                       -1):
            input_conv = ConvModule(
                in_channels[i],
                feat_channels,
                kernel_size=1,
                norm_cfg=norm_cfg,
                act_cfg=None,
                bias=True)
            input_conv_list.append(input_conv)
        self.input_convs = ModuleList(input_conv_list)

        self.encoder = build_transformer_layer_sequence(encoder)
        # The attribute name (with its spelling) is part of the module's
        # attribute interface and is used as-is in ``forward``.
        self.postional_encoding = build_positional_encoding(
            positional_encoding)
        # high resolution to low resolution
        self.level_encoding = nn.Embedding(self.num_encoder_levels,
                                           feat_channels)

        # fpn-like structure
        self.lateral_convs = ModuleList()
        self.output_convs = ModuleList()
        # Convolutions only carry a bias when no normalization follows them.
        self.use_bias = norm_cfg is None
        # from top to down (low to high resolution)
        # fpn for the rest features that didn't pass in encoder
        for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1,
                       -1):
            lateral_conv = ConvModule(
                in_channels[i],
                feat_channels,
                kernel_size=1,
                bias=self.use_bias,
                norm_cfg=norm_cfg,
                act_cfg=None)
            output_conv = ConvModule(
                feat_channels,
                feat_channels,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=self.use_bias,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)
            self.lateral_convs.append(lateral_conv)
            self.output_convs.append(output_conv)

        # 1x1 convolution that produces the final mask feature.
        self.mask_feature = Conv2d(
            feat_channels, out_channels, kernel_size=1, stride=1, padding=0)

        self.num_outs = num_outs
        self.point_generator = MlvlPointGenerator(strides)

    def init_weights(self):
        """Initialize weights."""
        # Projections of the encoder input levels.
        for i in range(0, self.num_encoder_levels):
            xavier_init(
                self.input_convs[i].conv,
                gain=1,
                bias=0,
                distribution='uniform')

        # Lateral / output convolutions of the FPN-like pathway.
        for i in range(0, self.num_input_levels - self.num_encoder_levels):
            caffe2_xavier_init(self.lateral_convs[i].conv, bias=0)
            caffe2_xavier_init(self.output_convs[i].conv, bias=0)

        caffe2_xavier_init(self.mask_feature, bias=0)

        normal_init(self.level_encoding, mean=0, std=1)
        # Only parameters with more than one dimension are re-initialized.
        for p in self.encoder.parameters():
            if p.dim() > 1:
                nn.init.xavier_normal_(p)

        # init_weights defined in MultiScaleDeformableAttention
        # (called last, after the generic xavier initialization above).
        for layer in self.encoder.layers:
            for attn in layer.attentions:
                if isinstance(attn, MultiScaleDeformableAttention):
                    attn.init_weights()

    def forward(self, feats):
        """
        Args:
            feats (list[Tensor]): Feature maps of each level. Each has
                shape of (batch_size, c, h, w).

        Returns:
            tuple[Tensor]: The mask feature of shape
            (batch_size, out_channels, h, w), followed by the first
            ``num_outs`` decoder outputs in reversed order (the outputs are
            produced starting from the last input level).
        """
        # generate padding mask for each level, for each image
        batch_size = feats[0].shape[0]
        encoder_input_list = []
        padding_mask_list = []
        level_positional_encoding_list = []
        spatial_shapes = []
        reference_points_list = []
        # i = 0 corresponds to the last input level (feats[-1]).
        for i in range(self.num_encoder_levels):
            level_idx = self.num_input_levels - i - 1
            feat = feats[level_idx]
            feat_projected = self.input_convs[i](feat)
            h, w = feat.shape[-2:]

            # no padding
            padding_mask_resized = feat.new_zeros(
                (batch_size, ) + feat.shape[-2:], dtype=torch.bool)
            pos_embed = self.postional_encoding(padding_mask_resized)
            level_embed = self.level_encoding.weight[i]
            level_pos_embed = level_embed.view(1, -1, 1, 1) + pos_embed
            # (h_i * w_i, 2)
            reference_points = self.point_generator.single_level_grid_priors(
                feat.shape[-2:], level_idx, device=feat.device)
            # normalize
            factor = feat.new_tensor([[w, h]]) * self.strides[level_idx]
            reference_points = reference_points / factor

            # shape (batch_size, c, h_i, w_i) -> (h_i * w_i, batch_size, c)
            feat_projected = feat_projected.flatten(2).permute(2, 0, 1)
            level_pos_embed = level_pos_embed.flatten(2).permute(2, 0, 1)
            padding_mask_resized = padding_mask_resized.flatten(1)

            encoder_input_list.append(feat_projected)
            padding_mask_list.append(padding_mask_resized)
            level_positional_encoding_list.append(level_pos_embed)
            spatial_shapes.append(feat.shape[-2:])
            reference_points_list.append(reference_points)
        # shape (batch_size, total_num_query),
        # total_num_query=sum([., h_i * w_i,.])
        padding_masks = torch.cat(padding_mask_list, dim=1)
        # shape (total_num_query, batch_size, c)
        encoder_inputs = torch.cat(encoder_input_list, dim=0)
        level_positional_encodings = torch.cat(
            level_positional_encoding_list, dim=0)
        device = encoder_inputs.device
        # shape (num_encoder_levels, 2), from low
        # resolution to high resolution
        spatial_shapes = torch.as_tensor(
            spatial_shapes, dtype=torch.long, device=device)
        # shape (0, h_0*w_0, h_0*w_0+h_1*w_1, ...)
        level_start_index = torch.cat((spatial_shapes.new_zeros(
            (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
        reference_points = torch.cat(reference_points_list, dim=0)
        # The same reference points are repeated for every image and level.
        reference_points = reference_points[None, :, None].repeat(
            batch_size, 1, self.num_encoder_levels, 1)
        # All ones, consistent with the "no padding" masks created above.
        valid_radios = reference_points.new_ones(
            (batch_size, self.num_encoder_levels, 2))
        # shape (num_total_query, batch_size, c)
        memory = self.encoder(
            query=encoder_inputs,
            key=None,
            value=None,
            query_pos=level_positional_encodings,
            key_pos=None,
            attn_masks=None,
            key_padding_mask=None,
            query_key_padding_mask=padding_masks,
            spatial_shapes=spatial_shapes,
            reference_points=reference_points,
            level_start_index=level_start_index,
            valid_radios=valid_radios)
        # (num_total_query, batch_size, c) -> (batch_size, c, num_total_query)
        memory = memory.permute(1, 2, 0)

        # from low resolution to high resolution
        # Split the flattened queries back into one feature map per level.
        num_query_per_level = [e[0] * e[1] for e in spatial_shapes]
        outs = torch.split(memory, num_query_per_level, dim=-1)
        outs = [
            x.reshape(batch_size, -1, spatial_shapes[i][0],
                      spatial_shapes[i][1]) for i, x in enumerate(outs)
        ]

        # Top-down pathway over the levels that bypassed the encoder.
        # NOTE(review): ``lateral_convs`` / ``output_convs`` were appended in
        # top-down order in ``__init__`` but are indexed here by the level
        # index ``i``; both orders coincide only when a single level
        # bypasses the encoder - confirm before using other configurations.
        for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1,
                       -1):
            x = feats[i]
            cur_feat = self.lateral_convs[i](x)
            y = cur_feat + F.interpolate(
                outs[-1],
                size=cur_feat.shape[-2:],
                mode='bilinear',
                align_corners=False)
            y = self.output_convs[i](y)
            outs.append(y)
        multi_scale_features = outs[:self.num_outs]

        # The mask feature is computed from the last entry of ``outs`` and,
        # after the reversal below, becomes the first returned element.
        mask_feature = self.mask_feature(outs[-1])
        multi_scale_features.append(mask_feature)
        multi_scale_features.reverse()
        return tuple(multi_scale_features)


================================================
FILE: knet/det/semantic_fpn_wrapper.py
================================================
import math


import torch
import torch.nn as nn
from torch.nn import init
from mmcv.cnn import ConvModule, normal_init
from mmdet.models.builder import NECKS, BACKBONES
from mmcv.cnn.bricks.transformer import build_positional_encoding
from mmdet.utils import get_root_logger
from mmcv.ops import DeformConv2dPack
from mmcv.runner import BaseModule
import torch.nn.functional as F


@NECKS.register_module()
class SemanticFPNWrapper(nn.Module):
    """Semantic FPN neck (as used in Panoptic FPN).

    Every selected input level is processed by its own stack of 3x3
    ``ConvModule`` layers, optionally interleaved with 2x bilinear upsampling
    (or, for level 0, built from stride-2 convolutions). The per-level
    results are fused by summation or channel concatenation and, optionally,
    projected by 1x1 prediction convolutions.

    Args:
        in_channels (int): Channels of every input feature map.
        feat_channels (int): Channels produced by the per-level conv stacks.
        out_channels (int): Channels produced by the 1x1 prediction convs.
        start_level (int): Index of the first input level to use.
        end_level (int): Index of the last input level to use (inclusive).
        cat_coors (bool, optional): Concatenate a 2-channel normalized
            coordinate map to the input of level ``cat_coors_level``.
            Defaults to False.
        positional_encoding (dict, optional): Config passed to
            ``build_positional_encoding``; when given, the encoding is added
            to the input of level ``cat_coors_level``. Defaults to None.
        cat_coors_level (int, optional): Level that receives the positional
            encoding and/or coordinate map. Defaults to 3.
        fuse_by_cat (bool, optional): Fuse levels by channel concatenation
            instead of summation. Defaults to False.
        return_list (bool, optional): Wrap the single output in a list.
            Defaults to False.
        upsample_times (int, optional): Level ``i`` gets a 2x upsample after
            its ``j``-th conv while ``j < upsample_times - (end_level - i)``.
            Defaults to 3.
        with_pred (bool, optional): Apply ``conv_pred`` to the fused feature.
            Defaults to True.
        num_aux_convs (int, optional): Number of additional 1x1 heads applied
            to the fused feature. Defaults to 0.
        act_cfg (dict, optional): Activation config of the per-level convs.
        out_act_cfg (dict, optional): Activation config of the 1x1 heads.
        conv_cfg (dict, optional): Conv config for all ``ConvModule`` layers.
            Defaults to None.
        norm_cfg (dict, optional): Norm config for all ``ConvModule`` layers.
            Defaults to None.
    """

    def __init__(self,
                 in_channels,
                 feat_channels,
                 out_channels,
                 start_level,
                 end_level,
                 cat_coors=False,
                 positional_encoding=None,
                 cat_coors_level=3,
                 fuse_by_cat=False,
                 return_list=False,
                 upsample_times=3,
                 with_pred=True,
                 num_aux_convs=0,
                 act_cfg=dict(type='ReLU', inplace=True),
                 out_act_cfg=dict(type='ReLU'),
                 conv_cfg=None,
                 norm_cfg=None):
        super(SemanticFPNWrapper, self).__init__()

        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.start_level = start_level
        self.end_level = end_level
        assert start_level >= 0 and end_level >= start_level
        self.out_channels = out_channels
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.cat_coors = cat_coors
        self.cat_coors_level = cat_coors_level
        self.fuse_by_cat = fuse_by_cat
        self.return_list = return_list
        self.upsample_times = upsample_times
        self.with_pred = with_pred
        if positional_encoding is not None:
            self.positional_encoding = build_positional_encoding(
                positional_encoding)
        else:
            self.positional_encoding = None

        self.convs_all_levels = nn.ModuleList()
        for i in range(self.start_level, self.end_level + 1):
            convs_per_level = nn.Sequential()
            # The first conv of a level sees the raw input, which carries two
            # extra channels when the coordinate map is concatenated to it.
            if i == self.cat_coors_level and self.cat_coors:
                chn = self.in_channels + 2
            else:
                chn = self.in_channels

            if i == 0:
                if upsample_times == self.end_level - i:
                    convs_per_level.add_module('conv' + str(i),
                                               self._make_conv(chn))
                else:
                    # Level 0 is brought down to the output resolution with
                    # stride-2 convs. Only the first one sees the raw input;
                    # any following conv consumes ``feat_channels``.
                    for k in range(self.end_level - upsample_times):
                        conv_in = chn if k == 0 else self.feat_channels
                        convs_per_level.add_module(
                            'conv' + str(k),
                            self._make_conv(conv_in, stride=2))
                self.convs_all_levels.append(convs_per_level)
                continue

            # Number of leading convs of this level followed by a 2x upsample.
            num_upsamples = upsample_times - (self.end_level - i)
            for j in range(i):
                conv_in = chn if j == 0 else self.feat_channels
                convs_per_level.add_module('conv' + str(j),
                                           self._make_conv(conv_in))
                if j < num_upsamples:
                    one_upsample = nn.Upsample(
                        scale_factor=2, mode='bilinear', align_corners=False)
                    convs_per_level.add_module('upsample' + str(j),
                                               one_upsample)

            self.convs_all_levels.append(convs_per_level)

        if fuse_by_cat:
            in_channels = self.feat_channels * len(self.convs_all_levels)
        else:
            in_channels = self.feat_channels

        if self.with_pred:
            self.conv_pred = ConvModule(
                in_channels,
                self.out_channels,
                1,
                padding=0,
                conv_cfg=self.conv_cfg,
                act_cfg=out_act_cfg,
                norm_cfg=self.norm_cfg)

        self.num_aux_convs = num_aux_convs
        self.aux_convs = nn.ModuleList()
        for _ in range(num_aux_convs):
            self.aux_convs.append(
                ConvModule(
                    in_channels,
                    self.out_channels,
                    1,
                    padding=0,
                    conv_cfg=self.conv_cfg,
                    act_cfg=out_act_cfg,
                    norm_cfg=self.norm_cfg))

    def _make_conv(self, in_channels, stride=1):
        """Build one 3x3 ``ConvModule`` mapping to ``feat_channels``."""
        return ConvModule(
            in_channels,
            self.feat_channels,
            3,
            padding=1,
            stride=stride,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg,
            inplace=False)

    def init_weights(self):
        """Initialize every ``nn.Conv2d`` with a normal distribution."""
        logger = get_root_logger()
        logger.info('Use normal intialization for semantic FPN')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                normal_init(m, std=0.01)

    def generate_coord(self, input_feat):
        """Return a coordinate map for ``input_feat``.

        The result has one x channel followed by one y channel, each ranging
        linearly from -1 to 1 over the last / second-to-last dimension of
        ``input_feat``, expanded over the batch dimension.
        """
        x_range = torch.linspace(
            -1, 1, input_feat.shape[-1], device=input_feat.device)
        y_range = torch.linspace(
            -1, 1, input_feat.shape[-2], device=input_feat.device)
        y, x = torch.meshgrid(y_range, x_range)
        y = y.expand([input_feat.shape[0], 1, -1, -1])
        x = x.expand([input_feat.shape[0], 1, -1, -1])
        coord_feat = torch.cat([x, y], 1)
        return coord_feat

    def forward(self, inputs):
        """Fuse the selected levels of ``inputs``.

        Args:
            inputs (Sequence[Tensor]): Multi-level features, indexed by level.

        Returns:
            Tensor | list[Tensor]: ``[out, aux_0, ...]`` when
            ``num_aux_convs > 0``; otherwise ``[out]`` if ``return_list`` is
            set, else ``out``.
        """
        mlvl_feats = []
        for i in range(self.start_level, self.end_level + 1):
            input_p = inputs[i]
            if i == self.cat_coors_level:
                if self.positional_encoding is not None:
                    ignore_mask = input_p.new_zeros(
                        (input_p.shape[0], input_p.shape[-2],
                         input_p.shape[-1]),
                        dtype=torch.bool)
                    positional_encoding = self.positional_encoding(ignore_mask)
                    input_p = input_p + positional_encoding
                if self.cat_coors:
                    coord_feat = self.generate_coord(input_p)
                    input_p = torch.cat([input_p, coord_feat], 1)

            # ``convs_all_levels`` only holds the levels from ``start_level``
            # on, so it is indexed relative to ``start_level``.
            level_convs = self.convs_all_levels[i - self.start_level]
            mlvl_feats.append(level_convs(input_p))

        if self.fuse_by_cat:
            feature_add_all_level = torch.cat(mlvl_feats, dim=1)
        else:
            feature_add_all_level = sum(mlvl_feats)

        if self.with_pred:
            out = self.conv_pred(feature_add_all_level)
        else:
            out = feature_add_all_level

        if self.num_aux_convs > 0:
            outs = [out]
            for conv in self.aux_convs:
                outs.append(conv(feature_add_all_level))
            return outs

        if self.return_list:
            return [out]
        else:
            return out


@NECKS.register_module()
class UperNetAlignHead(BaseModule):
    """Top-down feature fusion neck with flow-based feature alignment.

    All inputs except the last one go through a 1x1 lateral ``ConvModule``.
    Starting from the last input, the running top-down feature is warped
    onto each lateral feature by an alignment module, added to it, and
    refined by a 3x3 conv. All resulting maps are resized to the resolution
    of ``conv_out[1]``, summed, optionally offset by a positional encoding,
    and passed through a deformable convolution.

    Args:
        in_channels (list[int]): Channels of the inputs; a lateral conv is
            built for every entry except the last.
        out_channels (int): Channels of the lateral, refinement and output
            convolutions.
        align_types (str): ``"v1"`` uses ``AlignedModule``; any other value
            uses ``AlignedModulev2PoolingAtten``.
        conv3x3_type (str): Alignment modules are only created when this is
            ``'conv'``.
        positional_encoding (dict, optional): Config passed to
            ``build_positional_encoding``. When None, no positional encoding
            is added.
        cat_coors_level (int): Stored on the module; not read by ``forward``.
        feat_channels, start_level, end_level, upsample_times, cat_coors,
        fuse_by_cat, return_list, num_aux_convs, norm_cfg: Accepted for
            config compatibility; not used by this implementation.
    """

    def __init__(self, in_channels=[256, 512, 1024, 2048], out_channels=256, feat_channels=256, align_types="v1",
                 start_level=1, end_level=3, conv3x3_type="conv", positional_encoding=None, cat_coors_level=3,
                 upsample_times=2, cat_coors=False, fuse_by_cat=False, return_list=False,
                 num_aux_convs=1, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)):
        super(UperNetAlignHead, self).__init__()

        if positional_encoding is not None:
            self.positional_encoding = build_positional_encoding(
                positional_encoding)
        else:
            self.positional_encoding = None

        self.cat_coors_level = cat_coors_level
        self.align_types = align_types

        # The fused feature fed to the DCN has ``out_channels`` channels.
        self.dcn = DeformConv2dPack(
            in_channels=out_channels, out_channels=out_channels, kernel_size=3, padding=1)

        lateral_convs = []
        for fpn_inplane in in_channels[:-1]:
            lateral_convs.append(
                ConvModule(fpn_inplane, out_channels, kernel_size=1, norm_cfg=dict(type='BN2d'),
                           act_cfg=dict(type='ReLU'),
                           inplace=False)
            )
        self.fpn_in = nn.ModuleList(lateral_convs)

        fpn_out = []
        fpn_out_align = []
        self.dsn = []
        for _ in range(len(in_channels) - 1):
            fpn_out.append(
                ConvModule(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1,
                           norm_cfg=dict(type='BN2d')))

            if conv3x3_type == 'conv':
                if self.align_types == "v1":
                    fpn_out_align.append(
                        AlignedModule(inplane=out_channels, outplane=out_channels // 2)
                    )
                else:
                    fpn_out_align.append(
                        AlignedModulev2PoolingAtten(inplane=out_channels, outplane=out_channels // 2)
                    )

        # Register the collected layers once, after the loop.
        self.fpn_out = nn.ModuleList(fpn_out)
        self.fpn_out_align = nn.ModuleList(fpn_out_align)

    def forward(self, conv_out):
        """Fuse ``conv_out`` into a single feature map.

        Args:
            conv_out (Sequence[Tensor]): Input features. The last entry is
                used without a lateral conv, so it must already have
                ``out_channels`` channels. At least two entries are needed
                because the output resolution is taken from ``conv_out[1]``.

        Returns:
            Tensor: Output of the deformable conv on the fused feature.
        """
        f = conv_out[-1]
        fpn_feature_list = [f]
        for i in reversed(range(len(conv_out) - 1)):
            conv_x = self.fpn_in[i](conv_out[i])
            # Warp the coarser running feature onto the lateral feature.
            f = self.fpn_out_align[i]([conv_x, f])
            f = conv_x + f
            fpn_feature_list.append(self.fpn_out[i](f))

        output_size = conv_out[1].size()[2:]
        x = None
        for feat in fpn_feature_list:
            resized = nn.functional.interpolate(
                feat,
                output_size,
                mode='bilinear', align_corners=True)
            x = resized if x is None else x + resized

        # Add position encodings only when one was configured; the default
        # ``positional_encoding=None`` must not be called.
        if self.positional_encoding is not None:
            ignore_mask = x.new_zeros(
                (x.shape[0], x.shape[-2], x.shape[-1]),
                dtype=torch.bool)
            positional_encoding = self.positional_encoding(ignore_mask)
            x = x + positional_encoding

        return self.dcn(x)


class AlignedModule(nn.Module):
    """Warp a coarse feature map onto a finer one with a predicted flow.

    Args:
        inplane (int): Channels of both input feature maps.
        outplane (int): Channels of the reduced features used to predict the
            flow field.
        kernel_size (int): Kernel size of the flow-prediction conv. It is
            padded with ``kernel_size // 2`` so that, for odd kernels, the
            flow keeps the spatial size of the low-level feature.
    """

    def __init__(self, inplane, outplane, kernel_size=3):
        super(AlignedModule, self).__init__()
        self.down_h = nn.Conv2d(inplane, outplane, 1, bias=False)
        self.down_l = nn.Conv2d(inplane, outplane, 1, bias=False)
        self.flow_make = nn.Conv2d(
            outplane * 2, 2, kernel_size=kernel_size,
            padding=kernel_size // 2, bias=False)

    def forward(self, x):
        """Align the second feature of ``x`` to the first.

        Args:
            x (Sequence[Tensor]): ``[low_feature, h_feature]``.

        Returns:
            Tensor: ``h_feature`` resampled to the spatial size of
            ``low_feature``, keeping its channel count.
        """
        low_feature, h_feature = x
        h_feature_orign = h_feature
        size = tuple(low_feature.size()[2:])
        low_feature = self.down_l(low_feature)
        h_feature = self.down_h(h_feature)
        h_feature = F.interpolate(h_feature, size=size, mode="bilinear", align_corners=True)
        flow = self.flow_make(torch.cat([h_feature, low_feature], 1))
        h_feature = self.flow_warp(h_feature_orign, flow, size=size)

        return h_feature

    def flow_warp(self, input, flow, size):
        """Sample ``input`` on a regular grid displaced by ``flow``.

        Args:
            input (Tensor): Feature map to sample from, ``(n, c, h, w)``.
            flow (Tensor): ``(n, 2, out_h, out_w)`` offsets; channel 0 is
                divided by ``out_w`` and channel 1 by ``out_h`` before being
                added to the normalized ``[-1, 1]`` sampling grid.
            size (tuple[int, int]): ``(out_h, out_w)`` of the output.

        Returns:
            Tensor: Sampled features of spatial size ``size``.
        """
        out_h, out_w = size

        norm = input.new_tensor([out_w, out_h])
        # Build the base grid directly on the input's device instead of
        # creating it on the CPU and copying it over on every call.
        ys = torch.linspace(-1.0, 1.0, out_h, device=input.device)
        xs = torch.linspace(-1.0, 1.0, out_w, device=input.device)
        ys = ys.view(-1, 1).expand(out_h, out_w)
        xs = xs.view(1, -1).expand(out_h, out_w)
        # grid_sample expects (x, y) order in the last dimension.
        base_grid = torch.stack((xs, ys), dim=2).type_as(input)
        grid = base_grid.unsqueeze(0) + flow.permute(0, 2, 3, 1) / norm

        output = F.grid_sample(input, grid, align_corners=True)
        return output


class AlignedModulev2PoolingAtten(nn.Module):
    """Flow-based alignment of two feature maps with a gated fusion.

    Two flow fields are predicted: one warps the high-level feature and one
    warps the low-level feature. The two warped features are blended by a
    sigmoid gate computed from channel-wise mean and max statistics.

    Args:
        inplane (int): Channels of both input feature maps.
        outplane (int): Channels of the reduced features used to predict the
            flow fields.
        kernel_size (int): Kernel size of the flow and gate convs. Both are
            padded with ``kernel_size // 2`` so that, for odd kernels, the
            spatial size is preserved.
    """

    def __init__(self, inplane, outplane, kernel_size=3):
        super(AlignedModulev2PoolingAtten, self).__init__()
        same_pad = kernel_size // 2
        self.down_h = nn.Conv2d(inplane, outplane, 1, bias=False)
        self.down_l = nn.Conv2d(inplane, outplane, 1, bias=False)
        self.flow_make = nn.Conv2d(outplane * 2, 4, kernel_size=kernel_size, padding=same_pad, bias=False)
        self.flow_gate = nn.Sequential(
            nn.Conv2d(4, 1, kernel_size=kernel_size, padding=same_pad, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Align and fuse the two features of ``x``.

        Args:
            x (Sequence[Tensor]): ``[low_feature, h_feature]``.

        Returns:
            Tensor: Gated blend of the warped high-level and warped low-level
            features at the spatial size of ``low_feature``.
        """
        low_feature, h_feature = x
        h_feature_orign = h_feature
        size = tuple(low_feature.size()[2:])
        l_feature = self.down_l(low_feature)
        h_feature = self.down_h(h_feature)
        # F.interpolate replaces the deprecated F.upsample alias.
        h_feature = F.interpolate(h_feature, size=size, mode="bilinear", align_corners=True)

        flow = self.flow_make(torch.cat([h_feature, l_feature], 1))
        # First two channels warp the high-level feature, last two the
        # low-level one.
        flow_up, flow_down = flow[:, :2, :, :], flow[:, 2:, :, :]

        h_feature_warp = self.flow_warp(h_feature_orign, flow_up, size=size)
        l_feature_warp = self.flow_warp(low_feature, flow_down, size=size)

        # Statistics use the reduced, resized high-level feature and the raw
        # low-level input.
        h_feature_mean = torch.mean(h_feature, dim=1).unsqueeze(1)
        l_feature_mean = torch.mean(low_feature, dim=1).unsqueeze(1)
        h_feature_max = torch.max(h_feature, dim=1)[0].unsqueeze(1)
        l_feature_max = torch.max(low_feature, dim=1)[0].unsqueeze(1)

        flow_gates = self.flow_gate(torch.cat([h_feature_mean, l_feature_mean, h_feature_max, l_feature_max], 1))

        fuse_feature = h_feature_warp * flow_gates + l_feature_warp * (1 - flow_gates)

        return fuse_feature

    def flow_warp(self, input, flow, size):
        """Sample ``input`` on a regular grid displaced by ``flow``.

        Args:
            input (Tensor): Feature map to sample from, ``(n, c, h, w)``.
            flow (Tensor): ``(n, 2, out_h, out_w)`` offsets; channel 0 is
                divided by ``out_w`` and channel 1 by ``out_h`` before being
                added to the normalized ``[-1, 1]`` sampling grid.
            size (tuple[int, int]): ``(out_h, out_w)`` of the output.

        Returns:
            Tensor: Sampled features of spatial size ``size``.
        """
        out_h, out_w = size

        norm = input.new_tensor([out_w, out_h])
        # Build the base grid directly on the input's device instead of
        # creating it on the CPU and copying it over on every call.
        ys = torch.linspace(-1.0, 1.0, out_h, device=input.device)
        xs = torch.linspace(-1.0, 1.0, out_w, device=input.device)
        ys = ys.view(-1, 1).expand(out_h, out_w)
        xs = xs.view(1, -1).expand(out_h, out_w)
        # grid_sample expects (x, y) order in the last dimension.
        base_grid = torch.stack((xs, ys), dim=2).type_as(input)
        grid = base_grid.unsqueeze(0) + flow.permute(0, 2, 3, 1) / norm

        output = F.grid_sample(input, grid, align_corners=True)
        return output


@BACKBONES.register_module()
class STDCNet1446(nn.Module):
    """STDC backbone returning features at strides 4, 8, 16 and 32.

    The full classification network (feature stages plus classifier head)
    is built first so that a classification checkpoint can be loaded with
    matching keys; afterwards the head modules are dropped and only the
    stage views ``x2`` ... ``x32`` remain.

    Args:
        base (int): Base channel width.
        layers (list[int]): Number of bottleneck blocks in each of the three
            stages following the two-conv stem.
        block_num (int): Number of convs inside every bottleneck block.
        type (str): ``"cat"`` for ``CatBottleneck`` or ``"add"`` for
            ``AddBottleneck``.
        num_classes (int): Output size of the (discarded) classifier.
        dropout (float): Dropout probability of the (discarded) classifier.
        pretrain_model (str): Checkpoint path loaded at construction time;
            when falsy, ``init_params`` is used instead.
        use_conv_last (bool): Apply ``conv_last`` to the stride-32 feature.
        norm_layer: Norm layer class passed to the bottleneck blocks.

    Raises:
        ValueError: If ``type`` is neither ``"cat"`` nor ``"add"``.
    """

    def __init__(self, base=64, layers=[4, 5, 3], block_num=4, type="cat", num_classes=1000, dropout=0.20,
                 pretrain_model='./pretrained_models/STDCNet1446_76.47.tar',
                 use_conv_last=False, norm_layer=nn.SyncBatchNorm, ):
        super(STDCNet1446, self).__init__()
        if type == "cat":
            block = CatBottleneck
        elif type == "add":
            block = AddBottleneck
        else:
            raise ValueError(
                'type must be "cat" or "add", but got {!r}'.format(type))
        self.use_conv_last = use_conv_last
        self.features = self._make_layers(base, layers, block_num, block, norm_layer)
        self.conv_last = ConvX(base * 16, max(1024, base * 16), 1, 1)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(max(1024, base * 16), max(1024, base * 16), bias=False)
        self.bn = nn.BatchNorm1d(max(1024, base * 16))
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(max(1024, base * 16), num_classes, bias=False)

        # Stage boundaries inside ``features``: two stem convs, then
        # ``layers[k]`` blocks per stage. Derived from ``layers`` so that
        # non-default depths are split at the right places.
        stage8_end = 2 + layers[0]
        stage16_end = stage8_end + layers[1]
        self.x2 = nn.Sequential(self.features[:1])
        self.x4 = nn.Sequential(self.features[1:2])
        self.x8 = nn.Sequential(self.features[2:stage8_end])
        self.x16 = nn.Sequential(self.features[stage8_end:stage16_end])
        self.x32 = nn.Sequential(self.features[stage16_end:])

        if pretrain_model:
            print('use pretrain model {}'.format(pretrain_model))
            self.init_weight(pretrain_model)
        else:
            self.init_params()

        # Drop the modules that were only needed to match checkpoint keys.
        # The stage views above keep the feature blocks alive.
        self.features = None
        if not self.use_conv_last:
            # ``forward`` still needs conv_last when use_conv_last is set.
            self.conv_last = None
        self.gap = None
        self.fc = None
        self.bn = None
        self.relu = None
        self.dropout = None
        self.linear = None

    def init_weight(self, pretrain_model):
        """Load the ``"state_dict"`` entry of a checkpoint into the model."""
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        state_dict = torch.load(pretrain_model, map_location='cpu')["state_dict"]
        self_state_dict = self.state_dict()
        self_state_dict.update(state_dict)
        self.load_state_dict(self_state_dict)

    def init_params(self):
        """Initialize conv, BatchNorm2d and linear layers from scratch."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def _make_layers(self, base, layers, block_num, block, norm_layer):
        """Build the stem and all bottleneck stages as one ``nn.Sequential``."""
        features = []
        # Stem: two stride-2 convs. NOTE(review): these use ConvX's default
        # norm layer, not ``norm_layer``.
        features += [ConvX(3, base // 2, 3, 2)]
        features += [ConvX(base // 2, base, 3, 2)]

        for i, layer in enumerate(layers):
            stage_out = base * (2 ** (i + 2))
            for j in range(layer):
                if i == 0 and j == 0:
                    features.append(block(base, base * 4, block_num, 2, norm_layer=norm_layer))
                elif j == 0:
                    # First block of a later stage doubles the channels and
                    # halves the resolution.
                    features.append(block(base * (2 ** (i + 1)), stage_out, block_num, 2,
                                          norm_layer=norm_layer))
                else:
                    features.append(block(stage_out, stage_out, block_num, 1,
                                          norm_layer=norm_layer))

        return nn.Sequential(*features)

    def forward(self, x):
        """Return the features of strides 4, 8, 16 and 32 as a tuple."""
        feat2 = self.x2(x)
        feat4 = self.x4(feat2)
        feat8 = self.x8(feat4)
        feat16 = self.x16(feat8)
        feat32 = self.x32(feat16)
        if self.use_conv_last:
            feat32 = self.conv_last(feat32)

        return feat4, feat8, feat16, feat32


@BACKBONES.register_module()
class STDCNet813(nn.Module):
    """STDC backbone returning features at strides 4, 8, 16 and 32.

    The full classification network (feature stages plus classifier head)
    is built first so that a classification checkpoint can be loaded with
    matching keys; afterwards the head modules are dropped and only the
    stage views ``x2`` ... ``x32`` remain.

    Args:
        base (int): Base channel width.
        layers (list[int]): Number of bottleneck blocks in each of the three
            stages following the two-conv stem.
        block_num (int): Number of convs inside every bottleneck block.
        type (str): ``"cat"`` for ``CatBottleneck`` or ``"add"`` for
            ``AddBottleneck``.
        num_classes (int): Output size of the (discarded) classifier.
        dropout (float): Dropout probability of the (discarded) classifier.
        pretrain_model (str): Checkpoint path loaded at construction time;
            when falsy, ``init_params`` is used instead.
        use_conv_last (bool): Apply ``conv_last`` to the stride-32 feature.
        norm_layer: Norm layer class passed to the bottleneck blocks.

    Raises:
        ValueError: If ``type`` is neither ``"cat"`` nor ``"add"``.
    """

    def __init__(self, base=64, layers=[2, 2, 2], block_num=4, type="cat", num_classes=1000, dropout=0.20,
                 pretrain_model='./pretrained_models/STDCNet813_73.91.tar',
                 use_conv_last=False, norm_layer=nn.BatchNorm2d):
        super(STDCNet813, self).__init__()
        if type == "cat":
            block = CatBottleneck
        elif type == "add":
            block = AddBottleneck
        else:
            raise ValueError(
                'type must be "cat" or "add", but got {!r}'.format(type))
        self.use_conv_last = use_conv_last
        self.features = self._make_layers(base, layers, block_num, block, norm_layer)
        self.conv_last = ConvX(base * 16, max(1024, base * 16), 1, 1)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(max(1024, base * 16), max(1024, base * 16), bias=False)
        self.bn = nn.BatchNorm1d(max(1024, base * 16))
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(max(1024, base * 16), num_classes, bias=False)

        # Stage boundaries inside ``features``: two stem convs, then
        # ``layers[k]`` blocks per stage. Derived from ``layers`` so that
        # non-default depths are split at the right places.
        stage8_end = 2 + layers[0]
        stage16_end = stage8_end + layers[1]
        self.x2 = nn.Sequential(self.features[:1])
        self.x4 = nn.Sequential(self.features[1:2])
        self.x8 = nn.Sequential(self.features[2:stage8_end])
        self.x16 = nn.Sequential(self.features[stage8_end:stage16_end])
        self.x32 = nn.Sequential(self.features[stage16_end:])

        if pretrain_model:
            print('use pretrain model {}'.format(pretrain_model))
            self.init_weight(pretrain_model)
        else:
            self.init_params()

        # Drop the modules that were only needed to match checkpoint keys.
        # The stage views above keep the feature blocks alive.
        self.features = None
        if not self.use_conv_last:
            # ``forward`` still needs conv_last when use_conv_last is set.
            self.conv_last = None
        self.gap = None
        self.fc = None
        self.bn = None
        self.relu = None
        self.dropout = None
        self.linear = None

    def init_weight(self, pretrain_model):
        """Load the ``"state_dict"`` entry of a checkpoint into the model."""
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        state_dict = torch.load(pretrain_model, map_location='cpu')["state_dict"]
        self_state_dict = self.state_dict()
        self_state_dict.update(state_dict)
        self.load_state_dict(self_state_dict)

    def init_params(self):
        """Initialize conv, BatchNorm2d and linear layers from scratch."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def _make_layers(self, base, layers, block_num, block, norm_layer):
        """Build the stem and all bottleneck stages as one ``nn.Sequential``."""
        features = []
        # Stem: two stride-2 convs. NOTE(review): these use ConvX's default
        # norm layer, not ``norm_layer``.
        features += [ConvX(3, base // 2, 3, 2)]
        features += [ConvX(base // 2, base, 3, 2)]

        for i, layer in enumerate(layers):
            stage_out = base * (2 ** (i + 2))
            for j in range(layer):
                if i == 0 and j == 0:
                    features.append(block(base, base * 4, block_num, 2, norm_layer=norm_layer))
                elif j == 0:
                    # First block of a later stage doubles the channels and
                    # halves the resolution.
                    features.append(block(base * (2 ** (i + 1)), stage_out, block_num, 2,
                                          norm_layer=norm_layer))
                else:
                    features.append(block(stage_out, stage_out, block_num, 1,
                                          norm_layer=norm_layer))

        return nn.Sequential(*features)

    def forward(self, x):
        """Return the features of strides 4, 8, 16 and 32 as a tuple."""
        feat2 = self.x2(x)
        feat4 = self.x4(feat2)
        feat8 = self.x8(feat4)
        feat16 = self.x16(feat8)
        feat32 = self.x32(feat16)
        if self.use_conv_last:
            feat32 = self.conv_last(feat32)

        return feat4, feat8, feat16, feat32


class AddBottleneck(nn.Module):
    """STDC bottleneck whose concatenated branch outputs are added to a skip.

    The block chains ``block_num`` ``ConvX`` layers with progressively
    halved widths, concatenates all their outputs along the channel
    dimension and adds the (optionally downsampled and projected) input.

    Args:
        in_planes (int): Input channels.
        out_planes (int): Output channels (sum of all branch widths).
        block_num (int): Number of chained convs; must be larger than 1.
        stride (int): With ``2``, a depthwise stride-2 conv follows the first
            conv and the skip path is downsampled and projected to
            ``out_planes`` channels.
        norm_layer: Norm layer class used in the downsampling and skip paths.
    """

    def __init__(self, in_planes, out_planes, block_num=3, stride=1, norm_layer=nn.BatchNorm2d):
        super(AddBottleneck, self).__init__()
        # The message must be a string; the previous ``print(...)`` call
        # evaluated to None and left the AssertionError without a message.
        assert block_num > 1, "block number should be larger than 1."
        self.conv_list = nn.ModuleList()
        self.stride = stride
        if stride == 2:
            self.avd_layer = nn.Sequential(
                nn.Conv2d(out_planes // 2, out_planes // 2, kernel_size=3, stride=2, padding=1, groups=out_planes // 2,
                          bias=False),
                norm_layer(out_planes // 2),
            )
            self.skip = nn.Sequential(
                nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=2, padding=1, groups=in_planes, bias=False),
                norm_layer(in_planes),
                nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False),
                norm_layer(out_planes),
            )
            # Downsampling is done by avd_layer, so the convs keep stride 1.
            stride = 1

        for idx in range(block_num):
            if idx == 0:
                self.conv_list.append(ConvX(in_planes, out_planes // 2, kernel=1))
            elif idx == 1 and block_num == 2:
                self.conv_list.append(ConvX(out_planes // 2, out_planes // 2, stride=stride))
            elif idx == 1 and block_num > 2:
                self.conv_list.append(ConvX(out_planes // 2, out_planes // 4, stride=stride))
            elif idx < block_num - 1:
                self.conv_list.append(
                    ConvX(out_planes // (2 ** idx), out_planes // (2 ** (idx + 1))))
            else:
                # The last conv keeps its width so the branch widths sum to
                # ``out_planes``.
                self.conv_list.append(ConvX(out_planes // (2 ** idx), out_planes // (2 ** idx)))

    def forward(self, x):
        """Return the concatenated branch outputs plus the skip connection."""
        out_list = []
        out = x

        for idx, conv in enumerate(self.conv_list):
            if idx == 0 and self.stride == 2:
                out = self.avd_layer(conv(out))
            else:
                out = conv(out)
            out_list.append(out)

        if self.stride == 2:
            x = self.skip(x)

        return torch.cat(out_list, dim=1) + x


class CatBottleneck(nn.Module):
    """STDC bottleneck that concatenates the outputs of all its convs.

    The block chains ``block_num`` ``ConvX`` layers with progressively
    halved widths and concatenates the output of the first conv (average
    pooled when ``stride == 2``) with the outputs of all following convs.

    Args:
        in_planes (int): Input channels.
        out_planes (int): Output channels (sum of all branch widths).
        block_num (int): Number of chained convs; must be larger than 1.
        stride (int): With ``2``, a depthwise stride-2 conv follows the first
            conv and the first branch is average pooled with stride 2.
        norm_layer: Norm layer class used in the downsampling path.
    """

    def __init__(self, in_planes, out_planes, block_num=3, stride=1, norm_layer=nn.BatchNorm2d):
        super(CatBottleneck, self).__init__()
        # The message must be a string; the previous ``print(...)`` call
        # evaluated to None and left the AssertionError without a message.
        assert block_num > 1, "block number should be larger than 1."
        self.conv_list = nn.ModuleList()
        self.stride = stride
        if stride == 2:
            self.avd_layer = nn.Sequential(
                nn.Conv2d(out_planes // 2, out_planes // 2, kernel_size=3, stride=2, padding=1, groups=out_planes // 2,
                          bias=False),
                norm_layer(out_planes // 2),
            )
            self.skip = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
            # Downsampling is done by avd_layer, so the convs keep stride 1.
            stride = 1

        for idx in range(block_num):
            if idx == 0:
                self.conv_list.append(ConvX(in_planes, out_planes // 2, kernel=1))
            elif idx == 1 and block_num == 2:
                self.conv_list.append(ConvX(out_planes // 2, out_planes // 2, stride=stride))
            elif idx == 1 and block_num > 2:
                self.conv_list.append(ConvX(out_planes // 2, out_planes // 4, stride=stride))
            elif idx < block_num - 1:
                self.conv_list.append(
                    ConvX(out_planes // (2 ** idx), out_planes // (2 ** (idx + 1))))
            else:
                # The last conv keeps its width so the branch widths sum to
                # ``out_planes``.
                self.conv_list.append(ConvX(out_planes // (2 ** idx), out_planes // (2 ** idx)))

    def forward(self, x):
        """Return the channel-wise concatenation of all branch outputs."""
        out_list = []
        out1 = self.conv_list[0](x)

        for idx, conv in enumerate(self.conv_list[1:]):
            if idx == 0:
                if self.stride == 2:
                    out = conv(self.avd_layer(out1))
                else:
                    out = conv(out1)
            else:
                out = conv(out)
            out_list.append(out)

        if self.stride == 2:
            # Match the first branch to the downsampled resolution.
            out1 = self.skip(out1)
        out_list.insert(0, out1)

        out = torch.cat(out_list, dim=1)
        return out


class ConvX(nn.Module):
    """Bias-free convolution followed by a norm layer and an in-place ReLU.

    Args:
        in_planes (int): Input channels.
        out_planes (int): Output channels.
        kernel (int): Convolution kernel size; padding is ``kernel // 2``.
        stride (int): Convolution stride.
        norm_layer: Norm layer class, instantiated with ``out_planes``.
    """

    def __init__(self, in_planes, out_planes, kernel=3, stride=1, norm_layer=nn.BatchNorm2d):
        super().__init__()
        half_kernel = kernel // 2
        self.conv = nn.Conv2d(
            in_planes, out_planes, kernel, stride, half_kernel, bias=False)
        self.bn = norm_layer(out_planes)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply conv, norm and ReLU to ``x`` in that order."""
        feat = self.conv(x)
        feat = self.bn(feat)
        return self.relu(feat)

================================================
FILE: knet/det/utils.py
================================================
from typing import List

import torch
import torch.nn.functional as F
from mmdet.utils import get_root_logger


def sem2ins_masks(gt_sem_seg,
                  ignore_label=255,
                  label_shift=80,
                  thing_label_in_seg=0):
    """Turn a semantic segmentation map into per-class labels and masks.

    Every class id present in ``gt_sem_seg`` yields one binary mask, except
    ``ignore_label`` and ``thing_label_in_seg``, which are skipped. Label 0
    is the special thing class in the semantic map, so the remaining ids are
    shifted by ``label_shift - 1``.

    Args:
        gt_sem_seg (Tensor): Semantic map; the masks are concatenated along
            its first dimension.
        ignore_label (int): Class id to skip.
        label_shift (int): Offset added (minus one) to the kept class ids.
        thing_label_in_seg (int): Class id of the special thing class.

    Returns:
        tuple[Tensor, Tensor]: Labels as ``long`` and masks as ``float``;
        both are empty when no class is kept.
    """
    kept_ids = [
        cls_id for cls_id in torch.unique(gt_sem_seg)
        if not (cls_id == ignore_label or cls_id == thing_label_in_seg)
    ]

    if not kept_ids:
        empty_labels = gt_sem_seg.new_zeros(size=[0])
        empty_masks = gt_sem_seg.new_zeros(
            size=[0, gt_sem_seg.shape[-2], gt_sem_seg.shape[-1]])
        return empty_labels.long(), empty_masks.float()

    labels = torch.stack(kept_ids) + label_shift - 1
    masks = torch.cat([gt_sem_seg == cls_id for cls_id in kept_ids])
    return labels.long(), masks.float()


def sem2ins_masks_cityscapes(gt_sem_seg,
                             ignore_label=255,
                             label_shift=8,
                             thing_label_in_seg=list(range(11, 19))):
    """Turn a Cityscapes semantic map into per-class labels and masks.

    Every class id present in ``gt_sem_seg`` yields one binary mask, except
    ``ignore_label`` and the ids listed in ``thing_label_in_seg``. The kept
    ids are shifted by ``label_shift`` so that they follow the thing classes
    in the joint label space.

    Args:
        gt_sem_seg (Tensor): Semantic map; the masks are concatenated along
            its first dimension.
        ignore_label (int): Class id to skip.
        label_shift (int): Offset added to the kept class ids.
        thing_label_in_seg (Sequence[int]): Thing class ids to skip.

    Returns:
        tuple[Tensor, Tensor]: Labels as ``long`` and masks as ``float``;
        both are empty when no class is kept.
    """
    kept_ids = [
        cls_id for cls_id in torch.unique(gt_sem_seg)
        if not (cls_id == ignore_label or cls_id in thing_label_in_seg)
    ]

    if not kept_ids:
        empty_labels = gt_sem_seg.new_zeros(size=[0])
        empty_masks = gt_sem_seg.new_zeros(
            size=[0, gt_sem_seg.shape[-2], gt_sem_seg.shape[-1]])
        return empty_labels.long(), empty_masks.float()

    labels = torch.stack(kept_ids) + label_shift
    masks = torch.cat([gt_sem_seg == cls_id for cls_id in kept_ids])
    return labels.long(), masks.float()


def sem2ins_masks_kitti_step(gt_sem_seg,
                             ignore_label=255,
                             label_shift=2,
                             thing_label_in_seg=(11,13)):
    """Turn a KITTI-STEP semantic map into per-class labels and masks.

    Every class id present in ``gt_sem_seg`` yields one binary mask, except
    ``ignore_label`` and the ids listed in ``thing_label_in_seg``. Each kept
    id is first lowered by the number of thing ids smaller than it, which
    closes the gaps left by the removed thing classes, and then shifted by
    ``label_shift``.

    Args:
        gt_sem_seg (Tensor): Semantic map; the masks are concatenated along
            its first dimension.
        ignore_label (int): Class id to skip.
        label_shift (int): Offset added to the compacted class ids.
        thing_label_in_seg (Sequence[int]): Thing class ids to skip.

    Returns:
        tuple[Tensor, Tensor]: Labels as ``long`` and masks as ``float``;
        both are empty when no class is kept.
    """
    compact_ids = []
    binary_masks = []
    for cls_id in torch.unique(gt_sem_seg):
        if cls_id == ignore_label or cls_id in thing_label_in_seg:
            continue
        # Count the thing ids that precede this class id.
        num_things_below = sum(
            1 for thing_label in thing_label_in_seg if cls_id > thing_label)
        compact_ids.append(cls_id - num_things_below)
        binary_masks.append(gt_sem_seg == cls_id)

    if not compact_ids:
        empty_labels = gt_sem_seg.new_zeros(size=[0])
        empty_masks = gt_sem_seg.new_zeros(
            size=[0, gt_sem_seg.shape[-2], gt_sem_seg.shape[-1]])
        return empty_labels.long(), empty_masks.float()

    labels = torch.stack(compact_ids) + label_shift
    masks = torch.cat(binary_masks)
    return labels.long(), masks.float()

================================================
FILE: knet/kernel_updator.py
================================================
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_activation_layer, build_norm_layer
from mmcv.cnn.bricks.transformer import TRANSFORMER_LAYER


@TRANSFORMER_LAYER.register_module()
class KernelUpdator(nn.Module):
    """Gated fusion of an update feature into a set of input features.

    ``forward`` projects both inputs with linear layers, derives two gates
    from the element-wise product of the projections and returns a gated sum
    of the projections, followed by a linear layer, a norm layer and the
    activation.

    Args:
        in_channels (int): Size of the last dimension of both inputs.
        feat_channels (int): Width of the intermediate projections and gates.
        out_channels (int | None): Size of the last output dimension;
            ``in_channels`` is used when this is ``None`` (or 0).
        input_feat_shape (int | Sequence[int]): Stored on the instance (an
            int is expanded to a pair); not used by the computation here.
        gate_sigmoid (bool): Apply a sigmoid to both gates.
        gate_norm_act (bool): Normalize and activate the gate features
            before the gates are computed.
        activate_out (bool): Apply the activation to both normalized
            projections before they are mixed.
        act_cfg (dict): Config handed to ``build_activation_layer``.
        norm_cfg (dict): Config handed to ``build_norm_layer``.
    """

    def __init__(self,
                 in_channels=256,
                 feat_channels=64,
                 out_channels=None,
                 input_feat_shape=3,
                 gate_sigmoid=True,
                 gate_norm_act=False,
                 activate_out=False,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN')):
        super(KernelUpdator, self).__init__()
        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.out_channels_raw = out_channels
        self.gate_sigmoid = gate_sigmoid
        self.gate_norm_act = gate_norm_act
        self.activate_out = activate_out
        if isinstance(input_feat_shape, int):
            input_feat_shape = [input_feat_shape] * 2
        self.input_feat_shape = input_feat_shape
        self.act_cfg = act_cfg
        self.norm_cfg = norm_cfg
        self.out_channels = out_channels if out_channels else in_channels

        self.num_params_in = self.feat_channels
        self.num_params_out = self.feat_channels
        # Both projections emit an "in" half (used for the gates) and an
        # "out" half (used for the mixed result), each feat_channels wide.
        self.dynamic_layer = nn.Linear(
            self.in_channels, self.num_params_in + self.num_params_out)
        # The third positional argument of nn.Linear is ``bias``; it is now
        # spelled out instead of being passed as a bare ``1``.
        self.input_layer = nn.Linear(
            self.in_channels,
            self.num_params_in + self.num_params_out,
            bias=True)
        # The gates consume ``gate_feats``, whose last dimension is
        # feat_channels, so that is their input width. This is identical to
        # the previous in_channels whenever the two widths are equal, which
        # was the only configuration that could run before.
        self.input_gate = nn.Linear(
            self.feat_channels, self.feat_channels, bias=True)
        self.update_gate = nn.Linear(
            self.feat_channels, self.feat_channels, bias=True)
        if self.gate_norm_act:
            self.gate_norm = build_norm_layer(norm_cfg, self.feat_channels)[1]

        self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1]
        self.norm_out = build_norm_layer(norm_cfg, self.feat_channels)[1]
        self.input_norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1]
        self.input_norm_out = build_norm_layer(norm_cfg, self.feat_channels)[1]

        self.activation = build_activation_layer(act_cfg)

        self.fc_layer = nn.Linear(
            self.feat_channels, self.out_channels, bias=True)
        self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1]

    def forward(self, update_feature, input_feature):
        """Fuse ``update_feature`` into ``input_feature``.

        Args:
            update_feature (Tensor): Reshaped to ``(-1, in_channels)``; the
                resulting number of rows is the number of proposals.
            input_feature (Tensor): Reshaped to
                ``(num_proposals, -1, in_channels)``.

        Returns:
            Tensor: Shape ``(num_proposals, K, out_channels)``, where ``K``
            is the middle dimension inferred when reshaping
            ``input_feature``.
        """
        update_feature = update_feature.reshape(-1, self.in_channels)
        num_proposals = update_feature.size(0)
        parameters = self.dynamic_layer(update_feature)
        param_in = parameters[:, :self.num_params_in].view(
            -1, self.feat_channels)
        param_out = parameters[:, -self.num_params_out:].view(
            -1, self.feat_channels)

        # ``input_layer`` expects in_channels features, so the reshape has
        # to keep in_channels (not feat_channels) as the last dimension.
        input_feats = self.input_layer(
            input_feature.reshape(num_proposals, -1, self.in_channels))
        input_in = input_feats[..., :self.num_params_in]
        input_out = input_feats[..., -self.num_params_out:]

        # (num_proposals, K, feat_channels): per-proposal modulation
        gate_feats = input_in * param_in.unsqueeze(-2)
        if self.gate_norm_act:
            gate_feats = self.activation(self.gate_norm(gate_feats))

        input_gate = self.input_norm_in(self.input_gate(gate_feats))
        update_gate = self.norm_in(self.update_gate(gate_feats))
        if self.gate_sigmoid:
            input_gate = input_gate.sigmoid()
            update_gate = update_gate.sigmoid()
        param_out = self.norm_out(param_out)
        input_out = self.input_norm_out(input_out)

        if self.activate_out:
            param_out = self.activation(param_out)
            input_out = self.activation(input_out)

        # param_out is (num_proposals, feat_channels); unsqueeze(-2) lets it
        # broadcast over the K dimension of the gates.
        features = update_gate * param_out.unsqueeze(
            -2) + input_gate * input_out

        features = self.fc_layer(features)
        features = self.fc_norm(features)
        features = self.activation(features)

        return features

================================================
FILE: knet/video/__init__.py
================================================


================================================
FILE: knet/video/dice_loss.py
================================================
import torch
import torch.nn as nn
from mmdet.models.builder import LOSSES, build_loss
from mmdet.models.losses.utils import weighted_loss


@weighted_loss
def dice_loss(input, target, eps=1e-3, numerator_eps=0):
    """Dice loss computed independently for every row of the batch.

    Both tensors are flattened to ``(input.size(0), -1)``; ``target`` is
    cast to float. Each row yields

        1 - (2 * sum(x * y) + numerator_eps)
            / (sum(x * x) + eps + sum(y * y) + eps)

    Args:
        input (Tensor): Predictions; used as given (no sigmoid is applied
            in this function).
        target (Tensor): Targets with the same leading dimension.
        eps (float): Added to each of the two squared-sum terms.
        numerator_eps (float): Added to the numerator.

    Returns:
        Tensor: One loss value per row, before whatever the
        ``weighted_loss`` decorator applies on top.
    """
    flat_input = input.reshape(input.size()[0], -1)
    flat_target = target.reshape(target.size()[0], -1).float()

    overlap = (flat_input * flat_target).sum(1)
    input_sq = (flat_input * flat_input).sum(1) + eps
    target_sq = (flat_target * flat_target).sum(1) + eps
    dice = (2 * overlap + numerator_eps) / (input_sq + target_sq)
    return 1 - dice

#
# @LOSSES.register_module()
# class DiceLoss(nn.Module):
#
#     def __init__(self,
#                  eps=1e-3,
#                  numerator_eps=0.0,
#                  use_sigmoid=True,
#                  reduction='mean',
#                  loss_weight=1.0):
#         super(DiceLoss, self).__init__()
#         self.eps = eps
#         self.reduction = reduction
#         self.loss_weight = loss_weight
#         self.use_sigmoid = use_sigmoid
#         self.numerator_eps = numerator_eps
#
#     def forward(self,
#                 pred,
#                 target,
#                 weight=None,
#                 avg_factor=None,
#                 reduction_override=None,
#                 **kwargs):
#         if weight is not None and not torch.any(weight > 0):
#             return (pred * weight).sum()  # 0
#         assert reduction_override in (None, 'none', 'mean', 'sum')
#         reduction = (
#             reduction_override if reduction_override else self.reduction)
#         pred = pred.sigmoid()
#         loss = self.loss_weight * dice_loss(
#             pred,
#             target,
#             weight,
#             eps=self.eps,
#             numerator_eps=self.numerator_eps,
#             reduction=reduction,
#             avg_factor=avg_factor,
#             **kwargs)
#         return loss


================================================
FILE: knet/video/kernel_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (ConvModule, bias_init_with_prob, normal_init)
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
from mmdet.models.builder import HEADS, build_loss, build_neck
from mmdet.models.losses import accuracy
from mmdet.utils import get_root_logger


@HEADS.register_module()
class VideoConvKernelHead(nn.Module):
    """
        This head for init mask and kernel prediction
    """
    def __init__(self,
                 num_proposals=100,
                 in_channels=256,
                 out_channels=256,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_seg_convs=1,
                 num_loc_convs=1,
                 att_dropout=False,
                 localization_fpn=None,
                 conv_kernel_size=1,
                 norm_cfg=dict(type='GN', num_groups=32),
                 semantic_fpn=True,
                 train_cfg=None,
                 num_classes=80,
                 xavier_init_kernel=False,
                 kernel_init_std=0.01,
                 use_binary=False,
                 proposal_feats_with_obj=False,
                 loss_mask=None,
                 loss_seg=None,
                 loss_cls=None,
                 loss_dice=None,
                 loss_rank=None,
                 feat_downsample_stride=1,
                 feat_refine_stride=1,
                 feat_refine=True,
                 with_embed=False,
                 feat_embed_only=False,
                 conv_normal_init=False,
                 mask_out_stride=4,
                 hard_target=False,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cat_stuff_mask=False,
                 link_previous=False,
                 **kwargs):
        """Store the configuration and build the sub-modules of the head.

        The arguments are kept as attributes of the same name. In addition
        this constructor

        * builds the localization neck from ``localization_fpn`` with
          ``build_neck``,
        * builds each of ``loss_mask``, ``loss_dice``, ``loss_seg``,
          ``loss_cls`` and ``loss_rank`` with ``build_loss`` when a config is
          given and keeps ``None`` otherwise,
        * builds the assigner and the sampler when ``train_cfg`` is truthy,
        * creates the layers through ``_init_layers``.

        Extra ``**kwargs`` are accepted and not used.
        """
        super(VideoConvKernelHead, self).__init__()

        # proposals and channel layout
        self.num_proposals = num_proposals
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_classes = num_classes
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.num_heads = num_heads
        self.num_cls_fcs = num_cls_fcs
        self.num_loc_convs = num_loc_convs
        self.num_seg_convs = num_seg_convs
        self.conv_kernel_size = conv_kernel_size
        self.norm_cfg = norm_cfg

        # feature handling switches
        self.semantic_fpn = semantic_fpn
        self.feat_downsample_stride = feat_downsample_stride
        self.feat_refine_stride = feat_refine_stride
        self.feat_refine = feat_refine
        self.with_embed = with_embed
        self.feat_embed_only = feat_embed_only
        self.proposal_feats_with_obj = proposal_feats_with_obj
        self.use_binary = use_binary
        self.att_dropout = att_dropout
        self.cat_stuff_mask = cat_stuff_mask
        self.link_previous = link_previous

        # initialisation options
        self.xavier_init_kernel = xavier_init_kernel
        self.kernel_init_std = kernel_init_std
        self.conv_normal_init = conv_normal_init

        # target / assignment options
        self.train_cfg = train_cfg
        self.sampling = False
        self.mask_out_stride = mask_out_stride
        self.mask_assign_stride = mask_assign_stride
        self.hard_target = hard_target
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg

        # sub-modules; the neck is created before the losses
        self.localization_fpn = build_neck(localization_fpn)

        self.loss_mask = (
            build_loss(loss_mask) if loss_mask is not None else loss_mask)
        self.loss_dice = (
            build_loss(loss_dice) if loss_dice is not None else loss_dice)
        self.loss_seg = (
            build_loss(loss_seg) if loss_seg is not None else loss_seg)
        self.loss_cls = (
            build_loss(loss_cls) if loss_cls is not None else loss_cls)
        self.loss_rank = (
            build_loss(loss_rank) if loss_rank is not None else loss_rank)

        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # ``self.sampling`` is set to False above, so the pseudo sampler
            # below is what gets built unless that flag is changed.
            use_cfg_sampler = self.sampling and hasattr(
                self.train_cfg, 'sampler')
            if use_cfg_sampler:
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='MaskPseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)

        self._init_layers()

    def _init_layers(self):
        """Create the kernel conv, the semantic classifier and feature convs.

        Layers created here:

        * ``init_kernels``: conv whose weights act as the initial kernels and
          whose output is the initial mask prediction.
        * ``conv_seg``: 1x1 classifier, only when ``semantic_fpn`` is set.
        * ``ins_downsample`` / ``seg_downsample``: 3x3 convs with stride
          ``feat_refine_stride``, only when ``feat_downsample_stride > 1``
          and ``feat_refine`` is set.
        * ``loc_convs`` / ``seg_convs``: stacks of 1x1 ``ConvModule``s.

        In the forward pass the conv stacks run first and the downsample
        convs run on their output. Only the first conv of a stack therefore
        sees ``in_channels`` features; everything after it sees
        ``out_channels``. The input widths below follow that data flow, so
        the head also works when ``in_channels != out_channels`` (the layer
        shapes are unchanged when both are equal).
        """
        self.init_kernels = nn.Conv2d(
            self.out_channels,
            self.num_proposals,
            self.conv_kernel_size,
            padding=int(self.conv_kernel_size // 2),
            bias=False)  # weight: (num_proposals, out_channels, k, k)

        if self.semantic_fpn:
            self.conv_seg = nn.Conv2d(self.out_channels, self.num_classes, 1)

        if self.feat_downsample_stride > 1 and self.feat_refine:
            # The downsample convs receive the output of the conv stacks,
            # or the raw neck features when a stack is empty.
            ins_in_channels = (
                self.out_channels
                if self.num_loc_convs > 0 else self.in_channels)
            seg_in_channels = (
                self.out_channels
                if self.num_seg_convs > 0 else self.in_channels)
            self.ins_downsample = ConvModule(
                ins_in_channels,
                self.out_channels,
                3,
                stride=self.feat_refine_stride,
                padding=1,
                norm_cfg=self.norm_cfg)
            self.seg_downsample = ConvModule(
                seg_in_channels,
                self.out_channels,
                3,
                stride=self.feat_refine_stride,
                padding=1,
                norm_cfg=self.norm_cfg)

        self.loc_convs = nn.ModuleList()
        for i in range(self.num_loc_convs):
            self.loc_convs.append(
                ConvModule(
                    self.in_channels if i == 0 else self.out_channels,
                    self.out_channels,
                    1,
                    norm_cfg=self.norm_cfg))

        self.seg_convs = nn.ModuleList()
        for i in range(self.num_seg_convs):
            self.seg_convs.append(
                ConvModule(
                    self.in_channels if i == 0 else self.out_channels,
                    self.out_channels,
                    1,
                    norm_cfg=self.norm_cfg))

    def init_weights(self):
        """Initialize the neck, the feature convs, ``conv_seg`` and kernels.

        Steps, in order:

        1. ``localization_fpn.init_weights()``.
        2. When ``feat_downsample_stride > 1`` and ``conv_normal_init`` are
           both set, every ``nn.Conv2d`` inside ``loc_convs`` and then
           ``seg_convs`` gets a normal init with std 0.01.
        3. With ``semantic_fpn``, ``conv_seg`` gets a normal init with std
           0.01; its bias comes from ``bias_init_with_prob(0.01)`` when
           ``loss_seg.use_sigmoid`` is true.
        4. ``init_kernels`` is initialized with Xavier-uniform or with a
           normal of std ``kernel_init_std``, depending on
           ``xavier_init_kernel``.
        """
        self.localization_fpn.init_weights()

        normal_init_convs = (
            self.feat_downsample_stride > 1 and self.conv_normal_init)
        if normal_init_convs:
            get_root_logger().info(
                'Initialize convs in KPN head by normal std 0.01')
            for branch in (self.loc_convs, self.seg_convs):
                conv_layers = [
                    layer for layer in branch.modules()
                    if isinstance(layer, nn.Conv2d)
                ]
                for conv_layer in conv_layers:
                    normal_init(conv_layer, std=0.01)

        if self.semantic_fpn:
            prior_bias = bias_init_with_prob(0.01)
            if self.loss_seg.use_sigmoid:
                seg_init = dict(std=0.01, bias=prior_bias)
            else:
                seg_init = dict(mean=0, std=0.01)
            normal_init(self.conv_seg, **seg_init)

        logger = get_root_logger()
        if not self.xavier_init_kernel:
            logger.info(
                f'Initialize kernels by normal std: {self.kernel_init_std}')
            normal_init(self.init_kernels, mean=0, std=self.kernel_init_std)
        else:
            logger.info('Initialize kernels by xavier uniform')
            nn.init.xavier_uniform_(self.init_kernels.weight)

    def _decode_init_proposals(self, img, img_metas,
                               previous_obj_feats=None, previous_mask_preds=None, previous_x_feats=None):
        num_imgs = len(img_metas)

        localization_feats = self.localization_fpn(img)

        ## thing branch
        if isinstance(localization_feats, list):
            loc_feats = localization_feats[0]
        else:
            loc_feats = localization_feats
        for conv in self.loc_convs:
            loc_feats = conv(loc_feats)
        if self.feat_downsample_stride > 1 and self.feat_refine:
            loc_feats = self.ins_downsample(loc_feats)

        # init kernel prediction
        mask_preds = self.init_kernels(loc_feats)  # init mask prediction

        # stuff branch
        if self.semantic_fpn:
            if isinstance(localization_feats, list):
                semantic_feats = localization_feats[1]
            else:
                semantic_feats = localization_feats
            for conv in self.seg_convs:
                semantic_feats = conv(semantic_feats)
            if self.feat_downsample_stride > 1 and self.feat_refine:
                semantic_feats = self.seg_downsample(semantic_feats)
        else:
            semantic_feats = None

        if semantic_feats is not None:
            seg_preds = self.conv_seg(semantic_feats)
        else:
            seg_preds = None

        # init things
        proposal_feats = self.init_kernels.weight.clone()
        proposal_feats = proposal_feats[None].expand(num_imgs,
                                                     *proposal_feats.size())

        if semantic_feats is not None:
            x_feats = semantic_feats + loc_feats
        else:
            x_feats = loc_feats

        if self.proposal_feats_with_obj:
            sigmoid_masks = mask_preds.sigmoid()
            nonzero_inds = sigmoid_masks > 0.5
            if self.use_binary:
                sigmoid_masks = nonzero_inds.float()
            else:
                sigmoid_masks = nonzero_inds.float() * sigmoid_masks
            obj_feats = torch.einsum('bnhw, bchw->bnc', sigmoid_masks, x_feats)

        cls_scores = None

        if self.proposal_feats_with_obj:  # default True
            proposal_feats = proposal_feats + obj_feats.view(
                num_imgs, self.num_proposals, self.out_channels, 1, 1)

        if self.cat_stuff_mask and not self.training:
            mask_preds = torch.cat(
                [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
            stuff_kernels = self.conv_seg.weight[self.
                                                 num_thing_classes:].clone()
            stuff_kernels = stuff_kernels[None].expand(num_imgs,
                                                       *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)  # (b, N_{st}+N_{th}, c)

        return proposal_feats, x_feats, mask_preds, cls_scores, seg_preds

    def forward_train(self,
                      img,
                      img_metas,
                      gt_masks,
                      gt_labels,
                      gt_sem_seg=None,
                      gt_sem_cls=None,
                      previous_obj_feats=None,
                      previous_mask_preds=None,
                      previous_x_feats=None):
        """Forward function in training stage."""
        num_imgs = len(img_metas)
        results = self._decode_init_proposals(img, img_metas, previous_obj_feats, previous_mask_preds, previous_x_feats)
        (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = results
        if self.feat_downsample_stride > 1:
            scaled_mask_preds = F.interpolate(
                mask_preds,
                scale_factor=self.feat_downsample_stride,
                mode='bilinear',
                align_corners=False)
            if seg_preds is not None:
                scaled_seg_preds = F.interpolate(
                    seg_preds,
                    scale_factor=self.feat_downsample_stride,
                    mode='bilinear',
                    align_corners=False)
        else:
            scaled_mask_preds = mask_preds  # thing
            scaled_seg_preds = seg_preds   # stuff

        if self.hard_target:
            gt_masks = [x.bool().float() for x in gt_masks]
        else:
            gt_masks = gt_masks

        sampling_results = []
        if cls_scores is None:
            detached_cls_scores = [None] * num_imgs
        else:
            detached_cls_scores = cls_scores.detach()

        for i in range(num_imgs):
            assign_result = self.assigner.assign(scaled_mask_preds[i].detach(),
                                                 detached_cls_scores[i],
                                                 gt_masks[i], gt_labels[i],
                                                 img_meta=img_metas[i])
            sampling_result = self.sampler.sample(assign_result,
                                                  scaled_mask_preds[i],
                                                  gt_masks[i])
            sampling_results.append(sampling_result)

        mask_targets = self.get_targets(
            sampling_results,
            gt_masks,
            self.train_cfg,
            True,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls)

        losses = self.loss(scaled_mask_preds, cls_scores, scaled_seg_preds,
                           proposal_feats, *mask_targets)

        if self.cat_stuff_mask and self.training:
            mask_preds = torch.cat(
                [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
            stuff_kernels = self.conv_seg.weight[self.
                                                 num_thing_classes:].clone()
            stuff_kernels = stuff_kernels[None].expand(num_imgs,
                                                       *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return losses, proposal_feats, x_feats, mask_preds, cls_scores

    def loss(self,
             mask_pred,
             cls_scores,
             seg_preds,
             proposal_feats,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             seg_targets,
             reduction_override=None,
             **kwargs):
        losses = dict()
        bg_class_ind = self.num_classes
        # note in spare rcnn num_gt == num_pos
        pos_inds = (labels >= 0) & (labels < bg_class_ind)
        num_preds = mask_pred.shape[0] * mask_pred.shape[1]

        if cls_scores is not None:
            num_pos = pos_inds.sum().float()
            avg_factor = reduce_mean(num_pos)
            assert mask_pred.shape[0] == cls_scores.shape[0]
            assert mask_pred.shape[1] == cls_scores.shape[1]
            losses['loss_rpn_cls'] = self.loss_cls(
                cls_scores.view(num_preds, -1),
                labels,
                label_weights,
                avg_factor=avg_factor,
                reduction_override=reduction_override)
            losses['rpn_pos_acc'] = accuracy(
                cls_scores.view(num_preds, -1)[pos_inds], labels[pos_inds])

        bool_pos_inds = pos_inds.type(torch.bool)
        # 0~self.num_classes-1 are FG, self.num_classes is BG
        # do not perform bounding box regression for BG anymore.
        H, W = mask_pred.shape[-2:]
        if pos_inds.any():
            pos_mask_pred = mask_pred.reshape(num_preds, H, W)[bool_pos_inds]
            pos_mask_targets = mask_targets[bool_pos_inds]
            losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred,
                                                     pos_mask_targets)
            losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred,
                                                     pos_mask_targets)

            if self.loss_rank is not None:
                batch_size = mask_pred.size(0)
                rank_target = mask_targets.new_full((batch_size, H, W),
                                                    self.ignore_label,
                                                    dtype=torch.long)
                rank_inds = pos_inds.view(batch_size,
                                          -1).nonzero(as_tuple=False)
                batch_mask_targets = mask_targets.view(batch_size, -1, H,
                                                       W).bool()
                for i in range(batch_size):
                    curr_inds = (rank_inds[:, 0] == i)
                    curr_rank = rank_inds[:, 1][curr_inds]
                    for j in curr_rank:
                        rank_target[i][batch_mask_targets[i][j]] = j
                losses['loss_rpn_rank'] = self.loss_rank(
                    mask_pred, rank_target, ignore_index=self.ignore_label)

        else:
            losses['loss_rpn_mask'] = mask_pred.sum() * 0
            losses['loss_rpn_dice'] = mask_pred.sum() * 0
            if self.loss_rank is not None:
                losses['loss_rank'] = mask_pred.sum() * 0

        if seg_preds is not None:
            if self.loss_seg.use_sigmoid:
                cls_channel = seg_preds.shape[1]
                flatten_seg = seg_preds.view(
                    -1, cls_channel,
                    H * W).permute(0, 2, 1).reshape(-1, cls_channel)
                flatten_seg_target = seg_targets.view(-1)
                num_dense_pos = (flatten_seg_target >= 0) & (
                    flatten_seg_target < bg_class_ind)
                num_dense_pos = num_dense_pos.sum().float().clamp(min=1.0)
                losses['loss_rpn_seg'] = self.loss_seg(
                    flatten_seg,
                    flatten_seg_target,
                    avg_factor=num_dense_pos)
            else:
                cls_channel = seg_preds.shape[1]
                flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute(
                    0, 2, 1).reshape(-1, cls_channel)
                flatten_seg_target = seg_targets.view(-1)
                losses['loss_rpn_seg'] = self.loss_seg(flatten_seg,
                                                       flatten_seg_target, ignore_index=self.num_classes)

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):
        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg
        H, W = pos_mask.shape[-2:]
        # original implementation uses new_zeros since BG are set to be 0
        # now use empty & fill because BG cat_id = num_classes,
        # FG cat_id = [0, num_classes-1]
        labels = pos_mask.new_full((num_samples, ),
                                   self.num_classes,
                                   dtype=torch.long)
        label_weights = pos_mask.new_zeros(num_samples)
        mask_targets = pos_mask.new_zeros(num_samples, H, W)
        mask_weights = pos_mask.new_zeros(num_samples, H, W)
        seg_targets = pos_mask.new_full((H, W),
                                        self.num_classes,
                                        dtype=torch.long)

        if gt_sem_cls is not None and gt_sem_seg is not None:
            gt_sem_seg = gt_sem_seg.bool()
            for sem_mask, sem_cls in zip(gt_sem_seg, gt_sem_cls):
                seg_targets[sem_mask] = sem_cls.long()

        if num_pos > 0:
            labels[pos_inds] = pos_gt_labels
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            mask_targets[pos_inds, ...] = pos_gt_mask
            mask_weights[pos_inds, ...] = 1
            for i in range(num_pos):
                seg_targets[pos_gt_mask[i].bool()] = pos_gt_labels[i]

        if num_neg > 0:
            label_weights[neg_inds] = 1.0

        return labels, label_weights, mask_targets, mask_weights, seg_targets

    def get_targets(self,
                    sampling_results,
                    gt_mask,
                    rpn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None):
        """Collect the training targets of every image in the batch.

        Args:
            sampling_results (list): One sampling result per image, exposing
                ``pos_inds``, ``neg_inds``, ``pos_masks``, ``neg_masks``,
                ``pos_gt_masks`` and ``pos_gt_labels``.
            gt_mask: Not used here.
            rpn_train_cfg: Passed to ``_get_target_single`` as ``cfg``.
            concat (bool): Join the per-image results into batch tensors.
            gt_sem_seg (list | None): Per-image semantic masks.
            gt_sem_cls (list | None): Per-image semantic classes.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights,
            seg_targets)``. With ``concat`` the first four are concatenated
            along dim 0 and ``seg_targets`` is stacked; otherwise each is a
            list with one entry per image.
        """
        num_imgs = len(sampling_results)
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]

        # ``multi_apply`` zips its arguments, so the placeholder lists need
        # exactly one entry per image. The previous hard-coded ``[None] * 2``
        # silently dropped every image after the second one.
        if gt_sem_seg is None:
            gt_sem_seg = [None] * num_imgs
            gt_sem_cls = [None] * num_imgs
        elif gt_sem_cls is None:
            # semantic masks without classes are ignored by
            # ``_get_target_single``; a bare None is not iterable
            gt_sem_cls = [None] * num_imgs

        results = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_mask_list,
            pos_gt_labels_list,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rpn_train_cfg)
        (labels, label_weights, mask_targets, mask_weights,
         seg_targets) = results
        if concat:
            labels = torch.cat(labels, 0)
            label_weights = torch.cat(label_weights, 0)
            mask_targets = torch.cat(mask_targets, 0)
            mask_weights = torch.cat(mask_weights, 0)
            # seg targets are per image (H, W), hence stacked
            seg_targets = torch.stack(seg_targets, 0)
        return labels, label_weights, mask_targets, mask_weights, seg_targets

    def simple_test_rpn(self, img, img_metas,
            previous_obj_feats=None, previous_mask_preds=None, previous_x_feats=None):
        """Forward function in testing stage."""
        return self._decode_init_proposals(img, img_metas, previous_obj_feats, previous_mask_preds, previous_x_feats)

    def forward_dummy(self, img, img_metas):
        """Dummy forward function.

        Used in flops calculation.
        """
        return self._decode_init_proposals(img, img_metas)


================================================
FILE: knet/video/kernel_iter_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import build_assigner, build_sampler
from mmdet.models.builder import HEADS, build_head
from mmdet.models.roi_heads import BaseRoIHead
from knet.det.mask_pseudo_sampler import MaskPseudoSampler


@HEADS.register_module()
class VideoKernelIterHead(BaseRoIHead):

    def __init__(self,
                 num_stages=6,
                 recursive=False,
                 assign_stages=5,
                 stage_loss_weights=(1, 1, 1, 1, 1, 1),
                 proposal_feature_channel=256,
                 merge_cls_scores=False,
                 do_panoptic=False,
                 post_assign=False,
                 hard_target=False,
                 merge_joint=False,
                 num_proposals=100,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 with_track=False,
                 mask_head=dict(
                     type='KernelUpdateHead',
                     num_classes=80,
                     num_fcs=2,
                     num_heads=8,
                     num_cls_fcs=1,
                     num_reg_fcs=3,
                     feedforward_channels=2048,
                     hidden_channels=256,
                     dropout=0.0,
                     roi_feat_size=7,
                     ffn_act_cfg=dict(type='ReLU', inplace=True)),
                 mask_out_stride=4,
                 train_cfg=None,
                 test_cfg=None,
                 **kwargs):
        """Record the head options, then let the base class finish set-up.

        Every option is stored as an attribute of the same name before the
        base-class constructor is called; the ``init_*`` hooks of this class
        read several of them (e.g. ``num_stages``, ``recursive``).
        NOTE(review): presumably the base constructor is what invokes those
        hooks - confirm against the installed mmdet version.

        ``mask_head`` must not be None and ``stage_loss_weights`` must have
        exactly ``num_stages`` entries. When ``train_cfg`` is given, every
        stage's sampler must be a ``MaskPseudoSampler``.
        """
        assert mask_head is not None
        assert len(stage_loss_weights) == num_stages

        # Stage layout and loss weighting.
        self.num_stages = num_stages
        self.recursive = recursive
        self.assign_stages = assign_stages
        self.stage_loss_weights = stage_loss_weights
        self.post_assign = post_assign
        self.hard_target = hard_target

        # Proposal / class bookkeeping.
        self.num_proposals = num_proposals
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.proposal_feature_channel = proposal_feature_channel
        self.merge_cls_scores = merge_cls_scores
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label

        # Mask geometry.
        self.mask_out_stride = mask_out_stride
        self.mask_assign_stride = mask_assign_stride

        # Inference-time behaviour switches.
        self.do_panoptic = do_panoptic
        self.merge_joint = merge_joint
        self.with_track = with_track

        super(VideoKernelIterHead, self).__init__(
            mask_head=mask_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            **kwargs)

        # train_cfg is None when running test.py, so samplers only exist
        # (and are only checked) for training.
        if train_cfg is not None:
            for stage_idx in range(num_stages):
                stage_sampler = self.mask_sampler[stage_idx]
                assert isinstance(stage_sampler, MaskPseudoSampler), \
                    'Sparse Mask only support `MaskPseudoSampler`'

    def init_bbox_head(self, mask_roi_extractor, mask_head):
        """No-op hook: nothing box-related is built by this head.

        Args:
            mask_roi_extractor (dict): Ignored.
            mask_head (dict): Ignored.
        """
        return None

    def init_assigner_sampler(self):
        """Build one mask assigner and one mask sampler per training stage.

        ``self.mask_assigner`` and ``self.mask_sampler`` are always reset to
        empty lists; they stay empty when ``self.train_cfg`` is None.
        """
        self.mask_assigner = []
        self.mask_sampler = []
        if self.train_cfg is None:
            return

        for stage_idx, stage_cfg in enumerate(self.train_cfg):
            assigner = build_assigner(stage_cfg.assigner)
            self.mask_assigner.append(assigner)
            # The stage index is published before the sampler is built
            # because the sampler receives this head as its context.
            self.current_stage = stage_idx
            sampler = build_sampler(stage_cfg.sampler, context=self)
            self.mask_sampler.append(sampler)

    def init_weights(self):
        """Call ``init_weights`` on the mask head of each stage, in order."""
        stage_heads = self.mask_head
        for stage_idx in range(self.num_stages):
            stage_heads[stage_idx].init_weights()

    def init_mask_head(self, mask_roi_extractor, mask_head):
        """Build the mask heads, one per stage.

        Args:
            mask_roi_extractor (dict): Not used by this method.
            mask_head (dict | list[dict]): Either a single head config,
                which is reused for every stage, or a list holding exactly
                ``num_stages`` configs.
        """
        self.mask_head = nn.ModuleList()

        head_cfgs = mask_head
        if not isinstance(head_cfgs, list):
            head_cfgs = [head_cfgs] * self.num_stages
        assert len(head_cfgs) == self.num_stages

        for head_cfg in head_cfgs:
            built_head = build_head(head_cfg)
            self.mask_head.append(built_head)

        if not self.recursive:
            return
        # Recursive mode: every stage shares the first stage's module.
        for stage_idx in range(self.num_stages):
            self.mask_head[stage_idx] = self.mask_head[0]

    def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas,
                      previous_obj_feats=None,
                      previous_mask_preds=None,
                      previous_x_feats=None
                      ):
        mask_head = self.mask_head[stage]
        cls_score, mask_preds, object_feats, x_feats, object_feats_track = mask_head(
            x, object_feats, mask_preds, img_metas=img_metas,
            previous_obj_feats=previous_obj_feats,
            previous_mask_preds=previous_mask_preds,
            previous_x_feats=previous_x_feats
        )
        if mask_head.mask_upsample_stride > 1 and (stage == self.num_stages - 1
                                                   or self.training):
            scaled_mask_preds = F.interpolate(
                mask_preds,
                scale_factor=mask_head.mask_upsample_stride,
                align_corners=False,
                mode='bilinear')
        else:
            scaled_mask_preds = mask_preds
        mask_results = dict(
            cls_score=cls_score,
            mask_preds=mask_preds,
            scaled_mask_preds=scaled_mask_preds,
            object_feats=object_feats,
            object_feats_track=object_feats_track,
            x_feats=x_feats,
        )

        return mask_results

    def forward_train(self,
                      x,
                      proposal_feats,
                      mask_preds,
                      cls_score,
                      img_metas,
                      gt_masks,
                      gt_labels,
                      gt_pids=None,
                      gt_bboxes_ignore=None,
                      imgs_whwh=None,
                      gt_bboxes=None,
                      gt_sem_seg=None,
                      gt_sem_cls=None):
        """Run all refinement stages and gather their weighted losses.

        Per stage: the mask head is applied, predictions are matched to the
        ground truth (re-matched only while ``stage < assign_stages``; later
        stages reuse the most recent assignment), targets are built and the
        stage loss is computed.

        Args:
            x: Features forwarded to every stage's mask head.
            proposal_feats: Object features fed to the first stage.
            mask_preds: Initial mask predictions. A detached copy (upsampled
                when the first head's ``mask_upsample_stride`` exceeds 1)
                seeds the first assignment.
            cls_score: Initial class scores, or None.
            img_metas (list): One entry per image; its length is used as the
                number of images.
            gt_masks (list): Per-image ground-truth masks, binarised when
                ``hard_target`` is set.
            gt_labels (list): Per-image ground-truth labels.
            gt_pids, gt_bboxes_ignore, gt_bboxes: Accepted but not read by
                this method.
            imgs_whwh: Forwarded to each head's ``loss``.
            gt_sem_seg, gt_sem_cls: Forwarded to each head's
                ``get_targets``.

        Returns:
            dict | tuple: Losses keyed ``s{stage}_{name}``, each multiplied
            by the matching entry of ``stage_loss_weights``. When
            ``with_track`` is set, the tuple ``(losses, object_feats,
            cls_score, mask_preds, scaled_mask_preds)`` carrying the last
            stage's outputs.
        """
        num_imgs = len(img_metas)

        # Predictions used for the first assignment never receive gradients.
        first_head = self.mask_head[0]
        prev_mask_preds = mask_preds.detach()
        if first_head.mask_upsample_stride > 1:
            prev_mask_preds = F.interpolate(
                prev_mask_preds,
                scale_factor=first_head.mask_upsample_stride,
                mode='bilinear',
                align_corners=False)

        if cls_score is None:
            prev_cls_score = [None] * num_imgs
        else:
            prev_cls_score = cls_score.detach()

        if self.hard_target:
            gt_masks = [gt_mask.bool().float() for gt_mask in gt_masks]

        object_feats = proposal_feats
        all_stage_loss = {}
        assign_results = []
        for stage in range(self.num_stages):
            stage_head = self.mask_head[stage]
            mask_results = self._mask_forward(stage, x, object_feats,
                                              mask_preds, img_metas)
            object_feats = mask_results['object_feats']
            cls_score = mask_results['cls_score']
            mask_preds = mask_results['mask_preds']
            scaled_mask_preds = mask_results['scaled_mask_preds']

            # post_assign: match against this stage's own outputs rather
            # than the previous stage's.
            if self.post_assign:
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

            do_assign = stage < self.assign_stages
            if do_assign:
                assign_results = []
            sampling_results = []
            for img_idx in range(num_imgs):
                if do_assign:
                    mask_for_assign = \
                        prev_mask_preds[img_idx][:self.num_proposals]
                    cls_for_assign = None
                    if prev_cls_score[img_idx] is not None:
                        cls_for_assign = prev_cls_score[img_idx][
                            :self.num_proposals, :self.num_thing_classes]
                    assign_results.append(
                        self.mask_assigner[stage].assign(
                            mask_for_assign,
                            cls_for_assign,
                            gt_masks[img_idx],
                            gt_labels[img_idx],
                            img_meta=img_metas[img_idx]))
                sampling_results.append(
                    self.mask_sampler[stage].sample(
                        assign_results[img_idx],
                        scaled_mask_preds[img_idx],
                        gt_masks[img_idx]))

            mask_targets = stage_head.get_targets(
                sampling_results,
                gt_masks,
                gt_labels,
                self.train_cfg[stage],
                True,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls)

            single_stage_loss = stage_head.loss(
                object_feats,
                cls_score,
                scaled_mask_preds,
                *mask_targets,
                imgs_whwh=imgs_whwh)
            all_stage_loss.update({
                f's{stage}_{loss_name}':
                loss_value * self.stage_loss_weights[stage]
                for loss_name, loss_value in single_stage_loss.items()
            })

            if not self.post_assign:
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

        if not self.with_track:
            return all_stage_loss
        return (all_stage_loss, object_feats, cls_score, mask_preds,
                scaled_mask_preds)

    def forward_train_with_previous(self,
                                    x,
                                    proposal_feats,
                                    mask_preds,
                                    cls_score,
                                    img_metas,
                                    gt_masks,
                                    gt_labels,
                                    gt_pids=None,
                                    gt_bboxes_ignore=None,
                                    imgs_whwh=None,
                                    gt_bboxes=None,
                                    gt_sem_seg=None,
                                    gt_sem_cls=None,
                                    previous_obj_feats=None,
                                    previous_mask_preds=None,
                                    previous_x_feats=None,
                                    ):
        """Training pass that links previous-frame inputs into the last stage.

        Follows the same per-stage procedure as ``forward_train`` (mask
        head, assignment while ``stage < assign_stages``, sampling, targets,
        loss), except that ``previous_obj_feats``, ``previous_mask_preds``
        and ``previous_x_feats`` are passed to the mask head of the final
        stage only; all earlier stages receive None for them.

        Args:
            x: Features forwarded to every stage's mask head.
            proposal_feats: Object features fed to the first stage.
            mask_preds: Initial mask predictions. A detached copy (upsampled
                when the first head's ``mask_upsample_stride`` exceeds 1)
                seeds the first assignment.
            cls_score: Initial class scores, or None.
            img_metas (list): One entry per image; its length is used as the
                number of images.
            gt_masks (list): Per-image ground-truth masks, binarised when
                ``hard_target`` is set.
            gt_labels (list): Per-image ground-truth labels.
            gt_pids, gt_bboxes_ignore, gt_bboxes: Accepted but not read by
                this method.
            imgs_whwh: Forwarded to each head's ``loss``.
            gt_sem_seg, gt_sem_cls: Forwarded to each head's
                ``get_targets``.
            previous_obj_feats, previous_mask_preds, previous_x_feats:
                Previous-frame inputs for the last stage's mask head.

        Returns:
            dict | tuple: Losses keyed ``s{stage}_{name}``, each multiplied
            by the matching entry of ``stage_loss_weights``. When
            ``with_track`` is set, the tuple ``(losses, object_feats,
            cls_score, mask_preds, scaled_mask_preds, object_feats_track)``
            carrying the last stage's outputs.
        """
        num_imgs = len(img_metas)

        # Predictions used for the first assignment never receive gradients.
        first_head = self.mask_head[0]
        prev_mask_preds = mask_preds.detach()
        if first_head.mask_upsample_stride > 1:
            prev_mask_preds = F.interpolate(
                prev_mask_preds,
                scale_factor=first_head.mask_upsample_stride,
                mode='bilinear',
                align_corners=False)

        if cls_score is None:
            prev_cls_score = [None] * num_imgs
        else:
            prev_cls_score = cls_score.detach()

        if self.hard_target:
            gt_masks = [gt_mask.bool().float() for gt_mask in gt_masks]

        object_feats = proposal_feats
        all_stage_loss = {}
        assign_results = []
        for stage in range(self.num_stages):
            stage_head = self.mask_head[stage]

            # Only the last stage is linked to the previous frame.
            is_last_stage = stage == self.num_stages - 1
            mask_results = self._mask_forward(
                stage, x, object_feats, mask_preds, img_metas,
                previous_obj_feats=(
                    previous_obj_feats if is_last_stage else None),
                previous_mask_preds=(
                    previous_mask_preds if is_last_stage else None),
                previous_x_feats=(
                    previous_x_feats if is_last_stage else None))
            object_feats = mask_results['object_feats']
            object_feats_track = mask_results['object_feats_track']
            cls_score = mask_results['cls_score']
            mask_preds = mask_results['mask_preds']
            scaled_mask_preds = mask_results['scaled_mask_preds']

            # post_assign: match against this stage's own outputs rather
            # than the previous stage's.
            if self.post_assign:
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

            do_assign = stage < self.assign_stages
            if do_assign:
                assign_results = []
            sampling_results = []
            for img_idx in range(num_imgs):
                if do_assign:
                    mask_for_assign = \
                        prev_mask_preds[img_idx][:self.num_proposals]
                    cls_for_assign = None
                    if prev_cls_score[img_idx] is not None:
                        cls_for_assign = prev_cls_score[img_idx][
                            :self.num_proposals, :self.num_thing_classes]
                    assign_results.append(
                        self.mask_assigner[stage].assign(
                            mask_for_assign,
                            cls_for_assign,
                            gt_masks[img_idx],
                            gt_labels[img_idx],
                            img_meta=img_metas[img_idx]))
                sampling_results.append(
                    self.mask_sampler[stage].sample(
                        assign_results[img_idx],
                        scaled_mask_preds[img_idx],
                        gt_masks[img_idx]))

            mask_targets = stage_head.get_targets(
                sampling_results,
                gt_masks,
                gt_labels,
                self.train_cfg[stage],
                True,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls)

            single_stage_loss = stage_head.loss(
                object_feats,
                cls_score,
                scaled_mask_preds,
                *mask_targets,
                imgs_whwh=imgs_whwh)
            all_stage_loss.update({
                f's{stage}_{loss_name}':
                loss_value * self.stage_loss_weights[stage]
                for loss_name, loss_value in single_stage_loss.items()
            })

            if not self.post_assign:
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

        if not self.with_track:
            return all_stage_loss
        return (all_stage_loss, object_feats, cls_score, mask_preds,
                scaled_mask_preds, object_feats_track)

    def simple_test(self,
                    x,
                    proposal_feats,
                    mask_preds,
                    cls_score,
                    img_metas):
        """Refine the proposals through all stages and format the results.

        The last stage's class scores are turned into probabilities
        (sigmoid, or softmax with the final column dropped, depending on the
        last head's ``loss_cls.use_sigmoid``). Each image is then converted
        either with ``get_panoptic`` (when ``do_panoptic`` is set) or by
        taking the ``test_cfg.max_per_img`` best (proposal, class) pairs and
        passing them to the last head's ``get_seg_masks``.

        Returns:
            list | tuple: The per-image results; with ``with_track`` set,
            the tuple ``(results, object_feats, cls_score, mask_preds,
            scaled_mask_preds)``.
        """
        num_imgs = len(img_metas)

        object_feats = proposal_feats
        for stage in range(self.num_stages):
            stage_out = self._mask_forward(stage, x, object_feats,
                                           mask_preds, img_metas)
            object_feats = stage_out['object_feats']
            cls_score = stage_out['cls_score']
            mask_preds = stage_out['mask_preds']
            scaled_mask_preds = stage_out['scaled_mask_preds']

        last_head = self.mask_head[-1]
        num_classes = last_head.num_classes
        if last_head.loss_cls.use_sigmoid:
            cls_score = cls_score.sigmoid()
        else:
            cls_score = cls_score.softmax(-1)[..., :-1]

        if self.do_panoptic:
            results = [
                self.get_panoptic(cls_score[img_id],
                                  scaled_mask_preds[img_id],
                                  self.test_cfg,
                                  img_metas[img_id],
                                  object_feats[img_id])
                for img_id in range(num_imgs)
            ]
        else:
            results = []
            for img_id in range(num_imgs):
                flat_scores = cls_score[img_id].flatten(0, 1)
                scores_per_img, topk_indices = flat_scores.topk(
                    self.test_cfg.max_per_img, sorted=True)
                # A flat index encodes (proposal, class) as
                # proposal * num_classes + class.
                mask_indices = topk_indices // num_classes
                labels_per_img = topk_indices % num_classes
                masks_per_img = scaled_mask_preds[img_id][mask_indices]
                results.append(
                    last_head.get_seg_masks(masks_per_img, labels_per_img,
                                            scores_per_img, self.test_cfg,
                                            img_metas[img_id]))

        if not self.with_track:
            return results
        return results, object_feats, cls_score, mask_preds, scaled_mask_preds

    def simple_test_with_previous(self,
                                  x,
                                  proposal_feats,
                                  mask_preds,
                                  cls_score,
                                  img_metas,
                                  previous_obj_feats=None,
                                  previous_mask_preds=None,
                                  previous_x_feats=None,
                                  is_first=False,
                                  ):
        """Test-time refinement with previous-frame inputs on the last stage.

        Works like ``simple_test`` except that the ``previous_*`` inputs are
        handed to the final stage's mask head (earlier stages get None) and
        that the panoptic branch passes the tracking features to
        ``get_panoptic``. When ``is_first`` is true the tracking features
        are replaced by the last stage's object features.

        Returns:
            list | tuple: The per-image results; with ``with_track`` set,
            the tuple ``(results, object_feats, cls_score, mask_preds,
            scaled_mask_preds)``.
        """
        num_imgs = len(img_metas)

        object_feats = proposal_feats
        for stage in range(self.num_stages):
            # Only the last stage is linked to the previous frame.
            is_last_stage = stage == self.num_stages - 1
            stage_out = self._mask_forward(
                stage, x, object_feats, mask_preds, img_metas,
                previous_obj_feats=(
                    previous_obj_feats if is_last_stage else None),
                previous_mask_preds=(
                    previous_mask_preds if is_last_stage else None),
                previous_x_feats=(
                    previous_x_feats if is_last_stage else None))
            object_feats = stage_out['object_feats']
            object_feats_track = stage_out['object_feats_track']
            cls_score = stage_out['cls_score']
            mask_preds = stage_out['mask_preds']
            scaled_mask_preds = stage_out['scaled_mask_preds']

        last_head = self.mask_head[-1]
        num_classes = last_head.num_classes
        if last_head.loss_cls.use_sigmoid:
            cls_score = cls_score.sigmoid()
        else:
            cls_score = cls_score.softmax(-1)[..., :-1]

        if is_first:
            object_feats_track = object_feats

        if self.do_panoptic:
            results = [
                self.get_panoptic(cls_score[img_id],
                                  scaled_mask_preds[img_id],
                                  self.test_cfg,
                                  img_metas[img_id],
                                  object_feats_track[img_id])
                for img_id in range(num_imgs)
            ]
        else:
            results = []
            for img_id in range(num_imgs):
                flat_scores = cls_score[img_id].flatten(0, 1)
                scores_per_img, topk_indices = flat_scores.topk(
                    self.test_cfg.max_per_img, sorted=True)
                # A flat index encodes (proposal, class) as
                # proposal * num_classes + class.
                mask_indices = topk_indices // num_classes
                labels_per_img = topk_indices % num_classes
                masks_per_img = scaled_mask_preds[img_id][mask_indices]
                results.append(
                    last_head.get_seg_masks(masks_per_img, labels_per_img,
                                            scores_per_img, self.test_cfg,
                                            img_metas[img_id]))

        if not self.with_track:
            return results
        return results, object_feats, cls_score, mask_preds, scaled_mask_preds

    def simple_test_mask_preds(self,
                               x,
                               proposal_feats,
                               mask_preds,
                               cls_score,
                               img_metas):
        """Run all stages and return the raw last-stage predictions.

        Class scores are converted to probabilities with a sigmoid, or with
        a softmax whose final column is dropped, depending on the last
        head's ``loss_cls.use_sigmoid``.

        Returns:
            tuple: ``(object_feats, cls_score, mask_preds,
            scaled_mask_preds)`` of the last stage.
        """
        object_feats = proposal_feats
        for stage in range(self.num_stages):
            stage_out = self._mask_forward(stage, x, object_feats,
                                           mask_preds, img_metas)
            object_feats = stage_out['object_feats']
            cls_score = stage_out['cls_score']
            mask_preds = stage_out['mask_preds']
            scaled_mask_preds = stage_out['scaled_mask_preds']

        use_sigmoid = self.mask_head[-1].loss_cls.use_sigmoid
        cls_score = (cls_score.sigmoid()
                     if use_sigmoid else cls_score.softmax(-1)[..., :-1])

        return object_feats, cls_score, mask_preds, scaled_mask_preds

    def simple_test_mask_preds_plus_previous(
            self,
            x,
            proposal_feats,
            mask_preds,
            cls_score,
            img_metas,
            previous_obj_feats=None,
            previous_mask_preds=None,
            previous_x_feats=None,
        ):
        """Run all stages, linking previous-frame inputs into the last one.

        The ``previous_*`` inputs reach the final stage's mask head only;
        earlier stages receive None. Class scores are converted to
        probabilities with a sigmoid, or with a softmax whose final column
        is dropped, depending on the last head's ``loss_cls.use_sigmoid``.

        Returns:
            tuple: ``(object_feats, cls_score, mask_preds,
            scaled_mask_preds)`` of the last stage.
        """
        object_feats = proposal_feats
        for stage in range(self.num_stages):
            is_last_stage = stage == self.num_stages - 1
            stage_out = self._mask_forward(
                stage, x, object_feats, mask_preds, img_metas,
                previous_obj_feats=(
                    previous_obj_feats if is_last_stage else None),
                previous_mask_preds=(
                    previous_mask_preds if is_last_stage else None),
                previous_x_feats=(
                    previous_x_feats if is_last_stage else None))
            object_feats = stage_out['object_feats']
            cls_score = stage_out['cls_score']
            mask_preds = stage_out['mask_preds']
            scaled_mask_preds = stage_out['scaled_mask_preds']

        use_sigmoid = self.mask_head[-1].loss_cls.use_sigmoid
        cls_score = (cls_score.sigmoid()
                     if use_sigmoid else cls_score.softmax(-1)[..., :-1])

        return object_feats, cls_score, mask_preds, scaled_mask_preds

    def get_masked_feature(self, x, mask_pred):
        """Sum the features under each binarised mask.

        ``mask_pred`` holds mask logits indexed ``(b, n, h, w)`` and ``x``
        features indexed ``(b, c, h, w)``. A pixel counts as foreground when
        its sigmoid probability is above 0.5; the features of all foreground
        pixels are summed per mask.

        Returns:
            Tensor: Summed features indexed ``(b, n, c)``.
        """
        binary_masks = (mask_pred.sigmoid() > 0.5).float()
        return torch.einsum('bnhw,bchw->bnc', binary_masks, x)

    def aug_test(self, features, proposal_list, img_metas, rescale=False):
        """Test-time augmentation entry point; always raises.

        Raises:
            NotImplementedError: On every call.
        """
        message = 'SparseMask does not support `aug_test`'
        raise NotImplementedError(message)

    def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
        """Dummy forward function when do the flops computing.

        Initial mask predictions are the batched product of
        ``proposal_feats`` with the spatially flattened ``x``. Every stage
        is then run on those same initial inputs (stage outputs are not fed
        forward) and all stage results are returned as a list.
        ``proposal_boxes`` is not used.
        """
        num_imgs = len(img_metas)
        num_proposals = proposal_feats.size(1)
        C, H, W = x.shape[-3:]
        flat_x = x.view(num_imgs, C, -1)
        mask_preds = proposal_feats.bmm(flat_x).view(
            num_imgs, num_proposals, H, W)
        object_feats = proposal_feats
        return [
            self._mask_forward(stage, x, object_feats, mask_preds, img_metas)
            for stage in range(self.num_stages)
        ]

    def get_panoptic(self, cls_scores, mask_preds, test_cfg, img_meta, obj_feat=None):
        """Turn one image's predictions into instance and panoptic results.

        The first ``num_proposals`` rows of the inputs are treated as thing
        predictions and the remaining rows as stuff predictions. Things are
        reduced to the ``self.test_cfg.max_per_img`` best (proposal, class)
        pairs. Stuff scores are the diagonal of the stuff rows' stuff-class
        columns, sorted in descending order. Unless ``merge_joint`` is set,
        all rescaled masks are thresholded at ``test_cfg.mask_thr`` before
        merging.

        Args:
            cls_scores: Per-proposal class scores of one image.
            mask_preds: Per-proposal mask predictions of one image.
            test_cfg: Provides ``mask_thr`` and ``merge_stuff_thing``.
            img_meta: Forwarded to the last head's ``rescale_masks``.
            obj_feat: Per-proposal object features; it is indexed
                unconditionally, so it must not be left at None.

        Returns:
            tuple: ``(bbox_result, segm_result, thing_mask_preds,
            panoptic_result, thing_obj_feat)``.
        """
        split_at = self.num_proposals
        num_things = self.num_thing_classes
        last_head = self.mask_head[-1]

        # ---- things: top-k over all (proposal, class) pairs ----
        thing_scores = cls_scores[:split_at][:, :num_things]
        thing_scores, topk_indices = thing_scores.flatten(0, 1).topk(
            self.test_cfg.max_per_img, sorted=True)
        mask_indices = topk_indices // num_things
        thing_labels = topk_indices % num_things
        thing_masks = last_head.rescale_masks(
            mask_preds[:split_at][mask_indices], img_meta)
        thing_obj_feat = obj_feat[:split_at][mask_indices]

        if not self.merge_joint:
            thing_masks = thing_masks > test_cfg.mask_thr
        bbox_result, segm_result, thing_mask_preds = last_head.segm2result(
            thing_masks, thing_labels, thing_scores)

        # ---- stuff: one score per stuff row, taken from the diagonal ----
        stuff_scores = cls_scores[split_at:][:, num_things:].diag()
        stuff_scores, stuff_inds = torch.sort(stuff_scores, descending=True)
        stuff_masks = last_head.rescale_masks(
            mask_preds[split_at:][stuff_inds], img_meta)
        stuff_obj_feat = obj_feat[split_at:][stuff_inds]

        # ---- merge ----
        if self.merge_joint:
            stuff_labels = stuff_inds + num_things
            merge_fn = self.merge_stuff_thing_stuff_joint
        else:
            stuff_masks = stuff_masks > test_cfg.mask_thr
            stuff_labels = stuff_inds + 1
            merge_fn = self.merge_stuff_thing_thing_first
        panoptic_result, thing_obj_feat = merge_fn(
            thing_masks, thing_labels, thing_scores,
            stuff_masks, stuff_labels, stuff_scores,
            test_cfg.merge_stuff_thing,
            thing_obj_feat, stuff_obj_feat)

        return bbox_result, segm_result, thing_mask_preds,  panoptic_result, thing_obj_feat

    def split_thing_stuff(self, mask_preds, det_labels, cls_scores):
        """Split per-proposal predictions into thing and stuff parts.

        The first ``num_proposals`` entries are the thing part, the rest the
        stuff part. Stuff labels are shifted by ``-num_thing_classes + 1``.

        Returns:
            tuple: ``(thing_masks, thing_labels, thing_scores, stuff_masks,
            stuff_labels, stuff_scores)``.
        """
        boundary = self.num_proposals
        thing_part = (mask_preds[:boundary], det_labels[:boundary],
                      cls_scores[:boundary])
        shifted_stuff_labels = \
            det_labels[boundary:] - self.num_thing_classes + 1
        stuff_part = (mask_preds[boundary:], shifted_stuff_labels,
                      cls_scores[boundary:])
        return thing_part + stuff_part

    def merge_stuff_thing_thing_first(self,
                          thing_masks,
                          thing_labels,
                          thing_scores,
                          stuff_masks,
                          stuff_labels,
                          stuff_scores,
                          merge_cfg=None,
                          thing_obj_feat=None,
                          stuff_obj_feat=None):
        """Build a panoptic map by pasting things first, then stuff.

        Thing masks are pasted in descending score order. Pasting stops at
        the first instance scoring below ``merge_cfg.instance_score_thr``.
        An instance is skipped when its mask is empty, when the fraction of
        it already covered exceeds ``merge_cfg.iou_thr``, or when nothing
        remains after removing the covered pixels. Stuff classes are then
        pasted, in descending score order, into the still-empty pixels;
        a class whose remaining area is below ``merge_cfg.stuff_max_area``
        is dropped.

        Args:
            thing_masks: Per-instance masks; the last two dims are (H, W).
            thing_labels: Per-instance class labels.
            thing_scores: Per-instance scores.
            stuff_masks: Per-entry stuff masks; the last two dims are (H, W).
            stuff_labels: Per-entry stuff labels.
            stuff_scores: Per-entry stuff scores.
            merge_cfg: Provides ``instance_score_thr``, ``iou_thr`` and
                ``stuff_max_area``.
            thing_obj_feat: Optional per-instance features, indexed like
                ``thing_masks``.
            stuff_obj_feat: Accepted but not used by this method.

        Returns:
            tuple: ``((panoptic_seg, segments_info), kept_obj_feat)``.
            ``panoptic_seg`` is an (H, W) numpy array of segment ids with 0
            for unassigned pixels. ``kept_obj_feat`` holds the features of
            the pasted instances, in pasting order and aligned with the
            ``instance_id`` entries of ``segments_info``; it is None when
            ``thing_obj_feat`` is None.
        """

        H, W = thing_masks.shape[-2:]
        panoptic_seg = thing_masks.new_zeros((H, W), dtype=torch.int32)
        thing_masks = thing_masks.to(
            dtype=torch.bool, device=panoptic_seg.device)
        stuff_masks = stuff_masks.to(
            dtype=torch.bool, device=panoptic_seg.device)

        # Sort instance outputs by scores. The features are deliberately
        # left in their original order: `instance_ids` below stores
        # original instance indices, so reordering the features first would
        # pair pasted instances with other instances' features whenever the
        # scores are not already sorted.
        sorted_inds = torch.argsort(-thing_scores)
        current_segment_id = 0
        segments_info = []
        instance_ids = []

        # Add instances one-by-one, check for overlaps with existing ones
        for inst_id in sorted_inds:
            score = thing_scores[inst_id].item()
            if score < merge_cfg.instance_score_thr:
                break
            mask = thing_masks[inst_id]  # H,W
            mask_area = mask.sum().item()

            if mask_area == 0:
                continue

            intersect = (mask > 0) & (panoptic_seg > 0)
            intersect_area = intersect.sum().item()

            if intersect_area * 1.0 / mask_area > merge_cfg.iou_thr:
                continue

            if intersect_area > 0:
                mask = mask & (panoptic_seg == 0)

            mask_area = mask.sum().item()
            if mask_area == 0:
                continue

            current_segment_id += 1
            panoptic_seg[mask.bool()] = current_segment_id
            segments_info.append({
                'id': current_segment_id,
                'isthing': True,
                'score': score,
                'category_id': thing_labels[inst_id].item(),
                'instance_id': inst_id.item(),
            })
            instance_ids.append(inst_id.item())

        # Add semantic results to remaining empty areas
        sorted_inds = torch.argsort(-stuff_scores)
        sorted_stuff_labels = stuff_labels[sorted_inds]
        # paste semantic masks following the order of scores
        processed_label = []
        for semantic_label in sorted_stuff_labels:
            semantic_label = semantic_label.item()
            if semantic_label in processed_label:
                continue
            processed_label.append(semantic_label)
            sem_inds = stuff_labels == semantic_label
            sem_masks = stuff_masks[sem_inds].sum(0).bool()
            mask = sem_masks & (panoptic_seg == 0)
            mask_area = mask.sum().item()
            if mask_area < merge_cfg.stuff_max_area:
                continue

            current_segment_id += 1
            panoptic_seg[mask] = current_segment_id
            segments_info.append({
                'id': current_segment_id,
                'isthing': False,
                'category_id': semantic_label,
                'area': mask_area,
            })

        # The parameter defaults to None, so tolerate missing features.
        if thing_obj_feat is None:
            kept_obj_feat = None
        else:
            kept_obj_feat = thing_obj_feat[instance_ids]
        return (panoptic_seg.cpu().numpy(), segments_info), kept_obj_feat

    def merge_stuff_thing_stuff_first(self,
                                      thing_masks,
                                      thing_labels,
                                      thing_scores,
                                      stuff_masks,
                                      stuff_labels,
                                      stuff_scores,
                                      merge_cfg=None,
                                      thing_obj_feat=None,
                                      stuff_obj_feat=None):
        """Merge stuff and thing masks into one panoptic map, stuff first.

        Stuff classes are pasted in descending score order onto the still
        empty pixels; afterwards thing instances are pasted one by one in
        descending score order onto whatever pixels remain free.

        Args:
            thing_masks (Tensor): Instance masks whose last two dims are
                (H, W). They are cast to ``bool`` before use.
            thing_labels (Tensor): Category of every thing mask.
            thing_scores (Tensor): Score of every thing mask.
            stuff_masks (Tensor): Stuff masks, cast to ``bool`` before use.
            stuff_labels (Tensor): Category of every stuff mask.
            stuff_scores (Tensor): Score of every stuff mask.
            merge_cfg: Object exposing ``stuff_max_area``,
                ``instance_score_thr`` and ``iou_thr``. Required in
                practice although the signature defaults it to ``None``.
            thing_obj_feat (Tensor | None): Per-instance features, indexed
                the same way as ``thing_masks``.
            stuff_obj_feat: Unused.

        Returns:
            tuple: ``((panoptic_seg, segments_info), kept_feats)`` where
            ``panoptic_seg`` is an int32 ``numpy`` array of shape (H, W)
            (0 means unassigned), ``segments_info`` is a list of dicts and
            ``kept_feats`` holds the rows of ``thing_obj_feat`` that belong
            to the kept instances, in the order they were pasted (``None``
            when ``thing_obj_feat`` is ``None``).
        """
        H, W = thing_masks.shape[-2:]
        panoptic_seg = thing_masks.new_zeros((H, W), dtype=torch.int32)
        thing_masks = thing_masks.to(
            dtype=torch.bool, device=panoptic_seg.device)
        stuff_masks = stuff_masks.to(
            dtype=torch.bool, device=panoptic_seg.device)

        current_segment_id = 0
        segments_info = []

        # Paste semantic (stuff) masks first, best scoring class first.
        sorted_stuff_labels = stuff_labels[torch.argsort(-stuff_scores)]
        processed_labels = set()
        for semantic_label in sorted_stuff_labels.tolist():
            if semantic_label in processed_labels:
                continue
            processed_labels.add(semantic_label)
            # All masks predicted for this class form one stuff segment.
            sem_inds = stuff_labels == semantic_label
            sem_masks = stuff_masks[sem_inds].sum(0).bool()
            mask = sem_masks & (panoptic_seg == 0)
            mask_area = mask.sum().item()
            # Despite its name, ``stuff_max_area`` acts as a lower bound:
            # smaller segments are dropped.
            if mask_area < merge_cfg.stuff_max_area:
                continue

            current_segment_id += 1
            panoptic_seg[mask] = current_segment_id
            segments_info.append({
                'id': current_segment_id,
                'isthing': False,
                'category_id': semantic_label,
                'area': mask_area,
            })

        # Add instances one-by-one in descending score order and check for
        # overlaps with what is already pasted.
        sorted_inds = torch.argsort(-thing_scores)
        instance_ids = []
        for inst_id in sorted_inds:
            score = thing_scores[inst_id].item()
            if score < merge_cfg.instance_score_thr:
                # Scores are visited in descending order: nothing left.
                break
            mask = thing_masks[inst_id]  # H,W
            mask_area = mask.sum().item()
            if mask_area == 0:
                continue

            intersect_area = (mask & (panoptic_seg > 0)).sum().item()
            if intersect_area * 1.0 / mask_area > merge_cfg.iou_thr:
                continue

            if intersect_area > 0:
                mask = mask & (panoptic_seg == 0)

            mask_area = mask.sum().item()
            if mask_area == 0:
                continue

            current_segment_id += 1
            panoptic_seg[mask] = current_segment_id
            segments_info.append({
                'id': current_segment_id,
                'isthing': True,
                'score': score,
                'category_id': thing_labels[inst_id].item(),
                'instance_id': inst_id.item(),
            })
            instance_ids.append(inst_id.item())

        # ``instance_ids`` are indices into the *original* ordering, so the
        # features must be gathered from the un-permuted tensor; reordering
        # them by ``sorted_inds`` first would pair instances with the wrong
        # feature rows whenever the scores are not already sorted.
        kept_feats = None
        if thing_obj_feat is not None:
            kept_feats = thing_obj_feat[instance_ids]

        return (panoptic_seg.cpu().numpy(), segments_info), kept_feats

    def merge_stuff_thing_stuff_joint(self,
                                      thing_masks,
                                      thing_labels,
                                      thing_scores,
                                      stuff_masks,
                                      stuff_labels,
                                      stuff_scores,
                                      merge_cfg=None,
                                      thing_obj=None,
                                      stuff_obj=None
                                      ):
        """Merge thing and stuff predictions jointly into a panoptic map.

        Things and stuff are concatenated, every pixel is given to the
        prediction with the largest ``score * mask`` value, and predictions
        are then turned into segments in descending score order.

        Args:
            thing_masks (Tensor): Thing masks, last two dims are (H, W).
            thing_labels (Tensor): Label of every thing mask.
            thing_scores (Tensor): Score of every thing mask.
            stuff_masks (Tensor): Stuff masks.
            stuff_labels (Tensor): Label of every stuff mask.
            stuff_scores (Tensor): Score of every stuff mask.
            merge_cfg: Object exposing ``instance_score_thr`` and
                ``overlap_thr``.
            thing_obj (Tensor): Per-thing features.
            stuff_obj (Tensor): Per-stuff features.

        Returns:
            tuple: ``((panoptic_seg, segments_info), thing_feats)`` with
            ``panoptic_seg`` an int32 ``numpy`` array of shape (H, W),
            ``segments_info`` a list of dicts and ``thing_feats`` the
            feature rows of the kept thing segments.
        """
        height, width = thing_masks.shape[-2:]
        panoptic_seg = thing_masks.new_zeros(
            (height, width), dtype=torch.int32)

        # Treat things and stuff as one pool of predictions.
        all_masks = torch.cat([thing_masks, stuff_masks], dim=0)
        all_scores = torch.cat([thing_scores, stuff_scores], dim=0)
        all_labels = torch.cat([thing_labels, stuff_labels], dim=0)
        all_obj_feats = torch.cat([thing_obj, stuff_obj], dim=0)

        # Index of the winning prediction at every pixel.
        pixel_owner = (all_scores.view(-1, 1, 1) * all_masks).argmax(0)

        segments_info = []
        kept_thing_ids = []
        next_segment_id = 0
        for query_idx in torch.argsort(-all_scores):
            category = all_labels[query_idx].item()
            isthing = category < self.num_thing_classes
            if isthing and all_scores[query_idx] < merge_cfg.instance_score_thr:
                continue

            owned = pixel_owner == query_idx
            owned_area = owned.sum().item()
            confident_area = (all_masks[query_idx] >= 0.5).sum().item()
            if owned_area <= 0 or confident_area <= 0:
                continue
            # Drop predictions that lost most of their mask to others.
            if owned_area / confident_area < merge_cfg.overlap_thr:
                continue

            next_segment_id += 1
            panoptic_seg[owned] = next_segment_id

            if isthing:
                entry = {
                    'id': next_segment_id,
                    'isthing': isthing,
                    'score': all_scores[query_idx].item(),
                    'category_id': category,  # 0, num_thing - 1
                    'instance_id': query_idx.item(),
                }
                kept_thing_ids.append(query_idx.item())
            else:
                entry = {
                    'id': next_segment_id,
                    'isthing': isthing,
                    # 1, num_stuff
                    'category_id': category - self.num_thing_classes + 1,
                    'area': owned_area,
                }
            segments_info.append(entry)

        panoptic_result = (panoptic_seg.cpu().numpy(), segments_info)
        return panoptic_result, all_obj_feats[kept_thing_ids]

================================================
FILE: knet/video/kernel_update_head.py
================================================
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (ConvModule, bias_init_with_prob,
                      build_activation_layer, build_norm_layer)
from mmcv.runner import force_fp32
from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.dense_heads.atss_head import reduce_mean
from mmdet.models.losses import accuracy
from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention, build_transformer_layer
from mmdet.utils import get_root_logger
from unitrack.mask import mask2box, tensor_mask2box


@HEADS.register_module()
class VideoKernelUpdateHead(nn.Module):

    def __init__(self,
                 num_classes=80,
                 num_ffn_fcs=2,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_mask_fcs=3,
                 feedforward_channels=2048,
                 in_channels=256,
                 out_channels=256,
                 dropout=0.0,
                 mask_thr=0.5,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_act_cfg=dict(type='ReLU', inplace=True),
                 conv_kernel_size=3,
                 feat_transform_cfg=None,
                 hard_mask_thr=0.5,
                 kernel_init=False,
                 with_ffn=True,
                 mask_out_stride=4,
                 relative_coors=False,
                 relative_coors_off=False,
                 feat_gather_stride=1,
                 mask_transform_stride=1,
                 mask_upsample_stride=1,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 previous=None,
                 previous_x_feat=None,
                 previous_link=None,  # seg/cls embeddings
                 previous_type=None,  # tracking embeddings
                 previous_detach=False,
                 previous_detach_link=False,  # whether to detach the link query
                 previous_link_detach=False,
                 kernel_updator_cfg=dict(
                     type='DynamicConv',
                     in_channels=256,
                     feat_channels=64,
                     out_channels=256,
                     input_feat_shape=1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN')),
                 loss_rank=None,
                 loss_mask=dict(
                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
                 loss_dice=dict(type='DiceLoss', loss_weight=3.0),
                 loss_cls=dict(
                     type='FocalLoss',
                     use_sigmoid=True,
                     gamma=2.0,
                     alpha=0.25,
                     loss_weight=2.0)):
        """Build the kernel update head and its optional temporal branches.

        Args:
            num_classes (int): Size of the classification output; one extra
                logit is added when ``loss_cls`` is not sigmoid based.
            num_ffn_fcs (int): Number of FC layers passed to every ``FFN``.
            num_heads (int): Heads of the main kernel self-attention.
            num_cls_fcs (int): Number of Linear-LN-activation triplets in
                the classification branch.
            num_mask_fcs (int): Number of Linear-LN-activation triplets in
                the mask branch.
            feedforward_channels (int): Hidden width passed to every
                ``FFN``.
            in_channels (int): Channel width ``C`` of the kernels.
            out_channels (int): Output width of ``fc_mask``.
            dropout (float): Dropout of the main attention and of all FFNs.
            act_cfg (dict): Activation config for the cls/mask branches.
            ffn_act_cfg (dict): Activation config passed to every ``FFN``.
            conv_kernel_size (int): Kernel size ``K``; attention layers work
                on embeddings of size ``C * K ** 2``.
            feat_transform_cfg (dict | None): Extra kwargs for the optional
                ``ConvModule`` applied to the feature map. A
                ``kernel_size`` entry (default 1) is extracted from it; the
                caller's dict is left untouched.
            feat_gather_stride (int): Stride of that ``ConvModule``.
            previous: When not ``None`` the temporal modules selected by
                ``previous_type`` / ``previous_link`` are created.
            previous_type (str | None): ``"ffn"``, ``"update"`` or
                ``"update_obj"`` selects the tracking-embedding modules.
            previous_link (str | None): ``"update_dynamic_cov"`` or
                ``"link_atten"`` selects the seg/cls link modules.
            kernel_updator_cfg (dict): Config handed to
                ``build_transformer_layer`` for every kernel updator.
            loss_rank, loss_mask, loss_dice, loss_cls (dict | None): Loss
                configs handed to ``build_loss``.

        The remaining arguments are stored on the instance under the same
        name without further processing here.
        """
        super(VideoKernelUpdateHead, self).__init__()
        self.num_classes = num_classes
        self.loss_cls = build_loss(loss_cls)
        self.loss_mask = build_loss(loss_mask)
        self.loss_dice = build_loss(loss_dice)
        self.loss_rank = (
            build_loss(loss_rank) if loss_rank is not None else None)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.mask_thr = mask_thr
        self.fp16_enabled = False
        self.dropout = dropout

        self.num_heads = num_heads
        self.hard_mask_thr = hard_mask_thr
        self.kernel_init = kernel_init
        self.with_ffn = with_ffn
        self.mask_out_stride = mask_out_stride
        self.relative_coors = relative_coors
        self.relative_coors_off = relative_coors_off
        self.conv_kernel_size = conv_kernel_size
        self.feat_gather_stride = feat_gather_stride
        self.mask_transform_stride = mask_transform_stride
        self.mask_upsample_stride = mask_upsample_stride

        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg

        # Flattened kernel size used by every attention layer of this head.
        embed_dims = in_channels * conv_kernel_size ** 2

        self.attention = MultiheadAttention(embed_dims, num_heads, dropout)
        self.attention_norm = build_norm_layer(dict(type='LN'), embed_dims)[1]

        self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)

        if feat_transform_cfg is not None:
            # Work on a copy: popping from the caller's dict would silently
            # drop ``kernel_size`` for any other head built from the same
            # config object.
            feat_transform_cfg = dict(feat_transform_cfg)
            kernel_size = feat_transform_cfg.pop('kernel_size', 1)
            self.feat_transform = ConvModule(
                in_channels,
                in_channels,
                kernel_size,
                stride=feat_gather_stride,
                padding=int(feat_gather_stride // 2),
                **feat_transform_cfg)
        else:
            self.feat_transform = None

        if self.with_ffn:
            self.ffn = FFN(
                in_channels,
                feedforward_channels,
                num_ffn_fcs,
                act_cfg=ffn_act_cfg,
                dropout=dropout)
            self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

        self.cls_fcs = nn.ModuleList()
        for _ in range(num_cls_fcs):
            self.cls_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            self.cls_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.cls_fcs.append(build_activation_layer(act_cfg))

        # Softmax-style losses need an extra background logit.
        if self.loss_cls.use_sigmoid:
            self.fc_cls = nn.Linear(in_channels, self.num_classes)
        else:
            self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)

        self.mask_fcs = nn.ModuleList()
        for _ in range(num_mask_fcs):
            self.mask_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            self.mask_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.mask_fcs.append(build_activation_layer(act_cfg))

        self.fc_mask = nn.Linear(in_channels, out_channels)

        self.previous = previous
        self.previous_type = previous_type
        self.previous_link = previous_link
        self.previous_x_feat = previous_x_feat
        self.previous_detach = previous_detach
        self.previous_detach_link = previous_detach_link
        self.previous_link_detach = previous_link_detach

        if self.previous is not None:
            # NOTE: the temporal attention layers use a fixed head count and
            # no dropout, independent of ``num_heads`` / ``dropout``.
            # Sub-modules are created in a fixed order so that parameter
            # registration (and random initialisation) stays reproducible.
            prev_num_heads = 8
            prev_dropout = 0.

            # tracking embedding
            if self.previous_type == "ffn":
                self.attention_previous = MultiheadAttention(
                    embed_dims, prev_num_heads, prev_dropout)
                self.attention_previous_norm = build_norm_layer(
                    dict(type='LN'), embed_dims)[1]
                # add link ffn
                self.link_ffn = FFN(
                    in_channels,
                    feedforward_channels,
                    num_ffn_fcs,
                    act_cfg=ffn_act_cfg,
                    dropout=dropout)
                self.link_ffn_norm = build_norm_layer(
                    dict(type='LN'), in_channels)[1]

            elif self.previous_type in ("update", "update_obj"):
                self.attention_previous_update_track = \
                    build_transformer_layer(kernel_updator_cfg)
                self.attention_previous_track = MultiheadAttention(
                    embed_dims, prev_num_heads, prev_dropout)
                self.attention_previous_norm_track = build_norm_layer(
                    dict(type='LN'), embed_dims)[1]
                # add link ffn
                self.link_ffn_track = FFN(
                    in_channels,
                    feedforward_channels,
                    num_ffn_fcs,
                    act_cfg=ffn_act_cfg,
                    dropout=dropout)
                self.link_ffn_norm_track = build_norm_layer(
                    dict(type='LN'), in_channels)[1]

            # seg and cls embedding link
            if self.previous_link in ("update_dynamic_cov", "link_atten"):
                if self.previous_link == "update_dynamic_cov":
                    # Only this variant first adapts the previous kernels
                    # with a dynamic kernel updator.
                    self.attention_previous_update_link = \
                        build_transformer_layer(kernel_updator_cfg)
                self.attention_previous_link = MultiheadAttention(
                    embed_dims, prev_num_heads, prev_dropout)
                self.attention_previous_norm_link = build_norm_layer(
                    dict(type='LN'), embed_dims)[1]
                # add link ffn
                self.link_ffn_link = FFN(
                    in_channels,
                    feedforward_channels,
                    num_ffn_fcs,
                    act_cfg=ffn_act_cfg,
                    dropout=dropout)
                self.link_ffn_norm_link = build_norm_layer(
                    dict(type='LN'), in_channels)[1]

    def init_weights(self):
        """Initialise the head's parameters.

        Every parameter with more than one dimension gets Xavier-uniform
        initialisation; one-dimensional parameters (e.g. layer-norm weights
        and biases) keep their default values. With a sigmoid-based
        classification loss the classifier bias is set from a prior
        probability of 0.01, and with ``kernel_init`` enabled the mask
        kernel weights are redrawn from a normal distribution (std 0.01).
        """
        multi_dim_params = (
            param for param in self.parameters() if param.dim() > 1)
        for weight in multi_dim_params:
            nn.init.xavier_uniform_(weight)

        if self.loss_cls.use_sigmoid:
            nn.init.constant_(self.fc_cls.bias, bias_init_with_prob(0.01))

        if not self.kernel_init:
            return

        get_root_logger().info(
            'mask kernel in mask head is normal initialized by std 0.01')
        nn.init.normal_(self.fc_mask.weight, mean=0, std=0.01)

    def forward(self,
                x,
                proposal_feat,
                mask_preds,
                prev_cls_score=None,
                mask_shape=None,
                img_metas=None,
                previous_obj_feats=None,
                previous_mask_preds=None,
                previous_x_feats=None
                ):
        """Update the kernels with the current features and predict masks.

        Args:
            x (Tensor): Feature map of shape (B, C, H, W).
            proposal_feat (Tensor): Current kernels, reshaped internally as
                (B, N, C, K*K).
            mask_preds (Tensor): Mask logits of shape (B, N, h, w); resized
                to the feature resolution when the sizes differ.
            prev_cls_score: Unused.
            mask_shape (tuple | None): If given and its height differs from
                the predicted mask height, the new masks are resized to it.
            img_metas: Unused.
            previous_obj_feats (Tensor | None): Kernels of the previous
                frame, same layout as ``proposal_feat``. Enables the
                temporal link / tracking branches configured on the head.
            previous_mask_preds: Unused.
            previous_x_feats (Tensor | None): Previous feature map; only
                passed through ``feat_transform`` when that is configured.

        Returns:
            tuple: ``(cls_score, new_mask_preds, obj_feat, x_feat,
            track_feat)`` where ``obj_feat`` has shape (B, N, C, K, K),
            ``x_feat`` has shape (B, N, C) and ``track_feat`` is either
            ``None`` or a tensor shaped like ``obj_feat``.
        """
        N, num_proposals = proposal_feat.shape[:2]
        if self.feat_transform is not None:
            x = self.feat_transform(x)
            if previous_x_feats is not None:
                # Kept for parity with the original behaviour; the result
                # is not consumed further down in this method.
                previous_x_feats = self.feat_transform(previous_x_feats)
        C, H, W = x.shape[-3:]
        kernel_dims = self.in_channels * self.conv_kernel_size ** 2

        mask_h, mask_w = mask_preds.shape[-2:]
        if mask_h != H or mask_w != W:
            gather_mask = F.interpolate(
                mask_preds, (H, W), align_corners=False, mode='bilinear')
        else:
            gather_mask = mask_preds

        # Binarise the masks and pool the features inside every mask.
        hard_masks = (gather_mask.sigmoid() > self.hard_mask_thr).float()
        # einsum is faster than bmm by 30%
        x_feat = torch.einsum('bnhw,bchw->bnc', hard_masks, x)

        # obj_feat in shape [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C]
        proposal_feat = proposal_feat.reshape(N, num_proposals,
                                              self.in_channels,
                                              -1).permute(0, 1, 3, 2)

        # Optionally stop gradients into the previous frame. The None check
        # matters: the first frame has no previous kernels to detach.
        if (self.training and self.previous_detach
                and previous_obj_feats is not None):
            previous_obj_feats = previous_obj_feats.detach()

        # Link the current kernels to the previous ones (seg/cls embedding)
        # before they are updated with the current features.
        if previous_obj_feats is not None and self.previous_link in (
                "update_dynamic_cov", "link_atten"):
            prev_link = previous_obj_feats.reshape(
                N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
            if self.previous_link == "update_dynamic_cov":
                if self.training and self.previous_detach_link:
                    prev_link = prev_link.detach()
                # Adapt the previous kernels to the current features first.
                prev_link = self.attention_previous_update_link(
                    x_feat, prev_link)

            # [B, N, K*K, C] -> [N, B, K*K*C] for the attention layer
            prev_link = prev_link.reshape(
                N, num_proposals, -1).permute(1, 0, 2)
            cur_obj_feat = proposal_feat.reshape(
                N, num_proposals, kernel_dims).permute(1, 0, 2)
            cur_obj_feat = self.attention_previous_norm_link(
                self.attention_previous_link(
                    query=cur_obj_feat,
                    key=prev_link,
                    value=prev_link,
                    identity=cur_obj_feat
                ),
            )
            # [N, B, K*K*C] -> [B, N, K*K, C]
            cur_obj_feat = cur_obj_feat.permute(1, 0, 2).reshape(
                N, num_proposals, -1, self.in_channels)
            proposal_feat = self.link_ffn_norm_link(
                self.link_ffn_link(cur_obj_feat))

        # update current
        obj_feat = self.kernel_update_conv(x_feat, proposal_feat)

        # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C]
        obj_feat = obj_feat.reshape(N, num_proposals,
                                    -1).permute(1, 0, 2)
        obj_feat = self.attention_norm(self.attention(obj_feat))
        # [N, B, K*K*C] -> [B, N, K*K*C]
        obj_feat = obj_feat.permute(1, 0, 2)

        # obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
        obj_feat = obj_feat.reshape(N, num_proposals, -1, self.in_channels)

        # FFN
        if self.with_ffn:
            obj_feat = self.ffn_norm(self.ffn(obj_feat))

        # Tracking part: relate the updated kernels to the previous ones.
        previous_obj_feat_track = None
        if previous_obj_feats is not None:
            if self.previous_type == "ffn":
                # (B, N, C, K, K) -> (B, N, K*K, C) -> (N, B, K*K*C)
                prev_track = previous_obj_feats.reshape(
                    N, num_proposals, self.in_channels,
                    -1).permute(0, 1, 3, 2)
                prev_track = prev_track.reshape(
                    N, num_proposals, kernel_dims).permute(1, 0, 2)
                cur_obj_feat = obj_feat.reshape(
                    N, num_proposals, kernel_dims).permute(1, 0, 2)

                track = self.attention_previous_norm(
                    self.attention_previous(
                        query=cur_obj_feat,
                        key=prev_track,
                        value=prev_track,
                        identity=cur_obj_feat
                    ),
                )
                # [N, B, K*K*C] -> [B, N, K*K, C]
                track = track.permute(1, 0, 2).reshape(
                    N, num_proposals, -1, self.in_channels)
                previous_obj_feat_track = self.link_ffn_norm(
                    self.link_ffn(track))

            elif self.previous_type in ("update", "update_obj"):
                # NOTE: both variants were marked "not work" by the original
                # authors. They differ only in what drives the kernel
                # updator: pooled features vs. the updated kernels.
                if self.previous_type == "update":
                    update_input = x_feat
                else:
                    update_input = obj_feat.squeeze(2)

                prev_track = previous_obj_feats.reshape(
                    N, num_proposals, self.in_channels,
                    -1).permute(0, 1, 3, 2)
                track = self.attention_previous_update_track(
                    update_input, prev_track)

                track = track.reshape(
                    N, num_proposals, self.in_channels,
                    -1).permute(0, 1, 3, 2)
                cur_obj_feat = obj_feat.reshape(
                    N, num_proposals, kernel_dims).permute(1, 0, 2)
                track = track.reshape(
                    N, num_proposals, kernel_dims).permute(1, 0, 2)

                track = self.attention_previous_norm_track(
                    self.attention_previous_track(
                        query=cur_obj_feat,
                        key=track,
                        value=track,
                        identity=cur_obj_feat
                    ),
                )
                # [N, B, K*K*C] -> [B, N, K*K, C]
                track = track.permute(1, 0, 2).reshape(
                    N, num_proposals, -1, self.in_channels)
                previous_obj_feat_track = self.link_ffn_norm_track(
                    self.link_ffn_track(track))

        cls_feat = obj_feat.sum(-2)
        mask_feat = obj_feat

        for cls_layer in self.cls_fcs:
            cls_feat = cls_layer(cls_feat)
        for reg_layer in self.mask_fcs:
            mask_feat = reg_layer(mask_feat)

        cls_score = self.fc_cls(cls_feat).view(N, num_proposals, -1)
        # [B, N, K*K, C] -> [B, N, C, K*K]
        mask_feat = self.fc_mask(mask_feat).permute(0, 1, 3, 2)

        if (self.mask_transform_stride == 2
                and self.feat_gather_stride == 1):
            mask_x = F.interpolate(
                x, scale_factor=0.5, mode='bilinear', align_corners=False)
            H, W = mask_x.shape[-2:]
        else:
            mask_x = x

        # The kernels are applied with one conv2d per image; per the
        # original authors this is about 5x faster than F.unfold + einsum
        # and uses about 1/5 of the memory.
        # [B, N, C, K*K] -> [B, N, C, K, K]
        mask_feat = mask_feat.reshape(N, num_proposals, C,
                                      self.conv_kernel_size,
                                      self.conv_kernel_size)
        new_mask_preds = [
            F.conv2d(
                mask_x[i:i + 1],
                mask_feat[i],
                padding=int(self.conv_kernel_size // 2))
            for i in range(N)
        ]

        new_mask_preds = torch.cat(new_mask_preds, dim=0)
        new_mask_preds = new_mask_preds.reshape(N, num_proposals, H, W)
        if self.mask_transform_stride == 2:
            new_mask_preds = F.interpolate(
                new_mask_preds,
                scale_factor=2,
                mode='bilinear',
                align_corners=False)

        if mask_shape is not None and mask_shape[0] != H:
            new_mask_preds = F.interpolate(
                new_mask_preds,
                mask_shape,
                align_corners=False,
                mode='bilinear')

        # [B, N, K*K, C] -> [B, N, C, K, K]
        kernel_shape = (N, num_proposals, self.in_channels,
                        self.conv_kernel_size, self.conv_kernel_size)
        obj_feat_out = obj_feat.permute(0, 1, 3, 2).reshape(*kernel_shape)
        if previous_obj_feat_track is not None:
            previous_obj_feat_track = previous_obj_feat_track.permute(
                0, 1, 3, 2).reshape(*kernel_shape)

        return (cls_score, new_mask_preds, obj_feat_out, x_feat,
                previous_obj_feat_track)

    @force_fp32(apply_to=('cls_score', 'mask_pred'))
    def loss(self,
             object_feats,
             cls_score,
             mask_pred,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             imgs_whwh=None,
             reduction_override=None,
             **kwargs):
        """Compute classification, mask, dice and optional rank losses.

        Args:
            object_feats: Unused.
            cls_score (Tensor | None): Class logits whose first two dims are
                (batch, num_proposals).
            mask_pred (Tensor | None): Mask logits of shape
                (batch, num_proposals, H, W).
            labels (Tensor): Flattened label of every prediction; values in
                ``[0, num_classes)`` mark positives.
            label_weights (Tensor): Weights forwarded to ``loss_cls``.
            mask_targets (Tensor): Flattened mask targets, one per
                prediction.
            mask_weights: Unused.
            imgs_whwh: Unused.
            reduction_override: Forwarded to ``loss_cls``.

        Returns:
            dict: May contain ``loss_cls``, ``pos_acc``, ``loss_mask``,
            ``loss_dice`` and ``loss_rank``, depending on which inputs are
            given and on whether a rank loss is configured.
        """
        losses = dict()
        bg_class_ind = self.num_classes
        # note in Sparse R-CNN num_gt == num_pos
        pos_inds = (labels >= 0) & (labels < bg_class_ind)
        num_pos = pos_inds.sum().float()
        avg_factor = reduce_mean(num_pos).clamp_(min=1.0)

        # Either prediction may be None; derive the prediction count from
        # whichever is present so the None checks below are meaningful.
        if cls_score is not None and mask_pred is not None:
            assert mask_pred.shape[0] == cls_score.shape[0]
            assert mask_pred.shape[1] == cls_score.shape[1]
        shape_ref = mask_pred if mask_pred is not None else cls_score
        if shape_ref is None:
            return losses
        num_preds = shape_ref.shape[0] * shape_ref.shape[1]

        if cls_score is not None and cls_score.numel() > 0:
            flat_cls_score = cls_score.view(num_preds, -1)
            losses['loss_cls'] = self.loss_cls(
                flat_cls_score,
                labels,
                label_weights,
                avg_factor=avg_factor,
                reduction_override=reduction_override)
            losses['pos_acc'] = accuracy(
                flat_cls_score[pos_inds], labels[pos_inds])

        if mask_pred is not None:
            # 0~self.num_classes-1 are FG, self.num_classes is BG;
            # mask losses are computed on positive predictions only.
            H, W = mask_pred.shape[-2:]
            if pos_inds.any():
                pos_mask_pred = mask_pred.reshape(num_preds, H, W)[pos_inds]
                pos_mask_targets = mask_targets[pos_inds]
                losses['loss_mask'] = self.loss_mask(pos_mask_pred,
                                                     pos_mask_targets)
                losses['loss_dice'] = self.loss_dice(pos_mask_pred,
                                                     pos_mask_targets)

                if self.loss_rank is not None:
                    # Build a per-pixel target holding the index of the
                    # positive proposal whose target mask covers the pixel;
                    # uncovered pixels keep the ignore label.
                    batch_size = mask_pred.size(0)
                    rank_target = mask_targets.new_full((batch_size, H, W),
                                                        self.ignore_label,
                                                        dtype=torch.long)
                    rank_inds = pos_inds.view(batch_size,
                                              -1).nonzero(as_tuple=False)
                    batch_mask_targets = mask_targets.view(
                        batch_size, -1, H, W).bool()
                    for i in range(batch_size):
                        curr_inds = (rank_inds[:, 0] == i)
                        curr_rank = rank_inds[:, 1][curr_inds]
                        for j in curr_rank:
                            rank_target[i][batch_mask_targets[i][j]] = j
                    losses['loss_rank'] = self.loss_rank(
                        mask_pred, rank_target, ignore_index=self.ignore_label)
            else:
                # No positives: emit zero losses that still depend on
                # ``mask_pred``.
                losses['loss_mask'] = mask_pred.sum() * 0
                losses['loss_dice'] = mask_pred.sum() * 0
                if self.loss_rank is not None:
                    losses['loss_rank'] = mask_pred.sum() * 0

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):
        """Build classification and mask targets for a single image.

        Args:
            pos_inds (Tensor): Indices of the positive samples.
            neg_inds (Tensor): Indices of the negative samples.
            pos_mask (Tensor): Masks of the positive samples; its first
                dimension gives the number of positives and its last two
                the spatial size of the targets.
            neg_mask (Tensor): Masks of the negative samples; only its first
                dimension (the number of negatives) is used.
            pos_gt_mask (Tensor): Ground-truth masks assigned to positives.
            pos_gt_labels (Tensor): Ground-truth labels assigned to positives.
            gt_sem_seg (Tensor | None): Per-class stuff masks of this image.
            gt_sem_cls (Tensor | None): Class indices of ``gt_sem_seg``;
                ``self.num_thing_classes`` is subtracted to obtain the stuff
                slot each mask is written to.
            cfg: Training config; only ``cfg.pos_weight`` is read.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights)``.
            When both ``gt_sem_seg`` and ``gt_sem_cls`` are given,
            ``self.num_stuff_classes`` extra rows are appended to each.
        """
        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg
        H, W = pos_mask.shape[-2:]
        # Background uses cat_id == num_classes, foreground uses
        # [0, num_classes - 1], so every label starts out as background.
        labels = pos_mask.new_full((num_samples,),
                                   self.num_classes,
                                   dtype=torch.long)
        label_weights = pos_mask.new_zeros((num_samples, self.num_classes))
        mask_targets = pos_mask.new_zeros(num_samples, H, W)
        mask_weights = pos_mask.new_zeros(num_samples, H, W)
        if num_pos > 0:
            labels[pos_inds] = pos_gt_labels
            # A non-positive configured weight falls back to 1.0.
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            pos_mask_targets = pos_gt_mask
            mask_targets[pos_inds, ...] = pos_mask_targets
            mask_weights[pos_inds, ...] = 1

        if num_neg > 0:
            label_weights[neg_inds] = 1.0

        if gt_sem_cls is not None and gt_sem_seg is not None:
            # One fixed slot per stuff class, initialised as background.
            sem_labels = pos_mask.new_full((self.num_stuff_classes,),
                                           self.num_classes,
                                           dtype=torch.long)
            sem_targets = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            sem_weights = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            # Each stuff slot is only weighted on its own class column.
            sem_stuff_weights = torch.eye(
                self.num_stuff_classes, device=pos_mask.device)
            sem_thing_weights = pos_mask.new_zeros(
                (self.num_stuff_classes, self.num_thing_classes))
            sem_label_weights = torch.cat(
                [sem_thing_weights, sem_stuff_weights], dim=-1)
            # Fixed misplaced parenthesis: was ``len(gt_sem_cls > 0)``, which
            # only happened to be truthy exactly when the tensor is non-empty.
            if len(gt_sem_cls) > 0:
                sem_inds = gt_sem_cls - self.num_thing_classes
                sem_inds = sem_inds.long()
                sem_labels[sem_inds] = gt_sem_cls.long()
                sem_targets[sem_inds] = gt_sem_seg
                sem_weights[sem_inds] = 1

            # Instance samples are not weighted on the stuff class columns.
            label_weights[:, self.num_thing_classes:] = 0
            labels = torch.cat([labels, sem_labels])
            label_weights = torch.cat([label_weights, sem_label_weights])
            mask_targets = torch.cat([mask_targets, sem_targets])
            mask_weights = torch.cat([mask_weights, sem_weights])

        return labels, label_weights, mask_targets, mask_weights

    def get_targets(self,
                    sampling_results,
                    gt_mask,
                    gt_labels,
                    rcnn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None
                    ):
        """Compute training targets for every image in the batch.

        Args:
            sampling_results (list): One sampling result per image, each
                exposing ``pos_inds``, ``neg_inds``, ``pos_masks``,
                ``neg_masks``, ``pos_gt_masks`` and ``pos_gt_labels``.
            gt_mask: Not used by this method.
            gt_labels: Not used by this method.
            rcnn_train_cfg: Passed to ``_get_target_single`` as ``cfg``.
            concat (bool): If True, concatenate the per-image targets along
                dim 0; otherwise return per-image lists.
            gt_sem_seg (list | None): Per-image stuff masks.
            gt_sem_cls (list | None): Per-image stuff class indices.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights)``.
        """
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
        if gt_sem_seg is None:
            # The placeholders must have one entry per image: multi_apply
            # iterates its list arguments in lockstep, so a shorter list
            # (previously hard-coded to length 2) would silently drop the
            # targets of all remaining images.
            num_imgs = len(sampling_results)
            gt_sem_seg = [None] * num_imgs
            gt_sem_cls = [None] * num_imgs

        labels, label_weights, mask_targets, mask_weights = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_mask_list,
            pos_gt_labels_list,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rcnn_train_cfg)
        if concat:
            labels = torch.cat(labels, 0)
            label_weights = torch.cat(label_weights, 0)
            mask_targets = torch.cat(mask_targets, 0)
            mask_weights = torch.cat(mask_weights, 0)
        return labels, label_weights, mask_targets, mask_weights

    def rescale_masks(self, masks_per_img, img_meta):
        """Turn mask logits into probabilities at the original image size.

        The logits go through a sigmoid, are bilinearly resized to
        ``img_meta['batch_input_shape']``, cropped to the first ``h`` rows and
        ``w`` columns given by ``img_meta['img_shape']``, and finally resized
        to the first two entries of ``img_meta['ori_shape']``.

        Args:
            masks_per_img (Tensor): Mask logits of one image; a leading batch
                dimension is added internally and removed again on return.
            img_meta (dict): Needs ``img_shape``, ``batch_input_shape`` and
                ``ori_shape``.

        Returns:
            Tensor: Mask probabilities at the original image resolution.
        """
        valid_h, valid_w, _ = img_meta['img_shape']

        # Step 1: probabilities at the (padded) network input resolution.
        probs = F.interpolate(
            masks_per_img[None].sigmoid(),
            size=img_meta['batch_input_shape'],
            mode='bilinear',
            align_corners=False)

        # Step 2: keep only the top-left ``img_shape`` region.
        probs = probs[:, :, :valid_h, :valid_w]

        # Step 3: back to the original image resolution.
        target_hw = img_meta['ori_shape'][:2]
        restored = F.interpolate(
            probs, size=target_hw, mode='bilinear', align_corners=False)
        return restored.squeeze(0)

    def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
                      test_cfg, img_meta):
        """Rescale, binarise and package the mask predictions of one image.

        Args:
            masks_per_img: Mask logits, passed to ``self.rescale_masks``.
            labels_per_img: Predicted labels, passed to ``self.segm2result``.
            scores_per_img: Predicted scores, passed to ``self.segm2result``.
            test_cfg: Test config; only ``test_cfg.mask_thr`` is read.
            img_meta (dict): Image meta info, passed to
                ``self.rescale_masks``.

        Returns:
            tuple: The three values produced by ``self.segm2result``.
        """
        # Bring the predictions back to the original image size ...
        mask_probs = self.rescale_masks(masks_per_img, img_meta)
        # ... and threshold them into boolean masks.
        binary_masks = mask_probs > test_cfg.mask_thr
        boxes, segms, masks = self.segm2result(
            binary_masks, labels_per_img, scores_per_img)
        return boxes, segms, masks

    def segm2result(self, mask_preds, det_labels, cls_scores):
        """Group predicted masks by class and derive score-tagged boxes.

        Args:
            mask_preds: Per-instance masks; the first dimension indexes the
                instances.
            det_labels (Tensor): Predicted class index per instance.
            cls_scores (Tensor): Predicted score per instance.

        Returns:
            tuple: ``(bboxes, segm_result, mask_preds)`` where ``bboxes`` is a
            float32 array of shape ``(num_instances, 5)`` whose first four
            columns come from ``tensor_mask2box`` (clipped at 0) and whose
            last column is the score, ``segm_result`` is a list with one list
            of masks per class, and ``mask_preds`` is returned unchanged.
        """
        segm_result = [[] for _ in range(self.num_classes)]
        labels_np = det_labels.cpu().numpy()
        scores_np = cls_scores.cpu().numpy()
        num_ins = mask_preds.shape[0]

        # Boxes are not predicted; they are derived from the masks.
        bboxes = np.zeros((num_ins, 5), dtype=np.float32)
        bboxes[:, 4] = scores_np
        bboxes[:, :4] = np.array(tensor_mask2box(mask_preds).clip(min=0))

        for idx in range(num_ins):
            class_bucket = segm_result[labels_np[idx]]
            class_bucket.append(mask_preds[idx])
        return bboxes, segm_result, mask_preds

================================================
FILE: knet/video/knet.py
================================================
import torch
import torch.nn.functional as F
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import TwoStageDetector, BaseDetector
from mmdet.models.builder import build_head
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes


@DETECTORS.register_module()
class VideoKNet(TwoStageDetector):
    """K-Net style segmentor exposed through the two-stage detector API.

    The RPN head produces kernels (proposal features) and initial mask
    predictions; the RoI head refines them.

    Args:
        num_thing_classes (int): Number of thing classes; also used as the
            label shift when converting the semantic map to stuff masks.
        num_stuff_classes (int): Number of stuff classes.
        mask_assign_stride (int): Ground-truth masks are resized to
            ``batch_input_shape // mask_assign_stride`` before being handed
            to the heads.
        ignore_label (int): Value written into the padded area of the
            semantic map and forwarded to the sem2ins conversion helpers.
        thing_label_in_seg (int): Forwarded to ``sem2ins_masks``.
        cityscapes (bool): If True, ``sem2ins_masks_cityscapes`` is used
            instead of ``sem2ins_masks``.
    """

    def __init__(self,
                 *args,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cityscapes=False,
                 **kwargs):
        super(VideoKNet, self).__init__(*args, **kwargs)
        assert self.with_rpn, 'KNet does not support external proposals'

        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        # whether to train the cityscapes panoptic segmentation
        self.cityscapes = cityscapes

    def _prepare_gt_targets(self, img_metas, gt_masks, gt_labels,
                            gt_semantic_seg):
        """Pad and downsample the ground truth to the assignment resolution.

        Args:
            img_metas (list[dict]): Image meta info; ``batch_input_shape`` is
                read from the first entry and ``img_shape`` from every entry
                when ``gt_semantic_seg`` is given.
            gt_masks (list): Per-image mask containers exposing ``width``,
                ``height`` and ``to_tensor``. They are not padded when the
                batch is formed.
            gt_labels (list[Tensor]): Only used to pick the target device.
            gt_semantic_seg (Tensor | None): Batched semantic map. It is
                modified in place: its padded area is set to
                ``self.ignore_label``.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_seg, gt_sem_cls)``. The last
            two are ``None`` when ``gt_semantic_seg`` is ``None``.
        """
        has_semantic_gt = gt_semantic_seg is not None
        gt_masks_tensor = []
        gt_sem_seg = [] if has_semantic_gt else None
        gt_sem_cls = [] if has_semantic_gt else None
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_H = pad_H // self.mask_assign_stride
        assign_W = pad_W // self.mask_assign_stride

        for i, gt_mask in enumerate(gt_masks):
            mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
            if gt_mask.width != pad_W or gt_mask.height != pad_H:
                # Zero-pad on the right/bottom up to the batch input shape.
                pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height)
                mask_tensor = F.pad(mask_tensor, pad_wh, value=0)

            if has_semantic_gt:
                # gt_semantic_seg is padded by zero when forming a batch;
                # convert the padded area from 0 to ignore.
                gt_semantic_seg[
                    i, :, img_metas[i]['img_shape'][0]:, :] = self.ignore_label
                gt_semantic_seg[
                    i, :, :, img_metas[i]['img_shape'][1]:] = self.ignore_label
                if self.cityscapes:
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes)
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                if sem_seg.shape[0] == 0:
                    # NOTE(review): the placeholder has as many rows as there
                    # are instance masks, not zero rows - confirm intended.
                    gt_sem_seg.append(
                        mask_tensor.new_zeros(
                            (mask_tensor.size(0), assign_H, assign_W)))
                else:
                    gt_sem_seg.append(
                        F.interpolate(
                            sem_seg[None], (assign_H, assign_W),
                            mode='bilinear',
                            align_corners=False)[0])
                gt_sem_cls.append(sem_labels)

            if mask_tensor.shape[0] == 0:
                gt_masks_tensor.append(
                    mask_tensor.new_zeros(
                        (mask_tensor.size(0), assign_H, assign_W)))
            else:
                # downsample to 1 / mask_assign_stride resolution
                gt_masks_tensor.append(
                    F.interpolate(
                        mask_tensor[None], (assign_H, assign_W),
                        mode='bilinear',
                        align_corners=False)[0])

        return gt_masks_tensor, gt_sem_seg, gt_sem_cls

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      proposals=None,
                      **kwargs):
        """Forward function of the network in the training stage.

        Args:
            img (Tensor): Input images of shape (N, C, H, W).
            img_metas (list[dict]): Image info dicts. ``img_shape`` and
                ``batch_input_shape`` are read here.
            gt_bboxes (list[Tensor] | None): Forwarded to the RoI head.
            gt_labels (list[Tensor]): Class indices of the ground-truth
                masks of each image.
            gt_bboxes_ignore (list[Tensor] | None): Forwarded to the RoI
                head.
            gt_masks (list): Ground-truth instance masks of each image.
                Required.
            gt_semantic_seg (Tensor | None): Batched semantic map; modified
                in place (padded area set to ``ignore_label``).
            ref_img, ref_img_metas, ref_gt_bboxes_ignore, ref_gt_labels,
            ref_gt_masks, ref_gt_semantic_seg: Reference-frame inputs of the
                video pipeline. They are accepted but not used by this
                method.
            proposals: Must be ``None``; external proposals are not
                supported.

        Returns:
            dict[str, Tensor]: The RoI head losses updated with the RPN head
            losses.
        """
        # Deliberately skip TwoStageDetector.forward_train and call the next
        # implementation in the MRO. ``batch_input_shape`` is read from
        # img_metas below - presumably it is populated by this call; confirm.
        super(TwoStageDetector, self).forward_train(img, img_metas)
        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None

        gt_masks, gt_sem_seg, gt_sem_cls = self._prepare_gt_targets(
            img_metas, gt_masks, gt_labels, gt_semantic_seg)

        x = self.extract_feat(img)
        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)
        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        losses = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

        losses.update(rpn_losses)
        return losses

    def simple_test(self, img, img_metas, rescale=False):
        """Test function without test time augmentation.

        Args:
            img (Tensor): Input images.
            img_metas (list[dict]): List of image information.
            rescale (bool): Forwarded to the RoI head. Defaults to False.

        Returns:
            The result of ``self.roi_head.simple_test``.
        """
        x = self.extract_feat(img)
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        # seg_preds is produced by the RPN head but not used here.
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results
        segm_results = self.roi_head.simple_test(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            imgs_whwh=None,
            rescale=rescale)
        return segm_results

    def forward_dummy(self, img):
        """Used for computing network flops.

        See `mmdetection/tools/get_flops.py`
        """
        # backbone
        x = self.extract_feat(img)
        # rpn
        num_imgs = len(img)
        # Placeholder meta info; only ``img_shape`` is provided.
        dummy_img_metas = [
            dict(img_shape=(800, 1333, 3)) for _ in range(num_imgs)
        ]
        rpn_results = self.rpn_head.simple_test_rpn(x, dummy_img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results
        # roi_head
        roi_outs = self.roi_head.forward_dummy(x_feats, proposal_feats,
                                               dummy_img_metas)
        return roi_outs

    def extract_feat(self, img):
        """Directly extract features from the backbone+neck."""
        x = self.backbone(img)
        if self.with_neck:
            x = self.neck(x)
        return x

================================================
FILE: knet/video/knet_quansi_dense.py
================================================
import warnings
import numpy as np
import torch
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import BaseDetector
from mmdet.models.builder import build_head, build_neck, build_backbone
from mmdet.core import build_assigner, build_sampler
from knet.video.qdtrack.builder import build_tracker
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes
from unitrack.mask import tensor_mask2box

@DETECTORS.register_module()
class VideoKNetQuansiTrack(BaseDetector):
    """
        Simple Extension of KNet to Video KNet by the implementation of VPSFuse Net.
    """
    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 track_head=None,
                 extra_neck=None,
                 track_localization_fpn=None,
                 tracker=None,
                 train_cfg=None,
                 test_cfg=None,
                 track_train_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cityscapes=False,
                 kitti_step=False,
                 fix_knet=False,
                 freeze_detector=False,
                 semantic_filter=False,
                 **kwargs):
        """Build the sub-modules from their configs and store the settings.

        Each optional component (``neck``, ``extra_neck``, ``rpn_head``,
        ``roi_head``, ``track_head``, ``track_localization_fpn``) is only
        built when its config is given. ``tracker`` is only stored; the
        tracker itself is created by ``init_tracker``. ``freeze_detector``
        freezes the RPN and RoI heads, ``fix_knet`` additionally freezes the
        backbone and the neck. Extra keyword arguments are ignored.
        """
        super(VideoKNetQuansiTrack, self).__init__(init_cfg)

        # ----- feature extractor -----
        if pretrained:
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            backbone.pretrained = pretrained
        self.backbone = build_backbone(backbone)

        if neck is not None:
            self.neck = build_neck(neck)
        if extra_neck is not None:
            self.extra_neck = build_neck(extra_neck)

        # ----- segmentation heads -----
        if rpn_head is not None:
            # Work on a copy so the caller's rpn_head config stays untouched.
            rpn_head_cfg = rpn_head.copy()
            rpn_head_cfg.update(
                train_cfg=None if train_cfg is None else train_cfg.rpn,
                test_cfg=test_cfg.rpn)
            self.rpn_head = build_head(rpn_head_cfg)

        if roi_head is not None:
            # update train and test cfg here for now
            # TODO: refactor assigner & sampler
            roi_head.update(
                train_cfg=None if train_cfg is None else train_cfg.rcnn,
                test_cfg=test_cfg.rcnn)
            roi_head.pretrained = pretrained
            self.roi_head = build_head(roi_head)

        # ----- tracking components -----
        if track_head is not None:
            self.track_train_cfg = track_train_cfg
            self.track_head = build_head(track_head)
            self.init_track_assigner_sampler()

        if track_localization_fpn is not None:
            self.track_localization_fpn = build_neck(track_localization_fpn)

        if tracker is not None:
            self.tracker_cfg = tracker

        # ----- optional freezing -----
        if freeze_detector:
            self._freeze_detector()

        if fix_knet:
            frozen_parts = (self.backbone, self.neck, self.rpn_head,
                            self.roi_head)
            for part in frozen_parts:
                for weight in part.parameters():
                    weight.requires_grad_(False)
                part.eval()

        # ----- plain settings -----
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.num_proposals = self.rpn_head.num_proposals
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        # whether to train the cityscapes panoptic segmentation
        self.cityscapes = cityscapes
        self.kitti_step = kitti_step
        self.semantic_filter = semantic_filter

    def init_tracker(self):
        """Create ``self.tracker`` from the stored ``self.tracker_cfg``."""
        tracker_cfg = self.tracker_cfg
        self.tracker = build_tracker(tracker_cfg)

    def _freeze_detector(self):
        """Put the RPN and RoI heads in eval mode and freeze their weights.

        The two heads are also stored as a list in ``self.detector``.
        """
        self.detector = [self.rpn_head, self.roi_head]
        for head in self.detector:
            head.eval()
            for weight in head.parameters():
                weight.requires_grad_(False)

    def init_track_assigner_sampler(self):
        """Build the assigner and sampler from ``self.track_train_cfg``."""
        track_cfg = self.track_train_cfg

        self.track_roi_assigner = build_assigner(track_cfg.assigner)
        self.track_share_assigner = False

        self.track_roi_sampler = build_sampler(track_cfg.sampler, context=self)
        self.track_share_sampler = False

    def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad and downsample the ground truth to the assignment resolution.

        Args:
            img_metas (list[dict]): Image meta info; ``batch_input_shape`` is
                read from the first entry and ``img_shape`` from every entry
                when ``gt_semantic_seg`` is given.
            gt_masks (list): Per-image mask containers exposing ``width``,
                ``height`` and ``to_tensor``. They are not padded when the
                batch is formed.
            gt_labels (list[Tensor]): Only used to pick the target device.
            gt_semantic_seg (Tensor | None): Batched semantic map. It is
                modified in place: its padded area is set to
                ``self.ignore_label``.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The last
            two are ``None`` when ``gt_semantic_seg`` is ``None`` and at
            least one image was processed.
        """
        thing_targets = []
        stuff_targets = []
        stuff_classes = []
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        stride = self.mask_assign_stride
        assign_size = (pad_H // stride, pad_W // stride)

        def _shrink(stack):
            # Bilinear resize of a (K, H, W) stack to the assignment size.
            return F.interpolate(
                stack[None], assign_size, mode='bilinear',
                align_corners=False)[0]

        for i, gt_mask in enumerate(gt_masks):
            mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
            extra_w = pad_W - gt_mask.width
            extra_h = pad_H - gt_mask.height
            if extra_w != 0 or extra_h != 0:
                # Zero-pad on the right/bottom up to the batch input shape.
                mask_tensor = F.pad(
                    mask_tensor, (0, extra_w, 0, extra_h), value=0)

            if gt_semantic_seg is None:
                stuff_targets = None
                stuff_classes = None
            else:
                # gt_semantic_seg is padded by zero when forming a batch;
                # convert the padded area from 0 to ignore.
                img_h, img_w = img_metas[i]['img_shape'][:2]
                gt_semantic_seg[i, :, img_h:, :] = self.ignore_label
                gt_semantic_seg[i, :, :, img_w:] = self.ignore_label
                if self.cityscapes:
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes)
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                if sem_seg.shape[0] == 0:
                    stuff_targets.append(
                        mask_tensor.new_zeros(
                            (mask_tensor.size(0),) + assign_size))
                else:
                    stuff_targets.append(_shrink(sem_seg))
                stuff_classes.append(sem_labels)

            if mask_tensor.shape[0] == 0:
                thing_targets.append(
                    mask_tensor.new_zeros(
                        (mask_tensor.size(0),) + assign_size))
            else:
                thing_targets.append(_shrink(mask_tensor))

        return thing_targets, stuff_classes, stuff_targets

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      ref_gt_instance_ids=None,
                      proposals=None,
                      **kwargs):
        """Forward function of SparseR-CNN-like network in train stage.

        Args:
            img (Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): list of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                :class:`mmdet.datasets.pipelines.Collect`.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): class indices corresponding to each box
            gt_bboxes_ignore (None | list[Tensor): specify which bounding
                boxes can be ignored when computing the loss.
            gt_masks (List[Tensor], optional) : Segmentation masks for
                each box. But we don't support it in this architecture.
            proposals (List[Tensor], optional): override rpn proposals with
                custom proposals. Use when `with_rpn` is False.

            # This is for video only:
            ref_img (Tensor): of shape (N, 2, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.
                2 denotes there is two reference images for each input image.

            ref_img_metas (list[list[dict]]): The first list only has one
                element. The second list contains reference image information
                dict where each dict has: 'img_shape', 'scale_factor', 'flip',
                and may also contain 'filename', 'ori_shape', 'pad_shape', and
                'img_norm_cfg'. For details on the values of these keys see
                `mmtrack/datasets/pipelines/formatting.py:VideoCollect`.

            ref_gt_bboxes (list[Tensor]): The list only has one Tensor. The
                Tensor contains ground truth bboxes for each reference image
                with shape (num_all_ref_gts, 5) in
                [ref_img_id, tl_x, tl_y, br_x, br_y] format. The ref_img_id
                start from 0, and denotes the id of reference image for each
                key image.

            ref_gt_labels (list[Tensor]): The list only has one Tensor. The
                Tensor contains class indices corresponding to each reference
                box with shape (num_all_ref_gts, 2) in
                [ref_img_id, class_indice].

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None
        assert gt_instance_ids is not None

        # preprocess the reference images
        ref_img = ref_img.squeeze(1)  # (b,3,h,w)
        ref_masks_gt = []
        for ref_gt_mask in ref_gt_masks:
            ref_masks_gt.append(ref_gt_mask[0])

        ref_labels_gt = []
        for ref_gt_label in ref_gt_labels:
            ref_labels_gt.append(ref_gt_label[:, 1].long())
        ref_gt_labels = ref_labels_gt

        ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)

        ref_gt_instance_id_list = []
        for ref_gt_instance_id in ref_gt_instance_ids:
            ref_gt_instance_id_list.append(ref_gt_instance_id[:,1].long())

        ref_img_metas_new = []
        for ref_img_meta in ref_img_metas:
            ref_img_meta[0]['batch_input_shape'] = batch_input_shape
            ref_img_metas_new.append(ref_img_meta[0])

        # prepare the gt_match_indices
        gt_pids_list =[]
        for i in range(len(ref_gt_instance_id_list)):
            ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist()
            gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist()
            gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids]
            gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0])

        gt_match_indices = gt_pids_list

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg)

        ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new,
                                                                    ref_masks_gt, ref_gt_labels, ref_semantic_seg_gt)

        x = self.extract_feat(img)
        self.backbone.eval()
        with torch.no_grad():
            x_ref = self.extract_feat(ref_img)
        self.backbone.train()

        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)

        # simple forward to get the reference results
        ref_rpn_results = self.rpn_head.simple_test_rpn(x_ref, ref_img_metas_new)

        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        (ref_proposal_feats, ref_x_feats, ref_mask_preds,
         ref_cls_scores, ref_seg_preds) = ref_rpn_results

        # forward to get the current results
        losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

        # simple forward to get the reference results
        _,  ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = self.roi_head.simple_test_mask_preds(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
           )

        # ===== Tracking Part -==== #
        # assign both key frame and reference frame tracking targets
        key_sampling_results, ref_sampling_results = [], []
        num_imgs = len(img_metas)

        x_track_fea = x_feats
        x_track_fea_ref = ref_x_feats

        for i in range(num_imgs):
            assign_result = self.track_roi_assigner.assign(
                scaled_mask_preds[i][:self.num_proposals].detach(), cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                gt_masks[i], gt_labels[i], img_meta=img_metas[i])
            sampling_result = self.track_roi_sampler.sample(
                assign_result,
                mask_preds[i][:self.num_proposals].detach(),
                gt_masks[i])
            key_sampling_results.append(sampling_result)

            ref_assign_result = self.track_roi_assigner.assign(
                ref_scaled_mask_preds[i][:self.num_proposals].detach(), ref_cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                ref_gt_masks[i], ref_gt_labels[i], img_meta=ref_img_metas_new[i])
            ref_sampling_result = self.track_roi_sampler.sample(
                ref_assign_result,
                ref_mask_preds[i][:self.num_proposals].detach(),
                ref_gt_masks[i])
            ref_sampling_results.append(ref_sampling_result)

        # mask feature embeddings
        key_masks = [res.pos_gt_masks for res in key_sampling_results]
        key_feats = self._track_forward(x_track_fea, key_masks)
        ref_masks = [res.pos_gt_masks for res in ref_sampling_results]
        ref_feats = self._track_forward(x_track_fea_ref, ref_masks)

        match_feats = self.track_head.match(key_feats, ref_feats,
                                            key_sampling_results,
                                            ref_sampling_results)

        asso_targets = self.track_head.get_track_targets(
            gt_match_indices, key_sampling_results, ref_sampling_results)
        loss_track = self.track_head.loss(*match_feats, *asso_targets)

        losses.update(loss_track)
        losses.update(rpn_losses)

        return losses

    def simple_test(self, img, img_metas, rescale=False, ref_img=None, **kwargs):
        """Run single-frame panoptic inference and link thing segments over time.

        Args:
            img: Input image(s) forwarded to ``extract_feat``.
            img_metas (list[dict]): Image information. Only ``img_metas[0]``
                is read here; it must contain ``'filename'`` (including the
                substring ``'city'``) and ``'iid'``.
            rescale (bool): Not used in this method. Defaults to False.
            ref_img: Not used in this method. Defaults to None.
            **kwargs: ``kwargs['img_id']`` is only read in the non-city
                branch below.

        Returns:
            tuple: ``(semantic_seg, track_maps, None, None, None)`` where
                ``semantic_seg`` is produced by ``get_semantic_seg`` and
                ``track_maps`` by ``generate_track_id_maps``.
        """

        # if ref_img is not None:
        #     ref_img = ref_img[0]

        # Work out whether this is the first frame of a clip.
        # NOTE(review): this assert already requires 'city' in the filename,
        # so the `else` branch a few lines below is unreachable whenever
        # assertions are enabled.
        assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0]
        iid = img_metas[0]['iid']
        fid = iid % 10000
        # NOTE(review): img_name is computed but never used in this method.
        img_name = img_metas[0]['filename'].split("/")[-1].split(".")[0]

        if "city" in img_metas[0]['filename']:
            # frame id is the low four decimal digits of iid; a clip starts at 1
            iid = img_metas[0]['iid']
            fid = iid % 10000
            is_first = (fid == 1)
        else:
            # here a clip starts at frame id 0
            iid = kwargs['img_id'][0].item()
            fid = iid % 10000
            is_first = (fid == 0)

        # a fresh tracker is built at the start of every clip
        if is_first:
            self.init_tracker()

        # for current frame
        x = self.extract_feat(img)
        # x_track_fea = self.track_localization_fpn(x)
        # current frame inference
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results
        # the tracking embeddings are pooled from the same feature map
        x_track_fea = x_feats
        cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas)
        # for tracking part: only the first image's results are used
        sorted_bbox_result, segm_result, mask_preds, panoptic_result = cur_segm_results[0]
        panoptic_seg, segments_info = panoptic_result

        # get the semantic filter: 1 where the argmax channel of the resized
        # semantic prediction is below num_thing_classes, 0 elsewhere
        if self.semantic_filter:
            seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear', align_corners=False)
            seg_preds = seg_preds.sigmoid()
            seg_out = seg_preds.argmax(1)
            semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32)
        else:
            # neutral multiplier: leaves the thing masks unchanged
            semantic_thing = 1.

        # get sorted tracking thing ids, labels, masks, score for tracking
        # NOTE(review): things_index_for_tracking is not used afterwards.
        things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \
            self.get_things_id_for_tracking(panoptic_seg, segments_info)
        things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long()
        if len(things_labels_for_tracking) > 0:
            # stack the per-thing masks into a single float tensor
            thing_masks_for_tracking_final = []
            for mask in thing_masks_for_tracking:
                thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to(
                    x_feats.device).float())
            thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0)
            thing_masks_for_tracking = thing_masks_for_tracking_final
            # resize the masks to the spatial size of the tracking features
            thing_masks_for_tracking_scaled = F.interpolate(thing_masks_for_tracking.unsqueeze(0),
                                                     size=x_track_fea.size()[2:], mode="bilinear", align_corners=False)
            # one row per thing: columns 0-3 hold the box (filled in below),
            # column 4 holds the segment score
            things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5),
                                                   dtype=torch.float, device=x_feats.device)
            things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking,
                                                          device=things_bbox_for_tracking.device)
            thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_scaled * semantic_thing

        if len(things_labels_for_tracking) == 0:
            # nothing to track in this frame
            track_feats = None
        else:
            # tracking embedding features
            track_feats = self._track_forward(x_track_fea, thing_masks_for_tracking_with_semantic_filter)

        if track_feats is not None:
            # assert len(things_id_for_tracking) == len(things_labels_for_tracking)
            # presumably tensor_mask2box returns one box per mask - TODO confirm
            things_bbox_for_tracking[:, :4] = torch.tensor(
                tensor_mask2box(thing_masks_for_tracking_with_semantic_filter),
                device=things_bbox_for_tracking.device)

            bboxes, labels, ids = self.tracker.match(
                bboxes=things_bbox_for_tracking,
                labels=things_labels_for_tracking,
                track_feats=track_feats,
                frame_id=fid)
            # shift the ids by one; id 0 is the value left for untracked pixels
            ids = ids + 1
            # hack for unmatched into background
            # NOTE(review): after the +1 shift an original id of -1 is already
            # 0, so this only affects original ids of -2 - confirm intent.
            ids[ids == -1] = 0
        else:
            ids = []

        track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg)

        return self.get_semantic_seg(panoptic_seg, segments_info), track_maps, None, None, None

    def _track_forward(self, x, mask_pred):
        """Track head forward function used in both training and testing.
        We use mask pooling to get the fine grain features"""
        track_feats_list = []

        for i, masks in enumerate(mask_pred):
            masks = masks.sigmoid() > 0.5
            masks = masks.float().detach()
            size = x.size()[2:]
            masks = F.interpolate(masks.unsqueeze(0), size=size, mode="bilinear", align_corners=True).squeeze(0)
            track_feats = torch.einsum('nhw,chw->nc', masks, x[i])
            track_feats = track_feats / (masks.sum(-1).sum(-1) + 1).unsqueeze(-1)
            track_feats_list.append(track_feats)
        track_feats = torch.cat(track_feats_list, 0)
        track_feats = self.track_head(track_feats)
        return track_feats

    def forward_dummy(self, img):
        """Loss-free forward pass, used for computing network flops.

        See `mmdetection/tools/get_flops.py`
        """
        feats = self.extract_feat(img)

        # one placeholder meta dict per input image
        fake_metas = []
        for _ in range(len(img)):
            fake_metas.append(dict(img_shape=(800, 1333, 3)))

        # only the proposal and feature outputs of the rpn stage are needed
        proposal_feats, x_feats, _, _, _ = self.rpn_head.simple_test_rpn(
            feats, fake_metas)
        return self.roi_head.forward_dummy(x_feats, proposal_feats, fake_metas)

    def extract_feat(self, img):
        """Run the backbone and, when the detector has one, the neck."""
        feats = self.backbone(img)
        return self.neck(feats) if self.with_neck else feats

    @property
    def with_rpn(self):
        """bool: True when an ``rpn_head`` attribute exists and is not None."""
        return getattr(self, 'rpn_head', None) is not None

    @property
    def with_roi_head(self):
        """bool: True when a ``roi_head`` attribute exists and is not None."""
        return getattr(self, 'roi_head', None) is not None

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Test with augmentations (not implemented; always returns None).

        If rescale is False, then returned bboxes and masks will fit the scale
        of imgs[0].
        """
        return None

    def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
        """Collect instance ids, labels, masks and scores of all thing segments.

        A segment counts as a thing when its ``'isthing'`` entry equals True.
        The mask of a segment is ``panoptic_seg == segment['id']``. The four
        returned lists are aligned and follow the order of ``seg_infos``.
        """
        thing_segments = [seg for seg in seg_infos if seg['isthing'] == True]
        masks = [panoptic_seg == seg["id"] for seg in thing_segments]
        idxs = [seg["instance_id"] for seg in thing_segments]
        labels = [seg['category_id'] for seg in thing_segments]
        score = [seg['score'] for seg in thing_segments]
        return idxs, labels, masks, score

    def pack_things_object(self, object_feats, ref_object_feats):
        """Return the thing part of the key and reference object features.

        The last two dims of both inputs are squeezed, then dim 1 is split
        into ``[roi_head.num_proposals, num_stuff_classes]`` and the first
        chunk is kept.
        """
        key_feats = object_feats.squeeze(-1).squeeze(-1)
        ref_feats = ref_object_feats.squeeze(-1).squeeze(-1)
        key_things, _ = torch.split(
            key_feats, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)
        ref_things, _ = torch.split(
            ref_feats, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)
        return key_things, ref_things

    def pack_things_masks(self, mask_pred, ref_mask_pred):
        """Return the thing part of the key and reference mask predictions.

        Dim 1 of both inputs is split into
        ``[roi_head.num_proposals, num_stuff_classes]`` and the first chunk
        is kept.
        """
        key_things, _ = torch.split(
            mask_pred, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)
        ref_things, _ = torch.split(
            ref_mask_pred, [self.roi_head.num_proposals, self.num_stuff_classes], dim=1)
        return key_things, ref_things

    def get_semantic_seg(self, panoptic_seg, segments_info):
        """Convert a panoptic result into a per-pixel semantic id map.

        Returns an array of zeros shaped like ``panoptic_seg`` in which the
        pixels of every segment are overwritten with its semantic id:

        * thing, ``kitti_step`` set: lookup of ``category_id`` in ``[11, 13]``
        * thing, otherwise: ``category_id + 11``
        * stuff, ``kitti_step`` set: ``category_id - 1`` shifted past the
          ids 11 and 13
        * stuff, otherwise: ``category_id - 1``
        """
        # thing category index -> semantic id used when kitti_step is set
        kitti_thing_ids = [11, 13]
        sem_map = np.zeros(panoptic_seg.shape)
        for seg in segments_info:
            is_thing = seg['isthing'] == True
            category = seg["category_id"]
            if is_thing and self.kitti_step:
                sem_id = kitti_thing_ids[category]
            elif is_thing:
                sem_id = category + 11
            elif self.kitti_step:
                # stuff ids are contiguous; skip over the slots the thing
                # ids occupy
                sem_id = category - 1
                shift = 0
                for thing_id in kitti_thing_ids:
                    if sem_id + shift >= thing_id:
                        shift += 1
                sem_id = sem_id + shift
            else:
                # for stuff (0- n-1)
                sem_id = category - 1
            sem_map[panoptic_seg == seg["id"]] = sem_id
        return sem_map

    def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
        """Paint every tracked mask with its track id.

        Returns an array of zeros shaped like ``panopitc_seg_maps`` in which
        the pixels of ``masks[i]`` are set to ``ids[i]``. When ``ids`` is
        empty the all-zero map is returned and ``masks`` is not touched.
        """
        track_map = np.zeros(panopitc_seg_maps.shape)
        if len(ids) == 0:
            return track_map
        # assert len(things_mask_results) == len(track_results)
        bool_masks = masks.bool()
        for pos, track_id in enumerate(ids):
            region = bool_masks[pos].cpu().numpy()
            track_map[region] = track_id
        return track_map


import cv2
import numpy as np
import os.path as osp


def log_masks_for_inference(masks_preds, names, output_dirs="work_dirs/vps/vps_output/thing_masks"):
    """Write every predicted mask to ``<output_dirs>/<names>_<index>.png``.

    Each mask is binarised with ``sigmoid() > 0.5``; foreground pixels are
    stored as 255 and the rest as 0.
    """
    for idx, mask_logits in enumerate(masks_preds):
        canvas = np.zeros(masks_preds[0].shape).astype(np.int16)
        foreground = (mask_logits.sigmoid() > 0.5).cpu().numpy()
        canvas[foreground == 1] = 255
        target = osp.join(output_dirs, names + "_" + str(idx) + ".png")
        cv2.imwrite(target, canvas)

================================================
FILE: knet/video/knet_quansi_dense_embed_fc.py
================================================
import warnings
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from mmcv.cnn import (ConvModule, bias_init_with_prob,
                      build_activation_layer, build_norm_layer)
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import BaseDetector
from mmdet.models.builder import build_head, build_neck, build_backbone, build_roi_extractor
from mmdet.core import build_assigner, build_sampler
from knet.video.qdtrack.builder import build_tracker
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes, sem2ins_masks_kitti_step
from unitrack.mask import tensor_mask2box


@DETECTORS.register_module()
class VideoKNetQuansiEmbedFC(BaseDetector):
    """
        Simple Extension of KNet to Video KNet by the implementation of VPSFuse Net.
    """

    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 track_head=None,
                 extra_neck=None,
                 track_mhsa=False,
                 tracker=None,
                 train_cfg=None,
                 test_cfg=None,
                 track_train_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 detach_mask_emd=False,
                 cityscapes=False,
                 kitti_step=False,
                 cityscapes_short=False,
                 freeze_detector=False,
                 semantic_filter=True,
                 # linking parameters
                 link_previous=False,
                 bbox_roi_extractor=None,
                 **kwargs):
        """Build the detector, the tracking head and the embedding layers.

        Args:
            backbone: Config passed to ``build_backbone``.
            neck: Optional config passed to ``build_neck``.
            rpn_head: Optional config passed to ``build_head``. A copy is
                updated with the rpn train/test cfg. Note that
                ``self.rpn_head.num_proposals`` is read unconditionally
                further down, so a head is required in practice.
            roi_head: Optional config passed to ``build_head``. It is
                updated in place with the rcnn train/test cfg.
            track_head: Optional config passed to ``build_head``; when given,
                the track assigner and sampler are built as well.
            extra_neck: Optional config passed to ``build_neck``.
            track_mhsa: Stored as ``self.track_mhsa``.
            tracker: Optional tracker config, stored as ``self.tracker_cfg``
                and used later by ``init_tracker``.
            train_cfg: Training config; ``.rpn`` / ``.rcnn`` are read when it
                is not None.
            test_cfg: Testing config; ``.rpn`` / ``.rcnn`` are read whenever
                the matching head is given, so it must not be None then.
            track_train_cfg: Config holding the track ``assigner`` and
                ``sampler`` settings.
            pretrained: Deprecated; triggers a warning and is forwarded to
                the backbone and roi head configs.
            init_cfg: Forwarded to the base class constructor.
            num_thing_classes (int): Stored as ``self.num_thing_classes``.
            num_stuff_classes (int): Stored as ``self.num_stuff_classes``.
            mask_assign_stride (int): Stored as ``self.mask_assign_stride``.
            ignore_label (int): Stored as ``self.ignore_label``.
            thing_label_in_seg: Stored as ``self.thing_label_in_seg``.
            detach_mask_emd (bool): Stored as ``self.detach_mask_emd``.
            cityscapes (bool): Stored as ``self.cityscapes``.
            kitti_step (bool): Stored as ``self.kitti_step``.
            cityscapes_short (bool): Stored as ``self.cityscapes_short``.
            freeze_detector (bool): When True, ``_freeze_detector`` is called.
            semantic_filter (bool): Stored as ``self.semantic_filter``.
            link_previous (bool): Stored as ``self.link_previous``.
            bbox_roi_extractor: Optional config passed to
                ``build_roi_extractor``; only used when ``track_head`` is
                given.
            **kwargs: Accepted but not used in this constructor.
        """
        super(VideoKNetQuansiEmbedFC, self).__init__(init_cfg)

        if pretrained:
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            backbone.pretrained = pretrained
        self.backbone = build_backbone(backbone)

        if neck is not None:
            self.neck = build_neck(neck)

        if extra_neck is not None:
            self.extra_neck = build_neck(extra_neck)

        if rpn_head is not None:
            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
            # work on a copy so the caller's rpn_head config is left untouched
            rpn_head_ = rpn_head.copy()
            # NOTE(review): test_cfg.rpn is read even when test_cfg is None
            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
            self.rpn_head = build_head(rpn_head_)

        if roi_head is not None:
            # update train and test cfg here for now
            # TODO: refactor assigner & sampler
            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
            # NOTE(review): unlike rpn_head above, roi_head is modified in place
            roi_head.update(train_cfg=rcnn_train_cfg)
            roi_head.update(test_cfg=test_cfg.rcnn)
            roi_head.pretrained = pretrained
            self.roi_head = build_head(roi_head)

        if track_head is not None:
            # the assigner/sampler builder reads self.track_train_cfg, so it
            # has to be stored first
            self.track_train_cfg = track_train_cfg
            self.track_head = build_head(track_head)
            self.init_track_assigner_sampler()
            if bbox_roi_extractor is not None:
                self.track_roi_extractor = build_roi_extractor(
                    bbox_roi_extractor)

        if tracker is not None:
            # only the config is kept; the tracker itself is built by
            # init_tracker()
            self.tracker_cfg = tracker

        if freeze_detector:
            self._freeze_detector()

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.num_proposals = self.rpn_head.num_proposals
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation
        self.kitti_step = kitti_step  # whether to train the kitti step panoptic segmentation
        self.cityscapes_short = cityscapes_short  # whether to test with short clips (300)

        self.semantic_filter = semantic_filter
        self.link_previous = link_previous
        self.detach_mask_emd = detach_mask_emd
        self.track_mhsa = track_mhsa
        # add embedding fcs for the final stage queries:
        # num_emb_fcs x (Linear without bias -> LayerNorm -> ReLU)
        num_emb_fcs = 1
        act_cfg = dict(type='ReLU', inplace=True)
        in_channels = 256
        out_channels = 256
        self.embed_fcs = nn.ModuleList()
        for _ in range(num_emb_fcs):
            self.embed_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            # build_norm_layer returns (name, layer); only the layer is kept
            self.embed_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.embed_fcs.append(build_activation_layer(act_cfg))

        # final projection applied after the embedding fcs
        self.fc_embed = nn.Linear(in_channels, out_channels)

    def init_tracker(self):
        """Build a fresh tracker from the stored config into ``self.tracker``."""
        tracker_cfg = self.tracker_cfg
        self.tracker = build_tracker(tracker_cfg)

    def _freeze_detector(self):

        self.detector = [
            self.rpn_head, self.roi_head
        ]
        for model in self.detector:
            model.eval()
            for param in model.parameters():
                param.requires_grad = False

    def init_track_assigner_sampler(self):
        """Build the track assigner and sampler from ``self.track_train_cfg``.

        Both ``track_share_*`` flags are set to False.
        """
        track_cfg = self.track_train_cfg

        self.track_roi_assigner = build_assigner(track_cfg.assigner)
        self.track_share_assigner = False

        # the sampler receives this detector as its context
        self.track_roi_sampler = build_sampler(track_cfg.sampler, context=self)
        self.track_share_sampler = False

    def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad and downsample the ground-truth masks for mask assignment.

        Every instance mask is zero-padded to the batch input shape and
        resized to that shape divided by ``self.mask_assign_stride``. When
        ``gt_semantic_seg`` is given, its padded area is overwritten in place
        with ``self.ignore_label`` and it is converted into per-class masks
        by the dataset-specific ``sem2ins_masks*`` helper.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The last
                two are None when ``gt_semantic_seg`` is None and at least
                one image was processed.
        """
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_H = pad_H // self.mask_assign_stride
        assign_W = pad_W // self.mask_assign_stride
        assign_size = (assign_H, assign_W)

        def _shrink(masks):
            # downsample to 1/stride resolution
            return F.interpolate(
                masks[None], assign_size, mode='bilinear',
                align_corners=False)[0]

        gt_masks_tensor, gt_sem_seg, gt_sem_cls = [], [], []
        for i, gt_mask in enumerate(gt_masks):
            # gt_masks are not padded when forming the batch
            mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
            if not (gt_mask.width == pad_W and gt_mask.height == pad_H):
                padding = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height)
                mask_tensor = F.pad(mask_tensor, padding, value=0)

            if gt_semantic_seg is None:
                gt_sem_seg = None
                gt_sem_cls = None
            else:
                # gt_semantic seg is padded by zero when forming a batch
                # need to convert them from 0 to ignore
                gt_semantic_seg[
                    i, :, img_metas[i]['img_shape'][0]:, :] = self.ignore_label
                gt_semantic_seg[
                    i, :, :, img_metas[i]['img_shape'][1]:] = self.ignore_label

                if self.cityscapes:
                    thing_ids = list(range(
                        self.num_stuff_classes,
                        self.num_thing_classes + self.num_stuff_classes))
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=thing_ids)
                elif self.kitti_step:
                    sem_labels, sem_seg = sem2ins_masks_kitti_step(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=2,
                        thing_label_in_seg=(11, 13))
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                if sem_seg.shape[0] == 0:
                    sem_small = mask_tensor.new_zeros(
                        (mask_tensor.size(0), assign_H, assign_W))
                else:
                    sem_small = _shrink(sem_seg)
                gt_sem_seg.append(sem_small)
                gt_sem_cls.append(sem_labels)

            if mask_tensor.shape[0] == 0:
                mask_small = mask_tensor.new_zeros(
                    (mask_tensor.size(0), assign_H, assign_W))
            else:
                mask_small = _shrink(mask_tensor)
            gt_masks_tensor.append(mask_small)

        return gt_masks_tensor, gt_sem_cls, gt_sem_seg

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      ref_gt_instance_ids=None,
                      proposals=None,
                      **kwargs):
        """Forward function of SparseR-CNN-like network in train stage.

        The key frame is run through the rpn and roi heads in training mode,
        the reference frame through their test-time paths. The final-stage
        object features of both frames are turned into tracking embeddings
        and supervised with the track head loss.

        Args:
            img (Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): list of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                :class:`mmdet.datasets.pipelines.Collect`.
                'batch_input_shape' is written into every dict here.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
                Only forwarded to the roi head.
            gt_labels (list[Tensor]): class indices corresponding to each box
            gt_bboxes_ignore (None | list[Tensor): specify which bounding
                boxes can be ignored when computing the loss.
            gt_masks: Per-image instance masks of the key frame. Required.
            gt_semantic_seg: Semantic segmentation of the key frame; passed
                to ``preprocess_gt_masks``.
            gt_instance_ids (list[Tensor]): Per-image instance ids of the key
                frame. Required; matched against the reference ids to build
                the association targets.
            proposals (List[Tensor], optional): must be None; external
                proposals are not supported.

            # This is for video only:
            ref_img (Tensor): reference images; dim 1 is squeezed away, so
                one reference image per key image is expected.

            ref_img_metas (list[list[dict]]): reference image information;
                the first element of every inner list is used.

            ref_gt_bboxes, ref_gt_bboxes_ignore: not used in this method.

            ref_gt_labels (list[Tensor]): per-image tensors whose column 1
                holds the class index ([ref_img_id, class_indice]).

            ref_gt_masks: per-image containers; element 0 of each is used.

            ref_gt_semantic_seg (Tensor): reference semantic segmentation;
                dim 1 is squeezed away.

            ref_gt_instance_ids (list[Tensor]): per-image tensors whose
                column 1 holds the instance id.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None
        assert gt_instance_ids is not None

        # preprocess the reference images
        ref_img = ref_img.squeeze(1)  # (b,3,h,w)
        ref_masks_gt = []
        for ref_gt_mask in ref_gt_masks:
            ref_masks_gt.append(ref_gt_mask[0])

        ref_labels_gt = []
        for ref_gt_label in ref_gt_labels:
            ref_labels_gt.append(ref_gt_label[:, 1].long())
        ref_gt_labels = ref_labels_gt

        ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)

        ref_gt_instance_id_list = []
        for ref_gt_instance_id in ref_gt_instance_ids:
            ref_gt_instance_id_list.append(ref_gt_instance_id[:, 1].long())

        ref_img_metas_new = []
        for ref_img_meta in ref_img_metas:
            ref_img_meta[0]['batch_input_shape'] = batch_input_shape
            ref_img_metas_new.append(ref_img_meta[0])

        # prepare the gt_match_indices: for every key-frame instance, the
        # position of the same instance id in the reference frame, or -1
        gt_pids_list = []
        for i in range(len(ref_gt_instance_id_list)):
            ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist()
            gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist()
            gt_pids = [ref_ids.index(gt_id) if gt_id in ref_ids else -1
                       for gt_id in gt_ids]
            gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0])

        gt_match_indices = gt_pids_list

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg)
        ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new,
                                                                                ref_masks_gt, ref_gt_labels,
                                                                                ref_semantic_seg_gt)

        x = self.extract_feat(img)
        x_ref = self.extract_feat(ref_img)

        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)

        # simple forward to get the reference results; the rpn head is
        # switched to eval mode only for this call
        self.rpn_head.eval()
        ref_rpn_results = self.rpn_head.simple_test_rpn(x_ref, ref_img_metas_new)
        self.rpn_head.train()

        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        (ref_proposal_feats, ref_x_feats, ref_mask_preds,
         ref_cls_scores, ref_seg_preds) = ref_rpn_results

        ref_obj_feats, ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = self.roi_head.simple_test_mask_preds(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
        )

        if self.link_previous:
            # the key frame additionally sees the reference-frame results
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds, object_feats_track = self.roi_head.forward_train_with_previous(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None,
                previous_obj_feats=ref_obj_feats,
                previous_mask_preds=ref_scaled_mask_preds,
                previous_x_feats=ref_x_feats,
            )
        else:
            # forward to get the current results
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None)

        # ===== Tracking Part -==== #
        # assign both key frame and reference frame tracking targets
        key_sampling_results, ref_sampling_results = [], []
        num_imgs = len(img_metas)

        for i in range(num_imgs):
            # only the first num_proposals predictions and the thing classes
            # take part in the tracking assignment
            assign_result = self.track_roi_assigner.assign(
                scaled_mask_preds[i][:self.num_proposals].detach(),
                cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                gt_masks[i], gt_labels[i], img_meta=img_metas[i])
            sampling_result = self.track_roi_sampler.sample(
                assign_result,
                mask_preds[i][:self.num_proposals].detach(),
                gt_masks[i])
            key_sampling_results.append(sampling_result)

            ref_assign_result = self.track_roi_assigner.assign(
                ref_scaled_mask_preds[i][:self.num_proposals].detach(),
                ref_cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                ref_gt_masks[i], ref_gt_labels[i], img_meta=ref_img_metas_new[i])
            ref_sampling_result = self.track_roi_sampler.sample(
                ref_assign_result,
                ref_mask_preds[i][:self.num_proposals].detach(),
                ref_gt_masks[i])
            ref_sampling_results.append(ref_sampling_result)
        if self.detach_mask_emd:
            # stop the tracking loss from flowing back into the detector
            object_feats = object_feats.detach()
            ref_obj_feats = ref_obj_feats.detach()

        if self.link_previous:
            object_feats = object_feats_track

        # object_feats is unpacked as a 5-d tensor; only the batch size is
        # needed afterwards
        N, _, _, _, _ = object_feats.shape
        emb_feat = object_feats.squeeze(-2).squeeze(-1)[:, :self.num_proposals, ]

        for emb_layer in self.embed_fcs:
            emb_feat = emb_layer(emb_feat)
        object_feats_embed = self.fc_embed(emb_feat).view(N, self.num_proposals, -1)

        ref_emb_feat = ref_obj_feats.squeeze(-2).squeeze(-1)[:, :self.num_proposals, ]
        for emb_layer in self.embed_fcs:
            ref_emb_feat = emb_layer(ref_emb_feat)
        ref_object_feats_embed = self.fc_embed(ref_emb_feat).view(N, self.num_proposals, -1)

        # Gather the embeddings of the positive proposals of every image.
        # The positive indices of image i refer to image i's proposals, so the
        # batch dim has to be indexed with i. (Slicing the whole batch and
        # squeezing dim 0 only gives the same result for a batch size of 1.)
        key_emb_indexs = [res.pos_inds for res in key_sampling_results]
        object_feats_embed_list = []
        for i in range(len(key_emb_indexs)):
            object_feats_embed_list.append(object_feats_embed[i, key_emb_indexs[i], :])

        key_feats = self._track_forward(object_feats_embed_list)

        ref_emb_indexs = [res.pos_inds for res in ref_sampling_results]
        ref_object_feats_embed_list = []
        for i in range(len(ref_emb_indexs)):
            ref_object_feats_embed_list.append(ref_object_feats_embed[i, ref_emb_indexs[i], :])

        ref_feats = self._track_forward(ref_object_feats_embed_list)

        match_feats = self.track_head.match(key_feats, ref_feats,
                                            key_sampling_results,
                                            ref_sampling_results)

        asso_targets = self.track_head.get_track_targets(
            gt_match_indices, key_sampling_results, ref_sampling_results)
        loss_track = self.track_head.loss(*match_feats, *asso_targets)

        losses.update(loss_track)
        losses.update(rpn_losses)

        return losses

    def simple_test(self, img, img_metas, rescale=False, ref_img=None, **kwargs):
        """Run single-frame video panoptic inference and online tracking.

        The frame index is derived from an image id (``iid % 10000``). On the
        first frame of a clip the tracker is rebuilt and the cached
        previous-frame state is cleared.

        Args:
            img (torch.Tensor): input passed to ``extract_feat``.
            img_metas (list[dict]): image meta information. In the
                cityscapes-only setting ``img_metas[0]['iid']`` is read.
            rescale (bool): not used in this method. Defaults to False.
            ref_img: not used in this method.
            **kwargs: in every setting other than cityscapes-only,
                ``kwargs['img_id'][0].item()`` is read as the image id.

        Returns:
            tuple: ``(semantic_map, track_maps, None, vis_sem, vis_tracker)``
                where ``semantic_map`` and ``track_maps`` are numpy arrays
                shaped like the panoptic map and the last two entries are
                the outputs of the ``scripts.visualizer`` helpers.
        """
        # set the dataset type
        # whether is the first frame for such clips
        if self.cityscapes and not self.kitti_step and not self.cityscapes_short:
            # cityscapes-only: the id comes from the meta dict and the first
            # frame of a clip has frame index 1
            iid = img_metas[0]['iid']
            fid = iid % 10000
            is_first = (fid == 1)
        else:
            # other settings: the id comes from kwargs and clips start at 0
            iid = kwargs['img_id'][0].item()
            fid = iid % 10000
            is_first = (fid == 0)

        if is_first:
            # new clip: fresh tracker and no previous-frame memory
            self.init_tracker()
            self.obj_feats_memory = None
            self.x_feats_memory = None
            self.mask_preds_memory = None

        # for current frame
        x = self.extract_feat(img)
        # current frame inference
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results

        if self.link_previous:
            # condition the ROI head on the state cached from the last frame
            cur_segm_results, obj_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test_with_previous(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                previous_obj_feats=self.obj_feats_memory,
                previous_mask_preds=self.mask_preds_memory,
                previous_x_feats=self.x_feats_memory,
                is_first=is_first,
            )

            # cache the current frame for the next call
            self.obj_feats_memory = obj_feats
            self.x_feats_memory = x_feats
            self.mask_preds_memory = scaled_mask_preds
        else:
            cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas)

        # for tracking part
        # only the first image's result is used; this rebinds mask_preds and
        # query_output to the per-image values
        _, segm_result, mask_preds, panoptic_result, query_output = cur_segm_results[0]
        panoptic_seg, segments_info = panoptic_result

        # get sorted tracking thing ids, labels, masks, score for tracking
        things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \
            self.get_things_id_for_tracking(panoptic_seg, segments_info)
        things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long()

        # get the semantic filter
        if self.semantic_filter:
            # pixels whose arg-max semantic class index is below
            # num_thing_classes are kept (value 1.), all others are zeroed
            seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear',
                                                        align_corners=False)
            seg_preds = seg_preds.sigmoid()
            seg_out = seg_preds.argmax(1)
            semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32)
        else:
            semantic_thing = 1.

        if len(things_labels_for_tracking) > 0:
            # (x1, y1, x2, y2, score) rows; the box columns are filled later
            things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5),
                                                   dtype=torch.float, device=x_feats.device)
            things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking,
                                                          device=things_bbox_for_tracking.device)

            # stack the per-thing masks into one float tensor on the device
            thing_masks_for_tracking_final = []
            for mask in thing_masks_for_tracking:
                thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to(
                    x_feats.device).float())
            thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0)
            thing_masks_for_tracking = thing_masks_for_tracking_final
            thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing

        if len(things_labels_for_tracking) == 0:
            track_feats = None
        else:
            # tracking embeddings
            N, _, _, _ = query_output.shape
            emb_feat = query_output.squeeze(-2).squeeze(-1).unsqueeze(0)  # (n,d,1,1) -> (1,n,d)

            for emb_layer in self.embed_fcs:
                emb_feat = emb_layer(emb_feat)
            object_feats_embed = self.fc_embed(emb_feat).view(1, N, -1)

            object_feats_embed_for_tracking = object_feats_embed.squeeze(0)
            # tracking embedding features
            track_feats = self._track_forward([object_feats_embed_for_tracking])

        if track_feats is not None:
            # boxes are derived from the semantically filtered masks
            things_bbox_for_tracking[:, :4] = torch.tensor(
                tensor_mask2box(thing_masks_for_tracking_with_semantic_filter),
                device=things_bbox_for_tracking.device)
            bboxes, labels, ids = self.tracker.match(
                bboxes=things_bbox_for_tracking,
                labels=things_labels_for_tracking,
                track_feats=track_feats,
                frame_id=fid)
            # shift tracker ids by one
            ids = ids + 1
            # NOTE(review): after the +1 shift an id can only equal -1 if the
            # tracker returned -2, so this line looks like a no-op; a tracker
            # id of -1 already maps to 0 through the shift - confirm intent.
            ids[ids == -1] = 0
        else:
            ids = []

        # NOTE(review): debug output emitted on every frame.
        print("ids", ids)

        track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg)

        semantic_map = self.get_semantic_seg(panoptic_seg, segments_info)

        # imported lazily, so the visualizer is only needed at test time
        from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img
        vis_tracker = trackmap2rgb(track_maps)
        vis_sem = cityscapes_cat2rgb(semantic_map)
        if len(things_labels_for_tracking):
            vis_tracker = draw_bbox_on_img(vis_tracker, things_bbox_for_tracking.cpu().numpy())

        # Visualization usage
        return semantic_map, track_maps, None, vis_sem, vis_tracker

    def _track_forward(self, track_feats, x=None, mask_pred=None):
        """Run the track head on the concatenated per-image features.

        Used in both training and testing.

        Args:
            track_feats (list[Tensor]): features to embed; they are
                concatenated along dim 0 before entering the head.
            x: unused, kept so existing call sites stay valid.
            mask_pred: unused, kept so existing call sites stay valid.

        Returns:
            The output of ``self.track_head`` for the concatenated input.
        """
        merged_feats = torch.cat(track_feats, 0)
        return self.track_head(merged_feats)

    def forward_dummy(self, img):
        """Forward pass with placeholder metas, used for FLOPs counting.

        See `mmdetection/tools/get_flops.py`.

        Args:
            img: input passed to ``extract_feat``; ``len(img)`` sets the
                number of placeholder meta dicts.

        Returns:
            The output of ``self.roi_head.forward_dummy``.
        """
        # backbone (+ neck)
        feats = self.extract_feat(img)
        # one independent placeholder meta dict per image
        dummy_img_metas = []
        for _ in range(len(img)):
            dummy_img_metas.append(dict(img_shape=(800, 1333, 3)))
        # rpn: only the proposal and feature outputs are forwarded
        proposal_feats, x_feats, _, _, _ = self.rpn_head.simple_test_rpn(
            feats, dummy_img_metas)
        return self.roi_head.forward_dummy(x_feats, proposal_feats,
                                           dummy_img_metas)

    def extract_feat(self, img):
        """Return backbone features, passed through the neck when present."""
        feats = self.backbone(img)
        return self.neck(feats) if self.with_neck else feats

    @property
    def with_rpn(self):
        """bool: True when an ``rpn_head`` attribute exists and is not None."""
        return getattr(self, 'rpn_head', None) is not None

    @property
    def with_roi_head(self):
        """bool: True when a ``roi_head`` attribute exists and is not None."""
        return getattr(self, 'roi_head', None) is not None

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Placeholder for test-time augmentation.

        Not implemented: every argument is ignored and ``None`` is returned.
        """
        return None

    def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
        """Collect the thing segments of a panoptic result for tracking.

        Args:
            panoptic_seg: panoptic id map, compared against each segment id
                to build that segment's boolean mask.
            seg_infos (iterable[dict]): segment records; the keys
                ``isthing``, ``id``, ``instance_id``, ``category_id`` and
                ``score`` are read.

        Returns:
            tuple[list, list, list, list]: instance ids, category ids, masks
                and scores of the thing segments, in input order.
        """
        # the explicit comparison with True is kept on purpose
        thing_segments = [seg for seg in seg_infos if seg['isthing'] == True]
        masks = [panoptic_seg == seg["id"] for seg in thing_segments]
        idxs = [seg["instance_id"] for seg in thing_segments]
        labels = [seg['category_id'] for seg in thing_segments]
        score = [seg['score'] for seg in thing_segments]
        return idxs, labels, masks, score

    def pack_things_object(self, object_feats, ref_object_feats):
        """Return the thing-proposal part of key and reference object feats.

        Each input has its last two dimensions squeezed and is then split
        along dim 1 into ``[roi_head.num_proposals, num_stuff_classes]``
        chunks; the first chunk of each is returned.
        """
        chunk_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]

        def thing_part(feats):
            flat = feats.squeeze(-1).squeeze(-1)
            things, _ = torch.split(flat, chunk_sizes, dim=1)
            return things

        return thing_part(object_feats), thing_part(ref_object_feats)

    def pack_things_masks(self, mask_pred, ref_mask_pred):
        """Return the thing-proposal part of key and reference mask preds.

        Each input is split along dim 1 into
        ``[roi_head.num_proposals, num_stuff_classes]`` chunks and the first
        chunk of each is returned.
        """
        chunk_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
        thing_masks, _ = torch.split(mask_pred, chunk_sizes, dim=1)
        ref_thing_masks, _ = torch.split(ref_mask_pred, chunk_sizes, dim=1)
        return thing_masks, ref_thing_masks

    def get_semantic_seg(self, panoptic_seg, segments_info):
        """Convert a panoptic result into a semantic label map.

        Args:
            panoptic_seg: panoptic id map; its ``shape`` sizes the output
                and it is compared against every segment id.
            segments_info (iterable[dict]): segment records; the keys
                ``isthing``, ``id`` and ``category_id`` are read.

        Returns:
            np.ndarray: array shaped like ``panoptic_seg`` holding the
                semantic label of every pixel covered by a segment and 0
                elsewhere.
        """
        # labels given to the two KITTI-STEP thing categories
        kitti_thing_ids = [11, 13]
        semantic_seg = np.zeros(panoptic_seg.shape)
        for seg in segments_info:
            if seg['isthing'] == True:
                if self.kitti_step:
                    label = kitti_thing_ids[seg["category_id"]]
                else:
                    # things are placed after the stuff labels
                    label = seg["category_id"] + self.num_stuff_classes
            elif self.kitti_step:
                # stuff ids are 1-based; shift them to 0-based and then skip
                # over the label slots taken by the thing categories
                label = seg["category_id"]
                label -= 1
                shift = 0
                for thing_id in kitti_thing_ids:
                    if label + shift >= thing_id:
                        shift += 1
                label += shift
            else:
                # for stuff (0- n-1)
                label = seg["category_id"] - 1
            semantic_seg[panoptic_seg == seg["id"]] = label
        return semantic_seg

    def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
        """Paint each tracked mask with its track id.

        Args:
            ids: sequence of track ids; an empty sequence yields an all-zero
                map.
            masks: tensor indexed per id; converted with ``.bool()`` and
                moved to numpy. It is not touched when ``ids`` is empty.
            panopitc_seg_maps: only its ``shape`` is used, for the output.

        Returns:
            np.ndarray: map holding the track id on every pixel of the
                matching mask and 0 elsewhere; later ids overwrite earlier
                ones where masks overlap.
        """
        track_id_map = np.zeros(panopitc_seg_maps.shape)
        if len(ids) > 0:
            bool_masks = masks.bool()
            for idx, track_id in enumerate(ids):
                region = bool_masks[idx].cpu().numpy()
                track_id_map[region] = track_id
        return track_id_map

================================================
FILE: knet/video/knet_quansi_dense_embed_fc_joint_train.py
================================================
import warnings
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from mmcv.cnn import (ConvModule, bias_init_with_prob,
                      build_activation_layer, build_norm_layer)
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import BaseDetector
from mmdet.models.builder import build_head, build_neck, build_backbone, build_roi_extractor
from mmdet.core import build_assigner, build_sampler
from knet.video.qdtrack.builder import build_tracker
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes, sem2ins_masks_kitti_step
from unitrack.mask import tensor_mask2box


@DETECTORS.register_module()
class VideoKNetQuansiEmbedFCJointTrain(BaseDetector):
    """
        Simple Extension of KNet to Video KNet by the implementation of VPSFuse Net.
    """
    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 track_head=None,
                 extra_neck=None,
                 track_localization_fpn=None,
                 tracker=None,
                 train_cfg=None,
                 test_cfg=None,
                 track_train_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 detach_mask_emd=False,
                 cityscapes=False,
                 kitti_step=False,
                 cityscapes_short=False,
                 vipseg=False,
                 freeze_detector=False,
                 semantic_filter=True,
                 # linking parameters
                 link_previous=False,
                 bbox_roi_extractor=None,
                 **kwargs):
        """Build the detector parts, the track head and the embedding layers.

        Args:
            backbone (dict): config passed to ``build_backbone``.
            neck (dict, optional): config passed to ``build_neck``.
            rpn_head (dict, optional): config passed to ``build_head`` after
                the rpn train/test configs are merged in. Although optional
                in the signature, ``self.rpn_head.num_proposals`` is read
                unconditionally, so it has to be provided.
            roi_head (dict, optional): config passed to ``build_head`` after
                the rcnn train/test configs are merged in.
            track_head (dict, optional): config passed to ``build_head``;
                when given, the track assigner and sampler are built too.
            extra_neck (dict, optional): config passed to ``build_neck``.
            track_localization_fpn (dict, optional): neck config, only built
                when ``track_head`` is given.
            tracker (dict, optional): kept as ``self.tracker_cfg``; the
                tracker itself is built later by ``init_tracker``.
            train_cfg: training config; ``.rpn`` and ``.rcnn`` are read when
                it is not None.
            test_cfg: testing config; ``.rpn`` / ``.rcnn`` are read whenever
                the matching head is configured, so it must not be None then.
            track_train_cfg: config holding the track ``assigner`` and
                ``sampler`` settings.
            pretrained (optional): deprecated; triggers a warning and is
                copied onto the backbone and roi_head configs.
            init_cfg (optional): forwarded to the base class constructor.
            num_thing_classes (int): number of thing classes.
            num_stuff_classes (int): number of stuff classes.
            mask_assign_stride (int): divisor applied to the padded input
                size when ground-truth masks are downsampled.
            ignore_label (int): label written into padded semantic regions.
            thing_label_in_seg: forwarded to ``sem2ins_masks`` in the
                generic (non cityscapes / kitti / vipseg) setting.
            detach_mask_emd (bool): stored on the instance.
            cityscapes, kitti_step, cityscapes_short, vipseg (bool): dataset
                switches stored on the instance.
            freeze_detector (bool): when True the rpn and roi heads are
                frozen through ``_freeze_detector``.
            semantic_filter (bool): stored on the instance.
            link_previous (bool): whether the ROI head is run with the
                previous frame's outputs.
            bbox_roi_extractor (dict, optional): config for
                ``build_roi_extractor``, only built when ``track_head`` is
                given.
            **kwargs: accepted and ignored.
        """
        super(VideoKNetQuansiEmbedFCJointTrain, self).__init__(init_cfg)

        if pretrained:
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            backbone.pretrained = pretrained
        self.backbone = build_backbone(backbone)

        if neck is not None:
            self.neck = build_neck(neck)

        if extra_neck is not None:
            self.extra_neck = build_neck(extra_neck)

        if rpn_head is not None:
            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
            # work on a copy so the caller's config is left untouched
            rpn_head_ = rpn_head.copy()
            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
            self.rpn_head = build_head(rpn_head_)

        if roi_head is not None:
            # update train and test cfg here for now
            # TODO: refactor assigner & sampler
            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
            # unlike rpn_head, the roi_head config is updated in place
            roi_head.update(train_cfg=rcnn_train_cfg)
            roi_head.update(test_cfg=test_cfg.rcnn)
            # assumes roi_head supports attribute assignment (e.g. an mmcv
            # ConfigDict) - TODO confirm
            roi_head.pretrained = pretrained
            self.roi_head = build_head(roi_head)

        if track_head is not None:
            self.track_train_cfg = track_train_cfg
            self.track_head = build_head(track_head)
            self.init_track_assigner_sampler()
            if track_localization_fpn is not None:
                self.track_localization_fpn = build_neck(track_localization_fpn)

            if bbox_roi_extractor is not None:
                self.track_roi_extractor = build_roi_extractor(
                    bbox_roi_extractor)

        if tracker is not None:
            # only the config is kept; see init_tracker
            self.tracker_cfg = tracker

        if freeze_detector:
           self._freeze_detector()

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.num_proposals = self.rpn_head.num_proposals
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation
        self.kitti_step = kitti_step  # whether to train the kitti step panoptic segmentation
        self.cityscapes_short = cityscapes_short  # whether to test the cityscape short panoptic segmentation
        self.vipseg = vipseg  # whether to test the vip panoptic segmentation
        self.semantic_filter = semantic_filter
        self.link_previous = link_previous
        self.detach_mask_emd = detach_mask_emd
        # add embedding fcs for the final stage queries
        num_emb_fcs = 1
        act_cfg = dict(type='ReLU', inplace=True)
        in_channels = 256
        out_channels = 256
        self.embed_fcs = nn.ModuleList()
        # each group is Linear (no bias) -> LayerNorm -> ReLU
        for _ in range(num_emb_fcs):
            self.embed_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            self.embed_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.embed_fcs.append(build_activation_layer(act_cfg))

        # final projection producing the tracking embedding
        self.fc_embed = nn.Linear(in_channels, out_channels)

    def init_tracker(self):
        """Build a fresh tracker from ``self.tracker_cfg``.

        The result is stored as ``self.tracker`` and replaces any tracker
        built earlier.
        """
        tracker_cfg = self.tracker_cfg
        self.tracker = build_tracker(tracker_cfg)

    def _freeze_detector(self):
        """Freeze the RPN head and the ROI head.

        Both heads are kept in ``self.detector``, switched to eval mode and
        every one of their parameters stops requiring gradients.
        """
        self.detector = [self.rpn_head, self.roi_head]
        for frozen_module in self.detector:
            frozen_module.eval()
            for weight in frozen_module.parameters():
                weight.requires_grad = False

    def init_track_assigner_sampler(self):
        """Build the assigner and sampler used for the tracking targets.

        Both are configured from ``self.track_train_cfg``; the two
        ``track_share_*`` flags are set to False.
        """
        track_cfg = self.track_train_cfg

        self.track_roi_assigner = build_assigner(track_cfg.assigner)
        self.track_share_assigner = False

        # the sampler receives this detector as its context
        self.track_roi_sampler = build_sampler(track_cfg.sampler, context=self)
        self.track_share_sampler = False

    def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad ground-truth masks to the batch size and downsample them.

        Args:
            img_metas (list[dict]): image metas; ``batch_input_shape`` of the
                first entry and ``img_shape`` of every entry are read.
            gt_masks (list): per-image mask objects exposing ``to_tensor``,
                ``width`` and ``height``.
            gt_labels (list[Tensor]): only the device of the first entry is
                used.
            gt_semantic_seg (Tensor | None): batched semantic maps. When
                given, the padded area is overwritten in place with
                ``self.ignore_label``.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The first
                is a list of masks at ``1 / mask_assign_stride`` resolution;
                the other two are per-image lists, or None when no semantic
                map is given and at least one image was processed.
        """
        # batch_input_shape shoud be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_hw = (pad_H // self.mask_assign_stride,
                     pad_W // self.mask_assign_stride)

        def shrink(stack, like):
            # An empty stack cannot be interpolated: return zeros whose
            # leading size follows the instance-mask tensor instead.
            if stack.shape[0] == 0:
                return like.new_zeros((like.size(0),) + assign_hw)
            return F.interpolate(
                stack[None], assign_hw, mode='bilinear',
                align_corners=False)[0]

        gt_masks_tensor, gt_sem_seg, gt_sem_cls = [], [], []
        for i, gt_mask in enumerate(gt_masks):
            # gt_masks and gt_semantic_seg are not padded when forming batch
            mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
            pad_right = pad_W - gt_mask.width
            pad_bottom = pad_H - gt_mask.height
            if pad_right != 0 or pad_bottom != 0:
                mask_tensor = F.pad(
                    mask_tensor, (0, pad_right, 0, pad_bottom), value=0)

            if gt_semantic_seg is None:
                gt_sem_seg = None
                gt_sem_cls = None
            else:
                # gt_semantic seg is padded by zero when forming a batch
                # need to convert them from 0 to ignore
                img_shape = img_metas[i]['img_shape']
                gt_semantic_seg[i, :, img_shape[0]:, :] = self.ignore_label
                gt_semantic_seg[i, :, :, img_shape[1]:] = self.ignore_label

                if self.cityscapes or self.vipseg:
                    thing_labels = list(range(
                        self.num_stuff_classes,
                        self.num_thing_classes + self.num_stuff_classes))
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=thing_labels)
                elif self.kitti_step:
                    sem_labels, sem_seg = sem2ins_masks_kitti_step(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=2,
                        thing_label_in_seg=(11, 13))
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[i],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                gt_sem_seg.append(shrink(sem_seg, mask_tensor))
                gt_sem_cls.append(sem_labels)

            # downsample to 1/mask_assign_stride resolution
            gt_masks_tensor.append(shrink(mask_tensor, mask_tensor))

        return gt_masks_tensor, gt_sem_cls, gt_sem_seg

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      ref_gt_instance_ids=None,
                      proposals=None,
                      **kwargs):
        """Jointly train segmentation on key/reference frames and tracking.

        The key frame and the reference frame both go through the backbone,
        the RPN head and the ROI head. The final-stage queries of both frames
        are embedded and matched by the track head. The returned dict merges
        the key-frame losses, the reference-frame losses (renamed by
        ``add_ref_loss`` / ``add_ref_rpn_loss``) and the tracking loss.

        Args:
            img (Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): list of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                :class:`mmdet.datasets.pipelines.Collect`.
                ``batch_input_shape`` is written into every dict.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): class indices corresponding to each box
            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
                boxes can be ignored when computing the loss.
            gt_masks (list): per-image instance masks; required.
            gt_semantic_seg (Tensor, optional): batched semantic maps.
            gt_instance_ids (list[Tensor]): per-image instance ids of the
                key frame; required.
            proposals: must be None, external proposals are not supported.

            # This is for video only:
            ref_img (Tensor): reference images. Dim 1 is squeezed, so a
                single reference frame per key image is expected, i.e.
                presumably shape (N, 1, C, H, W) - TODO confirm.

            ref_img_metas (list[list[dict]]): per-sample list of reference
                image info dicts; only the first dict of each inner list is
                used and ``batch_input_shape`` is written into it.

            ref_gt_bboxes (list[Tensor]): not used by this method.

            ref_gt_labels (list[Tensor]): per-sample tensors in
                [ref_img_id, class_indice] format; column 1 is used.

            ref_gt_masks (list): per-sample reference masks; the first entry
                of each element is used.

            ref_gt_semantic_seg (Tensor): reference semantic maps; dim 1 is
                squeezed.

            ref_gt_instance_ids (list[Tensor]): per-sample reference
                instance ids; column 1 is used.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None
        assert gt_instance_ids is not None

        # preprocess the reference images
        ref_img = ref_img.squeeze(1)  # (b,3,h,w)
        ref_masks_gt = [ref_gt_mask[0] for ref_gt_mask in ref_gt_masks]

        ref_labels_gt = [
            ref_gt_label[:, 1].long() for ref_gt_label in ref_gt_labels
        ]
        ref_gt_labels = ref_labels_gt

        ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)

        ref_gt_instance_id_list = [
            ref_gt_instance_id[:, 1].long()
            for ref_gt_instance_id in ref_gt_instance_ids
        ]

        ref_img_metas_new = []
        for ref_img_meta in ref_img_metas:
            ref_img_meta[0]['batch_input_shape'] = batch_input_shape
            ref_img_metas_new.append(ref_img_meta[0])

        # prepare the gt_match_indices: for every key-frame instance, the
        # position of the same instance id in the reference frame, or -1
        gt_pids_list = []
        for i in range(len(ref_gt_instance_id_list)):
            ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist()
            gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist()
            gt_pids = [
                ref_ids.index(gt_id) if gt_id in ref_ids else -1
                for gt_id in gt_ids
            ]
            # wrapping in an extra list keeps an empty id list a valid 1-D
            # tensor after the [0] index
            gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0])

        gt_match_indices = gt_pids_list

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg)
        ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new,
                                                                    ref_masks_gt, ref_gt_labels, ref_semantic_seg_gt)

        x = self.extract_feat(img)
        x_ref = self.extract_feat(ref_img)

        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)

        ref_rpn_results = self.rpn_head.forward_train(x_ref, ref_img_metas_new, ref_gt_masks,
                                                      ref_labels_gt, ref_gt_sem_seg,
                                                      ref_gt_sem_cls)

        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        (ref_rpn_losses, ref_proposal_feats, ref_x_feats, ref_mask_preds,
         ref_cls_scores) = ref_rpn_results

        # NOTE(review): the nested ``ref_img_metas`` is passed here while the
        # RPN head and the track assigner get the flattened
        # ``ref_img_metas_new`` - confirm which one the ROI head expects.
        losses_ref, ref_obj_feats, ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = self.roi_head.forward_train(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas,
            ref_gt_masks,
            ref_gt_labels,
            gt_sem_seg=ref_gt_sem_seg,
            gt_sem_cls=ref_gt_sem_cls,
            imgs_whwh=None)

        if self.link_previous:
            # key frame conditioned on the reference frame outputs
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds, object_feats_track = self.roi_head.forward_train_with_previous(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None,
                previous_obj_feats=ref_obj_feats,
                previous_mask_preds=ref_scaled_mask_preds,
                previous_x_feats=ref_x_feats,
            )
        else:
            # forward to get the current results
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None)
            # Only the linked ROI-head path returns dedicated tracking
            # queries; without it the plain queries are embedded, which
            # avoids an UnboundLocalError on ``object_feats_track``.
            object_feats_track = object_feats

        # ===== Tracking Part -==== #
        # assign both key frame and reference frame tracking targets
        key_sampling_results, ref_sampling_results = [], []
        num_imgs = len(img_metas)

        for i in range(num_imgs):
            assign_result = self.track_roi_assigner.assign(
                scaled_mask_preds[i][:self.num_proposals].detach(), cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                gt_masks[i], gt_labels[i], img_meta=img_metas[i])
            sampling_result = self.track_roi_sampler.sample(
                assign_result,
                mask_preds[i][:self.num_proposals].detach(),
                gt_masks[i])
            key_sampling_results.append(sampling_result)

            ref_assign_result = self.track_roi_assigner.assign(
                ref_scaled_mask_preds[i][:self.num_proposals].detach(), ref_cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                ref_gt_masks[i], ref_gt_labels[i], img_meta=ref_img_metas_new[i])
            ref_sampling_result = self.track_roi_sampler.sample(
                ref_assign_result,
                ref_mask_preds[i][:self.num_proposals].detach(),
                ref_gt_masks[i])
            ref_sampling_results.append(ref_sampling_result)

        # current is tracking object
        N, _, _, _, _ = object_feats_track.shape
        emb_feat = object_feats_track.squeeze(-2).squeeze(-1)[:, :self.num_proposals, ]

        for emb_layer in self.embed_fcs:
            emb_feat = emb_layer(emb_feat)
        object_feats_embed = self.fc_embed(emb_feat).view(N, self.num_proposals, -1)

        ref_emb_feat = ref_obj_feats.squeeze(-2).squeeze(-1)[:, :self.num_proposals, ]
        for emb_layer in self.embed_fcs:
            ref_emb_feat = emb_layer(ref_emb_feat)
        ref_object_feats_embed = self.fc_embed(ref_emb_feat).view(N, self.num_proposals, -1)

        # sampling predicted GT mask
        # Each image's positives are gathered from that image's own
        # embeddings. The earlier ``[:, inds, :].squeeze(0)`` form gives the
        # same result for a batch of one but mixes images for larger batches.
        object_feats_embed_list = [
            object_feats_embed[i, res.pos_inds, :]
            for i, res in enumerate(key_sampling_results)
        ]

        key_feats = self._track_forward(object_feats_embed_list)

        ref_object_feats_embed_list = [
            ref_object_feats_embed[i, res.pos_inds, :]
            for i, res in enumerate(ref_sampling_results)
        ]

        ref_feats = self._track_forward(ref_object_feats_embed_list)

        match_feats = self.track_head.match(key_feats, ref_feats,
                                            key_sampling_results,
                                            ref_sampling_results)

        asso_targets = self.track_head.get_track_targets(
            gt_match_indices, key_sampling_results, ref_sampling_results)
        loss_track = self.track_head.loss(*match_feats, *asso_targets)

        ref_losses = self.add_ref_loss(losses_ref)
        ref_rpn_losses = self.add_ref_rpn_loss(ref_rpn_losses)

        losses.update(ref_rpn_losses)
        losses.update(rpn_losses)
        losses.update(ref_losses)
        losses.update(loss_track)

        return losses

    def simple_test(self, img, img_metas, rescale=False, ref_img=None, **kwargs):
        """Run single-frame inference and associate thing segments over time.

        The tracker and the cached previous-frame state are reset whenever the
        frame is detected as the first one of a sequence.

        Args:
            img: Input passed to ``self.extract_feat``. Only the first entry
                of the segmentation results is consumed below, so this is
                presumably a batch of size 1 — TODO confirm.
            img_metas (list[dict]): List of image information. In the
                Cityscapes-only branch ``img_metas[0]['iid']`` is read as the
                image id.
            rescale (bool): Not used in this method. Defaults to False.
            ref_img: Not used in this method.
            **kwargs: Must contain ``img_id`` when the Cityscapes-only branch
                is not taken; ``kwargs['img_id'][0].item()`` is read as the
                image id.

        Returns:
            tuple: ``(semantic_map, track_maps, None, vis_sem, vis_tracker)``.
                ``semantic_map`` comes from ``self.get_semantic_seg``,
                ``track_maps`` from ``self.generate_track_id_maps``, and
                ``vis_sem`` / ``vis_tracker`` are produced from them by the
                helpers in ``scripts.visualizer``.
        """

        # derive the frame id (image id modulo 10000) and whether this is the
        # first frame; the first frame id is 1 in the Cityscapes-only branch
        # and 0 otherwise
        if self.cityscapes and not self.kitti_step and not self.cityscapes_short and not self.vipseg:
            iid = img_metas[0]['iid']
            fid = iid % 10000
            is_first = (fid == 1)
        else:
            iid = kwargs['img_id'][0].item()
            fid = iid % 10000
            is_first = (fid == 0)

        # for current frame
        x = self.extract_feat(img)
        # current frame inference
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results

        # init tracker and clear the cached previous-frame state
        if is_first:
            self.init_tracker()
            self.obj_feats_memory = None
            self.x_feats_memory = None
            self.mask_preds_memory = None
            print("fid", fid)

        # whether to link the current frame with the previous frame
        if self.link_previous:
            cur_segm_results, obj_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test_with_previous(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                previous_obj_feats=self.obj_feats_memory,
                previous_mask_preds=self.mask_preds_memory,
                previous_x_feats=self.x_feats_memory,
                is_first=is_first
            )
            # cache the current outputs so the next frame can link to them
            self.obj_feats_memory = obj_feats
            self.x_feats_memory = x_feats
            self.mask_preds_memory = scaled_mask_preds
        else:
            cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas)

        # for tracking part; note that mask_preds and query_output are
        # rebound here to the entries of the first segmentation result
        _, segm_result, mask_preds, panoptic_result, query_output = cur_segm_results[0]
        panoptic_seg, segments_info = panoptic_result

        # get sorted tracking thing ids, labels, masks, score for tracking
        things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \
            self.get_things_id_for_tracking(panoptic_seg, segments_info)
        things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long()

        # get the semantic filter: 1 where the semantic prediction's argmax
        # index is below num_thing_classes, 0 elsewhere
        if self.semantic_filter:
            seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear',
                                                        align_corners=False)
            seg_preds = seg_preds.sigmoid()
            seg_out = seg_preds.argmax(1)
            semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32)
        else:
            semantic_thing = 1.

        if len(things_labels_for_tracking) > 0:
            # one row per thing segment; columns 0-3 are filled with the box
            # further below and column 4 holds the segment score
            things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5),
                                                   dtype=torch.float, device=x_feats.device)
            things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking,
                                                          device=things_bbox_for_tracking.device)

            # stack the per-segment masks into a single float tensor
            thing_masks_for_tracking_final = []
            for mask in thing_masks_for_tracking:
                thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to(
                    x_feats.device).float())
            thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0)
            thing_masks_for_tracking = thing_masks_for_tracking_final
            thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing

        if len(things_labels_for_tracking) == 0:
            track_feats = None
        else:
            # tracking embeddings
            N, _, _, _ = query_output.shape
            emb_feat = query_output.squeeze(-2).squeeze(-1).unsqueeze(0)  # (n,d,1,1) -> (1,n,d)

            for emb_layer in self.embed_fcs:
                emb_feat = emb_layer(emb_feat)
            object_feats_embed = self.fc_embed(emb_feat).view(1, N, -1)
            object_feats_embed_for_tracking = object_feats_embed.squeeze(0)
            track_feats = self._track_forward([object_feats_embed_for_tracking])

        if track_feats is not None:
            # boxes are derived from the semantically filtered masks
            things_bbox_for_tracking[:, :4] = torch.tensor(tensor_mask2box(thing_masks_for_tracking_with_semantic_filter),
                                                           device=things_bbox_for_tracking.device)
            bboxes, labels, ids = self.tracker.match(
                bboxes=things_bbox_for_tracking,
                labels=things_labels_for_tracking,
                track_feats=track_feats,
                frame_id=fid)

            # shift ids by one; 0 is the value left in the track map where no
            # mask is written
            ids = ids + 1
            # NOTE(review): after the +1 shift above this can only match ids
            # that were -2 before the shift — confirm the intended order.
            ids[ids == -1] = 0

            # print("track feats:", track_feats[0])
            # print("id", ids)

        else:
            ids = []


        track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg)

        semantic_map = self.get_semantic_seg(panoptic_seg, segments_info)

        # imported locally, so scripts.visualizer is only required when this
        # method runs
        from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img
        vis_tracker = trackmap2rgb(track_maps)
        vis_sem = cityscapes_cat2rgb(semantic_map)
        if len(things_labels_for_tracking):
            vis_tracker = draw_bbox_on_img(vis_tracker, things_bbox_for_tracking.cpu().numpy())

        # Visualization usage
        return semantic_map, track_maps, None, vis_sem, vis_tracker

    def _track_forward(self, track_feats, x=None, mask_pred=None):
        """Apply the track head to the concatenated embeddings.

        Used in both training and testing. ``track_feats`` is a sequence of
        tensors that is concatenated along dim 0 before being passed to
        ``self.track_head``. ``x`` and ``mask_pred`` are accepted but not used.
        """
        stacked_feats = torch.cat(track_feats, 0)
        return self.track_head(stacked_feats)

    def forward_dummy(self, img, img_metas=None):
        """Forward pass with placeholder metas, used for computing FLOPs.

        See `mmdetection/tools/get_flops.py`. ``img_metas`` is ignored; one
        placeholder meta dict is created per entry of ``img`` instead.
        """
        # backbone (+ neck)
        feats = self.extract_feat(img)
        # rpn
        placeholder_metas = [{'img_shape': (0, 0, 3)} for _ in range(len(img))]
        (proposal_feats, x_feats, mask_preds, cls_scores,
         _seg_preds) = self.rpn_head.simple_test_rpn(feats, placeholder_metas)
        # roi head
        return self.roi_head.simple_test_mask_preds(
            x_feats, proposal_feats, mask_preds, cls_scores,
            placeholder_metas)

    def extract_feat(self, img):
        """Run the backbone, followed by the neck when ``with_neck`` is set."""
        feats = self.backbone(img)
        return self.neck(feats) if self.with_neck else feats

    @property
    def with_rpn(self):
        """bool: True when an ``rpn_head`` attribute exists and is not None."""
        return getattr(self, 'rpn_head', None) is not None

    @property
    def with_roi_head(self):
        """bool: True when a ``roi_head`` attribute exists and is not None."""
        return getattr(self, 'roi_head', None) is not None

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Test with augmentations (not implemented).

        This method performs no work and always returns ``None``.
        """
        return None

    def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
        """Collect the thing segments of a panoptic result.

        Args:
            panoptic_seg: Panoptic id map; compared against each segment's
                ``"id"`` to obtain its mask.
            seg_infos: Iterable of segment dicts with the keys ``isthing``,
                ``id``, ``instance_id``, ``category_id`` and ``score``.

        Returns:
            tuple[list, list, list, list]: instance ids, category ids, masks
                and scores of all segments whose ``isthing`` equals ``True``,
                in the order they appear in ``seg_infos``.
        """
        # equality (not truthiness) is used on purpose to keep the selection
        # identical to the previous implementation
        thing_segments = [seg for seg in seg_infos if seg['isthing'] == True]

        idxs = [seg["instance_id"] for seg in thing_segments]
        labels = [seg['category_id'] for seg in thing_segments]
        masks = [panoptic_seg == seg["id"] for seg in thing_segments]
        score = [seg['score'] for seg in thing_segments]
        return idxs, labels, masks, score

    def pack_things_object(self, object_feats, ref_object_feats):
        """Return the leading (proposal) part of key and reference features.

        The two trailing dimensions are squeezed first. Dim 1 is then split
        into ``[roi_head.num_proposals, num_stuff_classes]`` and only the
        first chunk of each input is returned.
        """
        squeezed = [feats.squeeze(-1).squeeze(-1)
                    for feats in (object_feats, ref_object_feats)]
        thing_parts = [
            torch.split(
                feats,
                [self.roi_head.num_proposals, self.num_stuff_classes],
                dim=1)[0]
            for feats in squeezed
        ]
        return thing_parts[0], thing_parts[1]

    def pack_things_masks(self, mask_pred, ref_mask_pred):
        """Return the leading (proposal) part of key and reference masks.

        Dim 1 of each input is split into
        ``[roi_head.num_proposals, num_stuff_classes]`` and only the first
        chunk is returned.
        """
        thing_parts = [
            torch.split(
                preds,
                [self.roi_head.num_proposals, self.num_stuff_classes],
                dim=1)[0]
            for preds in (mask_pred, ref_mask_pred)
        ]
        return thing_parts[0], thing_parts[1]

    def get_semantic_seg(self, panoptic_seg, segments_info):
        """Convert a panoptic result into a per-pixel semantic label map.

        Label rules, as implemented below:

        * thing, ``kitti_step``: ``category_id`` indexes into ``[11, 13]``.
        * thing, otherwise: ``category_id + num_stuff_classes``.
        * stuff, ``kitti_step``: ``category_id - 1``, then incremented once
          for each of 11 and 13 that the running value reaches or passes.
        * stuff, otherwise: ``category_id - 1``.

        Pixels that belong to no listed segment stay 0.

        Returns:
            np.ndarray: float array with the shape of ``panoptic_seg``.
        """
        kitti_thing_ids = [11, 13]
        semantic_seg = np.zeros(panoptic_seg.shape)

        for segment in segments_info:
            is_thing = segment['isthing'] == True
            if is_thing and self.kitti_step:
                label = kitti_thing_ids[segment["category_id"]]
            elif is_thing:
                # city and vip_seg
                label = segment["category_id"] + self.num_stuff_classes
            elif self.kitti_step:
                # stuff ids are 1-based; step over the ids used by things
                label = segment["category_id"] - 1
                shift = 0
                for thing_id in kitti_thing_ids:
                    if label + shift >= thing_id:
                        shift += 1
                label += shift
            else:
                # city and vip_seg
                label = segment["category_id"] - 1
            semantic_seg[panoptic_seg == segment["id"]] = label

        return semantic_seg

    def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
        """Paint each track id into a map using its mask.

        Args:
            ids: Track ids, one per mask. When empty, an all-zero map is
                returned and ``masks`` is not touched.
            masks: Tensor of per-instance masks; converted with ``.bool()``
                and indexed by the position of the id.
            panopitc_seg_maps: Only its ``shape`` is used, for the output.

        Returns:
            np.ndarray: float array holding the track id at every masked
                pixel and 0 elsewhere. Later masks overwrite earlier ones.
        """
        track_map = np.zeros(panopitc_seg_maps.shape)

        if len(ids) != 0:
            bool_masks = masks.bool()
            for idx, track_id in enumerate(ids):
                track_map[bool_masks[idx].cpu().numpy()] = track_id

        return track_map

    def add_ref_loss(self, loss_dict):
        """Return a copy of ``loss_dict`` with ``"_ref"`` appended to each key."""
        return {str(name) + "_ref": value for name, value in loss_dict.items()}

    def add_ref_rpn_loss(self, loss_dict):
        """Return a copy of ``loss_dict`` with ``"_ref_rpn"`` appended to each key."""
        return {str(name) + "_ref_rpn": value
                for name, value in loss_dict.items()}

================================================
FILE: knet/video/knet_quansi_dense_embed_fc_toy_exp.py
================================================
import warnings
import numpy as np
import torch
import torch.nn.functional as F
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import BaseDetector
from mmdet.models.builder import build_head, build_neck, build_backbone, build_roi_extractor
from mmdet.core import build_assigner, build_sampler
from knet.video.qdtrack.builder import build_tracker
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes, sem2ins_masks_kitti_step
from unitrack.mask import tensor_mask2box


@DETECTORS.register_module()
class VideoKNetQuansiEmbedFCToy(BaseDetector):
    """
        Simple Extension of KNet to Video KNet by directly propagation the kernels.
    """
    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 track_head=None,
                 extra_neck=None,
                 track_localization_fpn=None,
                 track_mhsa=False,
                 tracker=None,
                 train_cfg=None,
                 test_cfg=None,
                 track_train_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 detach_mask_emd=False,
                 cityscapes=False,
                 kitti_step=False,
                 freeze_detector=False,
                 semantic_filter=True,
                 link_previous=False,
                 bbox_roi_extractor=dict(
                     type='SingleRoIExtractor',
                     roi_layer=dict(
                         type='RoIAlign', output_size=7, sampling_ratio=2),
                     out_channels=256,
                     featmap_strides=[4, 8, 16, 32]),
                 **kwargs):
        """Build the sub-modules from their configs and store the options.

        Args:
            backbone: Config passed to ``build_backbone``.
            neck: Optional config passed to ``build_neck``.
            rpn_head: Optional head config. A copy is updated with
                ``train_cfg.rpn`` / ``test_cfg.rpn`` before being built.
                ``self.rpn_head.num_proposals`` is read unconditionally
                below, so the head is effectively required.
            roi_head: Optional head config. It is updated in place with
                ``train_cfg.rcnn`` / ``test_cfg.rcnn`` before being built.
            track_head: Optional head config. When given, the track assigner
                and sampler are initialised from ``track_train_cfg`` as well.
            extra_neck: Optional config passed to ``build_neck``.
            track_localization_fpn: Optional neck config; only built when
                ``track_head`` is given.
            track_mhsa: Stored as ``self.track_mhsa``.
            tracker: Optional tracker config, stored as ``self.tracker_cfg``
                and built later by ``init_tracker``.
            train_cfg: Training config; ``.rpn`` / ``.rcnn`` are read when
                it is not None.
            test_cfg: Testing config; ``.rpn`` / ``.rcnn`` are read without
                a None check whenever the matching head is given.
            track_train_cfg: Config providing ``assigner`` and ``sampler``
                for the track head.
            pretrained: Deprecated; when truthy it is written to
                ``backbone.pretrained`` and a warning is issued.
            init_cfg: Passed to the base class constructor.
            num_thing_classes: Stored as ``self.num_thing_classes``.
            num_stuff_classes: Stored as ``self.num_stuff_classes``.
            mask_assign_stride: Stored as ``self.mask_assign_stride``.
            ignore_label: Stored as ``self.ignore_label``.
            thing_label_in_seg: Stored as ``self.thing_label_in_seg``.
            detach_mask_emd: Stored as ``self.detach_mask_emd``.
            cityscapes: Stored as ``self.cityscapes``.
            kitti_step: Stored as ``self.kitti_step``.
            freeze_detector: When truthy, ``_freeze_detector`` is called.
            semantic_filter: Stored as ``self.semantic_filter``.
            link_previous: Stored as ``self.link_previous``.
            bbox_roi_extractor: Optional RoI extractor config; only built
                when ``track_head`` is given.
            **kwargs: Accepted but not used in this constructor.
        """
        super(VideoKNetQuansiEmbedFCToy, self).__init__(init_cfg)

        if pretrained:
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            backbone.pretrained = pretrained
        self.backbone = build_backbone(backbone)

        if neck is not None:
            self.neck = build_neck(neck)

        if extra_neck is not None:
            self.extra_neck = build_neck(extra_neck)

        if rpn_head is not None:
            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
            # work on a copy so the caller's rpn_head config is not modified
            rpn_head_ = rpn_head.copy()
            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
            self.rpn_head = build_head(rpn_head_)

        if roi_head is not None:
            # update train and test cfg here for now
            # TODO: refactor assigner & sampler
            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
            # unlike rpn_head, the roi_head config is modified in place
            roi_head.update(train_cfg=rcnn_train_cfg)
            roi_head.update(test_cfg=test_cfg.rcnn)
            roi_head.pretrained = pretrained
            self.roi_head = build_head(roi_head)

        if track_head is not None:
            self.track_train_cfg = track_train_cfg
            self.track_head = build_head(track_head)
            self.init_track_assigner_sampler()
            if track_localization_fpn is not None:
                self.track_localization_fpn = build_neck(track_localization_fpn)
            if bbox_roi_extractor is not None:
                self.track_roi_extractor = build_roi_extractor(
                    bbox_roi_extractor)

        # the tracker itself is built lazily in init_tracker()
        if tracker is not None:
            self.tracker_cfg = tracker

        if freeze_detector:
           self._freeze_detector()

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.num_proposals = self.rpn_head.num_proposals
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation
        self.kitti_step = kitti_step  # whether to train the kitti step panoptic segmentation

        self.semantic_filter = semantic_filter
        self.link_previous = link_previous
        self.detach_mask_emd = detach_mask_emd
        self.track_mhsa = track_mhsa
        # NOTE(review): the embedding layers below are disabled, so this
        # constructor creates neither ``self.embed_fcs`` nor
        # ``self.fc_embed``, while ``forward_train`` of this class still uses
        # both — confirm they are provided elsewhere before training.
        # add embedding fcs for the final stage queries
        # num_emb_fcs = 1
        # act_cfg = dict(type='ReLU', inplace=True)
        # in_channels = 256
        # out_channels = 256
        # self.embed_fcs = nn.ModuleList()
        # for _ in range(num_emb_fcs):
        #     self.embed_fcs.append(
        #         nn.Linear(in_channels, in_channels, bias=False))
        #     self.embed_fcs.append(
        #         build_norm_layer(dict(type='LN'), in_channels)[1])
        #     self.embed_fcs.append(build_activation_layer(act_cfg))
        #
        # self.fc_embed = nn.Linear(in_channels, out_channels)

    def init_tracker(self):
        """Build a fresh tracker from ``self.tracker_cfg`` and store it."""
        tracker_cfg = self.tracker_cfg
        self.tracker = build_tracker(tracker_cfg)

    def _freeze_detector(self):
        """Put the RPN and RoI heads in eval mode and freeze their weights.

        The two heads are also stored, in that order, as ``self.detector``.
        """
        self.detector = [self.rpn_head, self.roi_head]
        for head in self.detector:
            head.eval()
            for weight in head.parameters():
                weight.requires_grad = False

    def init_track_assigner_sampler(self):
        """Build the track assigner and sampler from ``self.track_train_cfg``.

        Both ``track_share_assigner`` and ``track_share_sampler`` are set to
        False.
        """
        track_cfg = self.track_train_cfg

        self.track_roi_assigner = build_assigner(track_cfg.assigner)
        self.track_share_assigner = False

        self.track_roi_sampler = build_sampler(track_cfg.sampler, context=self)
        self.track_share_sampler = False

    def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad ground-truth masks to the batch shape and resize them.

        Instance masks are zero-padded to ``img_metas[0]['batch_input_shape']``
        and bilinearly resized to that shape divided by
        ``self.mask_assign_stride``. When ``gt_semantic_seg`` is given, its
        padded area is overwritten in place with ``self.ignore_label`` and it
        is converted by the dataset-specific ``sem2ins_masks*`` helper before
        being resized the same way.

        Args:
            img_metas (list[dict]): Per-image meta dicts; ``img_shape`` is
                read per image, ``batch_input_shape`` from the first one.
            gt_masks: Per-image mask objects offering ``to_tensor``,
                ``width`` and ``height``.
            gt_labels: Per-image labels; only the device of the first entry
                is used.
            gt_semantic_seg: Batched semantic map or None.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The last
                two are None when ``gt_semantic_seg`` is None and at least
                one image was processed.
        """
        # gt_masks and gt_semantic_seg are not padded when forming batch;
        # batch_input_shape should be the same across images
        batch_h, batch_w = img_metas[0]['batch_input_shape']
        out_size = (batch_h // self.mask_assign_stride,
                    batch_w // self.mask_assign_stride)

        def downsample(stack):
            # bilinear resize of an (n, h, w) stack to the assignment size
            return F.interpolate(
                stack[None], out_size,
                mode='bilinear',
                align_corners=False)[0]

        inst_masks_out, sem_cls_out, sem_seg_out = [], [], []

        for idx, inst_masks in enumerate(gt_masks):
            mask_t = inst_masks.to_tensor(torch.float, gt_labels[0].device)
            if inst_masks.width != batch_w or inst_masks.height != batch_h:
                mask_t = F.pad(
                    mask_t,
                    (0, batch_w - inst_masks.width,
                     0, batch_h - inst_masks.height),
                    value=0)

            if gt_semantic_seg is None:
                sem_seg_out = None
                sem_cls_out = None
            else:
                # gt_semantic_seg is padded by zero when forming a batch;
                # mark everything outside the valid image area as ignore
                gt_semantic_seg[
                    idx, :, img_metas[idx]['img_shape'][0]:, :] = self.ignore_label
                gt_semantic_seg[
                    idx, :, :, img_metas[idx]['img_shape'][1]:] = self.ignore_label

                if self.cityscapes:
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=list(range(
                            self.num_stuff_classes,
                            self.num_thing_classes + self.num_stuff_classes)))
                elif self.kitti_step:
                    sem_labels, sem_seg = sem2ins_masks_kitti_step(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=2,
                        thing_label_in_seg=(11, 13))
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                if sem_seg.shape[0] == 0:
                    sem_seg_out.append(
                        mask_t.new_zeros((mask_t.size(0),) + out_size))
                else:
                    sem_seg_out.append(downsample(sem_seg))
                sem_cls_out.append(sem_labels)

            if mask_t.shape[0] == 0:
                inst_masks_out.append(
                    mask_t.new_zeros((mask_t.size(0),) + out_size))
            else:
                # downsample to 1 / mask_assign_stride resolution
                inst_masks_out.append(downsample(mask_t))

        return inst_masks_out, sem_cls_out, sem_seg_out

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      ref_gt_instance_ids=None,
                      proposals=None,
                      **kwargs):
        """Forward function of the video K-Net in train stage.

        The key frame is run through the RPN and RoI heads in training mode.
        The reference frame is run through the RPN head in eval mode and
        through ``roi_head.simple_test_mask_preds``. Embeddings of the
        positive samples of both frames are then matched by the track head
        to produce the tracking loss.

        Args:
            img (Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): list of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                :class:`mmdet.datasets.pipelines.Collect`.
                'batch_input_shape' is written into every dict here.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
                Only forwarded to the RoI head.
            gt_labels (list[Tensor]): class indices corresponding to each box
            gt_bboxes_ignore (None | list[Tensor): specify which bounding
                boxes can be ignored when computing the loss.
            gt_masks: Per-image ground-truth instance masks. Required
                (asserted not None).
            gt_semantic_seg: Ground-truth semantic map of the key frames,
                passed to ``preprocess_gt_masks``.
            gt_instance_ids: Per-image instance ids of the key frames.
                Required (asserted not None); matched against the reference
                instance ids to build the track targets.
            proposals (List[Tensor], optional): must be None; external
                proposals are not supported.

            # This is for video only:
            ref_img (Tensor): reference images. ``squeeze(1)`` is applied to
                obtain a (N, C, H, W) tensor, so presumably the input is
                (N, 1, C, H, W) with one reference image per key image —
                TODO confirm.

            ref_img_metas (list[list[dict]]): per-sample list of reference
                image info; only element 0 of each inner list is used and
                'batch_input_shape' is written into it.

            ref_gt_bboxes (list[Tensor]): not used in this method.

            ref_gt_labels (list[Tensor]): per-sample tensor; column 1 is
                read as the class index.

            ref_gt_masks: per-sample sequence; element 0 of each entry is
                used as the reference mask object.

            ref_gt_semantic_seg: reference semantic map; ``squeeze(1)`` is
                applied before preprocessing.

            ref_gt_instance_ids (list[Tensor]): per-sample tensor; column 1
                is read as the instance id.

        Returns:
            dict[str, Tensor]: a dictionary of loss components: the RoI head
                losses updated with the track loss and the RPN losses.
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None
        assert gt_instance_ids is not None

        # preprocess the reference images
        ref_img = ref_img.squeeze(1)  # (b,3,h,w)
        img_h, img_w = batch_input_shape
        ref_masks_gt = []
        for ref_gt_mask in ref_gt_masks:
            ref_masks_gt.append(ref_gt_mask[0])

        # keep only column 1 (the class index) of the reference labels
        ref_labels_gt = []
        for ref_gt_label in ref_gt_labels:
            ref_labels_gt.append(ref_gt_label[:, 1].long())
        ref_gt_labels = ref_labels_gt

        ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)

        # keep only column 1 (the instance id) of the reference instance ids
        ref_gt_instance_id_list = []
        for ref_gt_instance_id in ref_gt_instance_ids:
            ref_gt_instance_id_list.append(ref_gt_instance_id[:,1].long())

        ref_img_metas_new = []
        for ref_img_meta in ref_img_metas:
            ref_img_meta[0]['batch_input_shape'] = batch_input_shape
            ref_img_metas_new.append(ref_img_meta[0])

        # prepare the gt_match_indices: for every key-frame instance, the
        # index of the same instance id in the reference frame, or -1 when
        # it does not appear there
        gt_pids_list = []
        for i in range(len(ref_gt_instance_id_list)):
            ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist()
            gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist()
            gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids]
            gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0])

        gt_match_indices = gt_pids_list

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg)
        ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new,
                                                                    ref_masks_gt, ref_gt_labels, ref_semantic_seg_gt)

        x = self.extract_feat(img)
        x_ref = self.extract_feat(ref_img)

        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)

        # simple forward to get the reference results; the RPN head is
        # switched to eval mode for this call and back to train mode after
        self.rpn_head.eval()
        ref_rpn_results = self.rpn_head.simple_test_rpn(x_ref, ref_img_metas_new)
        self.rpn_head.train()

        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        (ref_proposal_feats, ref_x_feats, ref_mask_preds,
         ref_cls_scores, ref_seg_preds) = ref_rpn_results

        # reference frame: RoI head forward without loss computation
        ref_obj_feats,  ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = self.roi_head.simple_test_mask_preds(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
           )

        if self.link_previous:
            # key frame forward that also receives the reference-frame
            # outputs as "previous" inputs
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds, object_feats_track = self.roi_head.forward_train_with_previous(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None,
                previous_obj_feats=ref_obj_feats,
                previous_mask_preds=ref_scaled_mask_preds,
                previous_x_feats=ref_x_feats,
            )
        else:
            # forward to get the current results
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None)

        # ===== Tracking Part -==== #
        # assign both key frame and reference frame tracking targets; only
        # the first num_proposals predictions and the first
        # num_thing_classes score columns are used, all detached
        key_sampling_results, ref_sampling_results = [], []
        num_imgs = len(img_metas)

        for i in range(num_imgs):
            assign_result = self.track_roi_assigner.assign(
                scaled_mask_preds[i][:self.num_proposals].detach(), cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                gt_masks[i], gt_labels[i], img_meta=img_metas[i])
            sampling_result = self.track_roi_sampler.sample(
                assign_result,
                mask_preds[i][:self.num_proposals].detach(),
                gt_masks[i])
            key_sampling_results.append(sampling_result)

            ref_assign_result = self.track_roi_assigner.assign(
                ref_scaled_mask_preds[i][:self.num_proposals].detach(), ref_cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                ref_gt_masks[i], ref_gt_labels[i], img_meta=ref_img_metas_new[i])
            ref_sampling_result = self.track_roi_sampler.sample(
                ref_assign_result,
                ref_mask_preds[i][:self.num_proposals].detach(),
                ref_gt_masks[i])
            ref_sampling_results.append(ref_sampling_result)
        # optionally stop gradients from the tracking branch into the
        # object features
        if self.detach_mask_emd:
            object_feats = object_feats.detach()
            ref_obj_feats = ref_obj_feats.detach()

        # NOTE(review): this replaces the features detached just above, so
        # detach_mask_emd has no effect on the key-frame features when
        # link_previous is set — confirm this is intended.
        if self.link_previous:
            object_feats = object_feats_track

        N, num_proposal, _, _, _ = object_feats.shape
        emb_feat = object_feats.squeeze(-2).squeeze(-1)[:, :self.num_proposals, ]

        # NOTE(review): ``self.embed_fcs`` and ``self.fc_embed`` are not
        # assigned in this class's ``__init__`` (their construction is
        # commented out there). Unless they are provided elsewhere this
        # presumably raises AttributeError — confirm before training.
        for emb_layer in self.embed_fcs:
            emb_feat = emb_layer(emb_feat)
        object_feats_embed = self.fc_embed(emb_feat).view(N, self.num_proposals, -1)


        # the reference features go through the same embedding layers
        ref_emb_feat = ref_obj_feats.squeeze(-2).squeeze(-1)[:, :self.num_proposals, ]
        for emb_layer in self.embed_fcs:
            ref_emb_feat = emb_layer(ref_emb_feat)
        ref_object_feats_embed = self.fc_embed(ref_emb_feat).view(N, self.num_proposals, -1)

        # sampling predicted GT mask: keep the embeddings at the positive
        # indices of each sampling result
        # NOTE(review): the indexing below selects across the whole batch
        # dimension and then squeezes dim 0, which looks like it assumes a
        # batch size of 1 — confirm.
        key_emb_indexs = [res.pos_inds for res in key_sampling_results]
        object_feats_embed_list = []
        for i in range(len(key_emb_indexs)):
            object_feats_embed_list.append(object_feats_embed[:, key_emb_indexs[i], :].squeeze(0))

        key_feats = self._track_forward(object_feats_embed_list)

        ref_emb_indexs = [res.pos_inds for res in ref_sampling_results]
        ref_object_feats_embed_list = []
        for i in range(len(ref_emb_indexs)):
            ref_object_feats_embed_list.append(ref_object_feats_embed[:, ref_emb_indexs[i], :].squeeze(0))

        ref_feats = self._track_forward(ref_object_feats_embed_list)

        match_feats = self.track_head.match(key_feats, ref_feats,
                                            key_sampling_results,
                                            ref_sampling_results)

        asso_targets = self.track_head.get_track_targets(
            gt_match_indices, key_sampling_results, ref_sampling_results)
        loss_track = self.track_head.loss(*match_feats, *asso_targets)

        # merge the track and RPN losses into the RoI head losses
        losses.update(loss_track)
        losses.update(rpn_losses)

        return losses

    def simple_test(self, img, img_metas, rescale=False, ref_img=None, **kwargs):
        """Run single-frame inference and link thing segments across frames.

        The tracker and the per-clip memories are re-initialised whenever the
        current frame is detected as the first frame of a clip.

        Args:
            img (torch.Tensor): Input passed to ``extract_feat``.
            img_metas (list[dict]): Image meta information. ``img_metas[0]``
                must provide ``'iid'`` when ``self.cityscapes`` is set and
                ``self.kitti_step`` is not.
            rescale (bool): Not used by this method. Defaults to False.
            ref_img: Not used by this method.
            **kwargs: Must provide ``'img_id'`` (its first element is read
                with ``.item()``) whenever the ``'iid'`` branch is not taken.

        Returns:
            tuple: ``(semantic_map, track_maps, None, vis_sem, vis_tracker)``
                where ``semantic_map`` comes from ``get_semantic_seg``,
                ``track_maps`` from ``generate_track_id_maps`` and the two
                ``vis_*`` entries are the renderings produced by the helpers
                of ``scripts.visualizer``.
        """
        # Derive the frame index inside the clip from the image id and decide
        # whether this is the first frame of a clip. The first frame index is
        # 1 in the 'iid' branch and 0 in the 'img_id' branch.
        if self.cityscapes and not self.kitti_step:
            iid = img_metas[0]['iid']
            fid = iid % 10000
            is_first = (fid == 1)
        else:
            iid = kwargs['img_id'][0].item()
            fid = iid % 10000
            is_first = (fid == 0)

        # A new clip starts: rebuild the tracker and clear the memories.
        if is_first:
            self.init_tracker()
            self.obj_feats_memory = None
            self.x_feats_memory = None
            self.mask_preds_memory = None

        # for current frame
        x = self.extract_feat(img)
        # current frame inference
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results

        cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas)

        # for tracking part
        # Only the first image's result is used; this rebinds ``mask_preds``
        # and ``query_output`` to the per-image entries of that result.
        _, segm_result, mask_preds, panoptic_result, query_output = cur_segm_results[0]
        panoptic_seg, segments_info = panoptic_result


        # Collect instance ids, labels, masks and scores of the thing
        # segments of the panoptic result; these are the tracking candidates.
        things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \
            self.get_things_id_for_tracking(panoptic_seg, segments_info)
        things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long()

        # get the semantic filter
        # When enabled, ``semantic_thing`` is 1.0 on pixels whose arg-max
        # semantic channel index is below ``num_thing_classes`` and 0.0
        # elsewhere; otherwise it is the neutral factor 1.
        if self.semantic_filter:
            seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear',
                                                        align_corners=False)
            seg_preds = seg_preds.sigmoid()
            seg_out = seg_preds.argmax(1)
            semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32)
        else:
            semantic_thing = 1.

        if len(things_labels_for_tracking) > 0:
            # One row per thing: columns 0-3 hold the box (filled in below),
            # column 4 holds the segment score.
            things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5),
                                                   dtype=torch.float, device=x_feats.device)
            things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking,
                                                          device=things_bbox_for_tracking.device)

            # Stack the per-thing masks into one float tensor on the device
            # of ``x_feats``.
            thing_masks_for_tracking_final = []
            for mask in thing_masks_for_tracking:
                thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to(
                    x_feats.device).float())
            thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0)
            thing_masks_for_tracking = thing_masks_for_tracking_final
            # The filtered masks are only used to compute the boxes.
            thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing

        if len(things_labels_for_tracking) == 0:
            track_feats = None
        else:
            # tracking embeddings
            # NOTE(review): ``N`` is not used afterwards.
            N, _, _, _ = query_output.shape
            emb_feat = query_output.squeeze(-2).squeeze(-1).unsqueeze(0)  # (n,d,1,1) -> (1,n,d)

            # The embedding layers are disabled here, so the raw query output
            # is used directly as tracking feature:
            # for emb_layer in self.embed_fcs:
            #     emb_feat = emb_layer(emb_feat)
            # object_feats_embed = self.fc_embed(emb_feat).view(1, N, -1)

            track_feats = emb_feat.squeeze(0)
            # tracking embedding features
            # track_feats = self._track_forward([object_feats_embed_for_tracking])

        if track_feats is not None:
            # ``track_feats`` is only non-None when at least one thing exists,
            # so the box tensor and filtered masks are defined at this point.
            things_bbox_for_tracking[:, :4] = torch.tensor(tensor_mask2box(thing_masks_for_tracking_with_semantic_filter),
                                                           device=things_bbox_for_tracking.device)
            bboxes, labels, ids = self.tracker.match(
                bboxes=things_bbox_for_tracking,
                labels=things_labels_for_tracking,
                track_feats=track_feats,
                frame_id=fid)
            # Shift the ids by one.
            # NOTE(review): the following assignment runs after the shift, so
            # it only affects entries that were -2 before it - confirm intent.
            ids = ids + 1
            ids[ids == -1] = 0
        else:
            ids = []

        # With no things, ``thing_masks_for_tracking`` is still the (empty)
        # list, which ``generate_track_id_maps`` handles via its early return.
        track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg)

        semantic_map = self.get_semantic_seg(panoptic_seg, segments_info)

        # Imported locally, so ``scripts.visualizer`` is only required when
        # this method actually runs.
        from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img
        vis_tracker = trackmap2rgb(track_maps)
        vis_sem = cityscapes_cat2rgb(semantic_map)
        if len(things_labels_for_tracking):
            vis_tracker = draw_bbox_on_img(vis_tracker, things_bbox_for_tracking.cpu().numpy())

        # Visualization usage
        return semantic_map, track_maps, None, vis_sem, vis_tracker

    def _track_forward(self, track_feats, x=None, mask_pred=None):
        """Concatenate per-image track features and run the track head.

        Args:
            track_feats (list[torch.Tensor]): Per-image feature tensors; they
                are concatenated along dim 0.
            x: Unused, kept for interface compatibility.
            mask_pred: Unused, kept for interface compatibility.

        Returns:
            The output of ``self.track_head`` on the concatenated features.
        """
        stacked_feats = torch.cat(track_feats, 0)
        return self.track_head(stacked_feats)

    def forward_dummy(self, img):
        """Run a forward pass with placeholder metas (for FLOPs counting).

        See `mmdetection/tools/get_flops.py`

        Args:
            img: Batch of inputs; ``len(img)`` placeholder meta dicts with
                ``img_shape=(800, 1333, 3)`` are created for it.

        Returns:
            The output of ``self.roi_head.forward_dummy``.
        """
        # backbone (+ neck)
        feats = self.extract_feat(img)
        fake_metas = [{'img_shape': (800, 1333, 3)} for _ in range(len(img))]
        # rpn: only the proposal features and the feature map are consumed
        proposal_feats, x_feats, _, _, _ = self.rpn_head.simple_test_rpn(
            feats, fake_metas)
        return self.roi_head.forward_dummy(x_feats, proposal_feats, fake_metas)

    def extract_feat(self, img):
        """Return backbone features, passed through the neck when present."""
        feats = self.backbone(img)
        return self.neck(feats) if self.with_neck else feats

    @property
    def with_rpn(self):
        """bool: True when an ``rpn_head`` attribute exists and is not None."""
        return getattr(self, 'rpn_head', None) is not None

    @property
    def with_roi_head(self):
        """bool: True when a ``roi_head`` attribute exists and is not None."""
        return getattr(self, 'roi_head', None) is not None

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Placeholder for test-time augmentation.

        Test-time augmentation is not implemented; all arguments are ignored
        and ``None`` is returned.
        """
        return None

    def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
        """Collect the thing segments of a panoptic result.

        Args:
            panoptic_seg: Panoptic id map; compared element-wise against each
                segment ``"id"`` to obtain the segment mask.
            seg_infos (list[dict]): Segment descriptions with the keys
                ``'isthing'``, ``"id"``, ``"instance_id"``, ``'category_id'``
                and ``'score'``.

        Returns:
            tuple[list, list, list, list]: Instance ids, category ids, masks
                and scores of the thing segments, in the order of
                ``seg_infos``.
        """
        # Keep the explicit comparison with True so that only segments whose
        # flag equals True are selected.
        things = [seg for seg in seg_infos if seg['isthing'] == True]  # noqa: E712
        masks = [panoptic_seg == seg["id"] for seg in things]
        idxs = [seg["instance_id"] for seg in things]
        labels = [seg['category_id'] for seg in things]
        score = [seg['score'] for seg in things]
        return idxs, labels, masks, score

    def pack_things_object(self, object_feats, ref_object_feats):
        """Return the thing part of the key and reference object features.

        The two trailing dimensions of each input are squeezed, then dim 1 is
        split into ``[roi_head.num_proposals, num_stuff_classes]`` and only
        the first chunk is kept.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: Thing features of the key frame
                and of the reference frame.
        """
        section_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]

        def _thing_part(feats):
            flat = feats.squeeze(-1).squeeze(-1)
            thing_chunk, _ = torch.split(flat, section_sizes, dim=1)
            return thing_chunk

        return _thing_part(object_feats), _thing_part(ref_object_feats)

    def pack_things_masks(self, mask_pred, ref_mask_pred):
        """Return the thing part of the key and reference mask predictions.

        Dim 1 of each input is split into
        ``[roi_head.num_proposals, num_stuff_classes]`` and only the first
        chunk is kept.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: Thing masks of the key frame
                and of the reference frame.
        """
        section_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
        key_things, _ = torch.split(mask_pred, section_sizes, dim=1)
        ref_things, _ = torch.split(ref_mask_pred, section_sizes, dim=1)
        return key_things, ref_things

    def get_semantic_seg(self, panoptic_seg, segments_info):
        """Convert a panoptic result into a semantic label map.

        Label mapping per segment:

        * thing, ``self.kitti_step`` set: ``category_id`` indexes the table
          ``[11, 13]``.
        * thing, otherwise: ``category_id + self.num_stuff_classes``.
        * stuff, ``self.kitti_step`` set: ``category_id - 1``, then shifted
          upwards so that the labels 11 and 13 are skipped.
        * stuff, otherwise: ``category_id - 1``.

        Args:
            panoptic_seg: Panoptic id map; compared element-wise against each
                segment ``"id"``.
            segments_info (list[dict]): Segment descriptions with the keys
                ``'isthing'``, ``"id"`` and ``"category_id"``.

        Returns:
            np.ndarray: Array of the shape of ``panoptic_seg`` (default numpy
                float dtype) holding the labels; uncovered pixels stay 0.
        """
        kitti_thing_labels = [11, 13]
        semantic_map = np.zeros(panoptic_seg.shape)
        for seg in segments_info:
            is_thing = seg['isthing'] == True  # noqa: E712
            category = seg["category_id"]
            if is_thing:
                if self.kitti_step:
                    label = kitti_thing_labels[category]
                else:
                    label = category + self.num_stuff_classes
            else:
                # for stuff (0- n-1)
                label = category - 1
                if self.kitti_step:
                    # Leave gaps at the labels reserved for the thing classes.
                    shift = 0
                    for thing_label in kitti_thing_labels:
                        if label + shift >= thing_label:
                            shift += 1
                    label += shift
            semantic_map[panoptic_seg == seg["id"]] = label
        return semantic_map

    def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
        """Paint the track id of every thing onto a map.

        Args:
            ids: Sequence of track ids, one per mask.
            masks: Tensor whose i-th entry is the mask of the i-th id. It is
                only touched when ``ids`` is non-empty.
            panopitc_seg_maps: Only its ``shape`` is used, to size the output.

        Returns:
            np.ndarray: Array of the shape of ``panopitc_seg_maps`` holding,
                for every pixel, the id of the last mask covering it and 0
                where no mask applies.
        """
        id_map = np.zeros(panopitc_seg_maps.shape)
        if len(ids) != 0:
            bool_masks = masks.bool()
            for row, track_id in enumerate(ids):
                region = bool_masks[row].cpu().numpy()
                id_map[region] = track_id
        return id_map

================================================
FILE: knet/video/knet_quansi_dense_roi_gt_box.py
================================================
import warnings
import numpy as np
import torch
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import BaseDetector
from mmdet.models.builder import build_head, build_neck, build_backbone, build_roi_extractor
from mmdet.core import build_assigner, build_sampler
from knet.video.qdtrack.builder import build_tracker
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes, sem2ins_masks_kitti_step
from unitrack.mask import tensor_mask2box
from unitrack.utils.mask import mask2box, batch_mask2boxlist, bboxlist2roi

@DETECTORS.register_module()
class VideoKNetQuansiTrackROIGTBox(BaseDetector):
    """
        Simple Extension of KNet to Video KNet by the implementation of VPSFuse Net.
    """
    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 track_head=None,
                 extra_neck=None,
                 track_localization_fpn=None,
                 tracker=None,
                 train_cfg=None,
                 test_cfg=None,
                 track_train_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cityscapes=False,
                 kitti_step=False,
                 freeze_detector=False,
                 semantic_filter=False,
                 # linking parameters
                 link_previous=False,
                 bbox_roi_extractor=dict(
                     type='SingleRoIExtractor',
                     roi_layer=dict(
                         type='RoIAlign', output_size=7, sampling_ratio=2),
                     out_channels=256,
                     featmap_strides=[4, 8, 16, 32]),
                 **kwargs):
        """Build the sub-modules from their configs and store the options.

        Args:
            backbone: Backbone config, built with ``build_backbone``. Its
                ``pretrained`` attribute is set when ``pretrained`` is given.
            neck: Optional neck config, built with ``build_neck``.
            rpn_head: Head config; a copy is extended with the ``rpn`` parts
                of ``train_cfg``/``test_cfg`` and built with ``build_head``.
            roi_head: Head config; updated IN PLACE with the ``rcnn`` parts
                of ``train_cfg``/``test_cfg`` and with ``pretrained``, then
                built with ``build_head``.
            track_head: Optional head config. When given, the track assigner
                and sampler and the track RoI extractor are built as well.
            extra_neck: Optional neck config, stored as ``self.extra_neck``.
            track_localization_fpn: Optional neck config; only built when
                ``track_head`` is given.
            tracker: Tracker config, stored for ``init_tracker``.
            train_cfg: Training config; ``rpn``/``rcnn`` are read when set.
            test_cfg: Testing config; ``rpn``/``rcnn`` are read without a
                None check when the respective head is given.
            track_train_cfg: Config providing ``assigner`` and ``sampler``
                for the track head.
            pretrained: Deprecated; triggers a warning when truthy.
            init_cfg: Forwarded to the base class constructor.
            num_thing_classes (int): Number of thing classes.
            num_stuff_classes (int): Number of stuff classes.
            mask_assign_stride (int): Down-sampling factor applied to the
                ground-truth masks in ``preprocess_gt_masks``.
            ignore_label (int): Label written into the padded area of the
                semantic ground truth.
            thing_label_in_seg: Forwarded to ``sem2ins_masks`` in the default
                (neither cityscapes nor kitti_step) branch.
            cityscapes (bool): Selects the Cityscapes branches.
            kitti_step (bool): Selects the KITTI-STEP branches.
            freeze_detector (bool): When set, ``_freeze_detector`` is called.
            semantic_filter (bool): Stored as ``self.semantic_filter``.
            link_previous (bool): Selects the ``*_with_previous`` RoI head
                entry points.
            bbox_roi_extractor (dict): Config of the track RoI extractor.
            **kwargs: Accepted and ignored.
        """
        super(VideoKNetQuansiTrackROIGTBox, self).__init__(init_cfg)

        if pretrained:
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            backbone.pretrained = pretrained
        self.backbone = build_backbone(backbone)

        if neck is not None:
            self.neck = build_neck(neck)

        if extra_neck is not None:
            self.extra_neck = build_neck(extra_neck)

        if rpn_head is not None:
            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
            # Work on a copy so the caller's rpn_head config stays untouched.
            # NOTE(review): ``test_cfg.rpn`` is read without a None check.
            rpn_head_ = rpn_head.copy()
            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
            self.rpn_head = build_head(rpn_head_)

        if roi_head is not None:
            # update train and test cfg here for now
            # TODO: refactor assigner & sampler
            # NOTE(review): unlike rpn_head, the config is updated in place,
            # and ``test_cfg.rcnn`` is read without a None check.
            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
            roi_head.update(train_cfg=rcnn_train_cfg)
            roi_head.update(test_cfg=test_cfg.rcnn)
            roi_head.pretrained = pretrained
            self.roi_head = build_head(roi_head)

        if track_head is not None:
            # ``track_train_cfg`` must be stored first: it is read by
            # ``init_track_assigner_sampler``.
            self.track_train_cfg = track_train_cfg
            self.track_head = build_head(track_head)
            self.init_track_assigner_sampler()
            if track_localization_fpn is not None:
                self.track_localization_fpn = build_neck(track_localization_fpn)

            self.track_roi_extractor = build_roi_extractor(
                bbox_roi_extractor)

        # NOTE(review): ``tracker_cfg`` stays undefined when ``tracker`` is
        # None, in which case ``init_tracker`` raises AttributeError.
        if tracker is not None:
            self.tracker_cfg = tracker

        if freeze_detector:
           self._freeze_detector()

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        # NOTE(review): requires ``rpn_head`` to have been given, although
        # its default is None.
        self.num_proposals = self.rpn_head.num_proposals
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation
        self.kitti_step = kitti_step  # whether to train the kitti step panoptic segmentation

        self.semantic_filter = semantic_filter
        self.link_previous = link_previous

    def init_tracker(self):
        """Create a fresh tracker from the stored tracker config."""
        tracker_cfg = self.tracker_cfg
        self.tracker = build_tracker(tracker_cfg)

    def _freeze_detector(self):
        """Put the RPN and RoI heads in eval mode and freeze their weights.

        The two heads are also stored as the list ``self.detector``.
        """
        self.detector = [self.rpn_head, self.roi_head]
        for head in self.detector:
            head.eval()
            for weight in head.parameters():
                weight.requires_grad = False

    def init_track_assigner_sampler(self):
        """Build the assigner and sampler used for the tracking targets.

        Both are built from ``self.track_train_cfg``; the two ``share`` flags
        are set to False.
        """
        track_cfg = self.track_train_cfg

        self.track_roi_assigner = build_assigner(track_cfg.assigner)
        self.track_share_assigner = False

        self.track_roi_sampler = build_sampler(track_cfg.sampler, context=self)
        self.track_share_sampler = False

    def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad and down-sample the ground-truth masks of a batch.

        Instance masks are zero-padded to the batch input shape and resized
        bilinearly to ``batch_input_shape // self.mask_assign_stride``. When a
        semantic ground truth is given, its padded area is overwritten IN
        PLACE with ``self.ignore_label``, it is converted into per-class masks
        by the dataset-specific ``sem2ins_masks*`` helper and resized the same
        way.

        Args:
            img_metas (list[dict]): Image metas; ``'batch_input_shape'`` is
                read from the first entry and ``'img_shape'`` from each entry
                when a semantic ground truth is given.
            gt_masks (list): Per-image mask objects providing ``to_tensor``,
                ``width`` and ``height``.
            gt_labels (list[torch.Tensor]): Only the device of the first
                entry is used.
            gt_semantic_seg (torch.Tensor | None): Batched semantic ground
                truth, indexed as ``[i, :, h, w]``.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The last two
                are None when ``gt_semantic_seg`` is None and at least one
                image was processed.
        """
        # batch_input_shape should be the same across images
        batch_h, batch_w = img_metas[0]['batch_input_shape']
        assign_size = (batch_h // self.mask_assign_stride,
                       batch_w // self.mask_assign_stride)
        device = gt_labels[0].device

        def _resize(masks, inst_masks):
            # Empty inputs cannot be interpolated; fall back to zeros with as
            # many rows as the instance mask tensor of the same image.
            if masks.shape[0] == 0:
                return inst_masks.new_zeros((inst_masks.size(0),) + assign_size)
            return F.interpolate(
                masks[None], assign_size,  # downsample to 1/stride resolution
                mode='bilinear',
                align_corners=False)[0]

        gt_masks_tensor, gt_sem_seg, gt_sem_cls = [], [], []
        for idx, gt_mask in enumerate(gt_masks):
            # gt_masks are not padded when forming the batch
            inst_masks = gt_mask.to_tensor(torch.float, device)
            if gt_mask.width != batch_w or gt_mask.height != batch_h:
                padding = (0, batch_w - gt_mask.width,
                           0, batch_h - gt_mask.height)
                inst_masks = F.pad(inst_masks, padding, value=0)

            if gt_semantic_seg is None:
                gt_sem_seg = None
                gt_sem_cls = None
            else:
                # gt_semantic_seg is padded by zero when forming a batch;
                # convert the padded area from 0 to ignore
                valid_h = img_metas[idx]['img_shape'][0]
                gt_semantic_seg[idx, :, valid_h:, :] = self.ignore_label
                valid_w = img_metas[idx]['img_shape'][1]
                gt_semantic_seg[idx, :, :, valid_w:] = self.ignore_label

                if self.cityscapes:
                    thing_ids = list(range(
                        self.num_stuff_classes,
                        self.num_thing_classes + self.num_stuff_classes))
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=thing_ids)
                elif self.kitti_step:
                    sem_labels, sem_seg = sem2ins_masks_kitti_step(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=2,
                        thing_label_in_seg=(11, 13))
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                gt_sem_seg.append(_resize(sem_seg, inst_masks))
                gt_sem_cls.append(sem_labels)

            gt_masks_tensor.append(_resize(inst_masks, inst_masks))

        return gt_masks_tensor, gt_sem_cls, gt_sem_seg

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      ref_gt_instance_ids=None,
                      proposals=None,
                      **kwargs):
        """Compute the segmentation and tracking losses for a key/reference pair.

        The key frame goes through the RPN and RoI heads in training mode;
        the reference frame is only run in inference mode. Track embeddings
        are then extracted for the ground-truth masks assigned to the
        positive samples of both frames and matched by the track head.

        Args:
            img (Tensor): Key images; the last two dims of ``img[0]`` define
                the batch input shape.
            img_metas (list[dict]): Key image metas. ``'batch_input_shape'``
                is written into every dict by this method.
            gt_bboxes (list[Tensor]): Forwarded to the RoI head.
            gt_labels (list[Tensor]): Class indices of the key instances.
            gt_bboxes_ignore (None | list[Tensor]): Forwarded to the RoI head.
            gt_masks (list): Instance masks of the key images. Required.
            gt_semantic_seg (Tensor): Semantic ground truth of the key
                images, forwarded to ``preprocess_gt_masks``.
            gt_instance_ids (list[Tensor]): Instance ids of the key
                instances. Required.
            proposals: Must be None; external proposals are not supported.

            # This is for video only:
            ref_img (Tensor): Reference images. Dim 1 is squeezed away, so
                presumably (N, 1, C, H, W) - TODO confirm against the data
                pipeline.
            ref_img_metas (list[list[dict]]): Per sample, a list whose first
                dict is the reference image meta; it receives
                ``'batch_input_shape'`` as well.
            ref_gt_bboxes_ignore: Not used by this method.
            ref_gt_bboxes: Not used by this method.
            ref_gt_labels (list[Tensor]): Per sample, a 2-D tensor whose
                column 1 holds the class indices (column 0 presumably the
                reference image id - confirm).
            ref_gt_masks (list): Per sample, a container whose first element
                holds the reference masks.
            ref_gt_semantic_seg (Tensor): Reference semantic ground truth;
                dim 1 is squeezed away.
            ref_gt_instance_ids (list[Tensor]): Per sample, a 2-D tensor
                whose column 1 holds the instance ids.
            **kwargs: Accepted and ignored.

        Returns:
            dict[str, Tensor]: The RoI head losses updated with the track
                losses and the RPN losses.
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None
        assert gt_instance_ids is not None

        # preprocess the reference images
        ref_img = ref_img.squeeze(1)  # (b,3,h,w)
        img_h, img_w = batch_input_shape
        # Unwrap the single reference frame of every sample.
        ref_masks_gt = []
        for ref_gt_mask in ref_gt_masks:
            ref_masks_gt.append(ref_gt_mask[0])

        # Column 1 carries the class index.
        ref_labels_gt = []
        for ref_gt_label in ref_gt_labels:
            ref_labels_gt.append(ref_gt_label[:, 1].long())
        ref_gt_labels = ref_labels_gt

        ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)

        # Column 1 carries the instance id.
        ref_gt_instance_id_list = []
        for ref_gt_instance_id in ref_gt_instance_ids:
            ref_gt_instance_id_list.append(ref_gt_instance_id[:,1].long())

        ref_img_metas_new = []
        for ref_img_meta in ref_img_metas:
            ref_img_meta[0]['batch_input_shape'] = batch_input_shape
            ref_img_metas_new.append(ref_img_meta[0])

        # prepare the gt_match_indices
        # For every key instance: the position of the same instance id among
        # the reference instances, or -1 when it does not appear there.
        gt_pids_list = []
        for i in range(len(ref_gt_instance_id_list)):
            ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist()
            gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist()
            gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids]
            gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0])

        gt_match_indices = gt_pids_list

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg)

        ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new,
                                                                    ref_masks_gt, ref_gt_labels, ref_semantic_seg_gt)

        x = self.extract_feat(img)
        x_ref = self.extract_feat(ref_img)

        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)

        # simple forward to get the reference results
        # NOTE(review): train() is called unconditionally afterwards, which
        # also re-enables train mode on a head that ``_freeze_detector`` had
        # put in eval mode - confirm this is intended.
        self.rpn_head.eval()
        ref_rpn_results = self.rpn_head.simple_test_rpn(x_ref, ref_img_metas_new)
        self.rpn_head.train()

        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        (ref_proposal_feats, ref_x_feats, ref_mask_preds,
         ref_cls_scores, ref_seg_preds) = ref_rpn_results

        # Reference frame: inference-only pass through the RoI head.
        ref_obj_feats,  ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = self.roi_head.simple_test_mask_preds(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
           )

        if self.link_previous:
            # Key frame: training pass that additionally receives the
            # reference frame outputs as "previous" frame information.
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train_with_previous(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None,
                previous_obj_feats=ref_obj_feats,
                previous_mask_preds=ref_scaled_mask_preds,
                previous_x_feats=ref_x_feats,
            )
        else:
            # forward to get the current results
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None)

        # ===== Tracking Part -==== #
        # assign both key frame and reference frame tracking targets
        key_sampling_results, ref_sampling_results = [], []
        num_imgs = len(img_metas)

        for i in range(num_imgs):
            # Only the first ``num_proposals`` predictions and the first
            # ``num_thing_classes`` score columns take part in the
            # assignment; all inputs are detached.
            assign_result = self.track_roi_assigner.assign(
                scaled_mask_preds[i][:self.num_proposals].detach(), cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                gt_masks[i], gt_labels[i], img_meta=img_metas[i])
            sampling_result = self.track_roi_sampler.sample(
                assign_result,
                mask_preds[i][:self.num_proposals].detach(),
                gt_masks[i])
            key_sampling_results.append(sampling_result)

            ref_assign_result = self.track_roi_assigner.assign(
                ref_scaled_mask_preds[i][:self.num_proposals].detach(), ref_cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                ref_gt_masks[i], ref_gt_labels[i], img_meta=ref_img_metas_new[i])
            ref_sampling_result = self.track_roi_sampler.sample(
                ref_assign_result,
                ref_mask_preds[i][:self.num_proposals].detach(),
                ref_gt_masks[i])
            ref_sampling_results.append(ref_sampling_result)

        # roi feature embeddings
        # The ground-truth masks of the positive samples are resized to the
        # batch input resolution and binarised.
        # NOTE(review): ``sigmoid(v) > 0.5`` is equivalent to ``v > 0``, so
        # every positive interpolated value becomes 1 - confirm this is
        # intended for ground-truth masks.
        key_masks = [res.pos_gt_masks for res in key_sampling_results]
        for i in range(len(key_masks)):
            key_masks[i] = F.interpolate(key_masks[i].unsqueeze(0),
                                        size=(img_h, img_w), mode="bilinear", align_corners=False).squeeze(0)
            key_masks[i] = (key_masks[i].sigmoid() > 0.5).float()

        key_feats = self._track_forward(x, key_masks)

        # roi feature embeddings
        # Same procedure for the reference frame.
        ref_masks = [res.pos_gt_masks for res in ref_sampling_results]
        for i in range(len(ref_masks)):
            ref_masks[i] = F.interpolate(ref_masks[i].unsqueeze(0),
                                        size=(img_h, img_w), mode="bilinear", align_corners=False).squeeze(0)
            ref_masks[i] = (ref_masks[i].sigmoid() > 0.5).float()

        ref_feats = self._track_forward(x_ref, ref_masks)

        match_feats = self.track_head.match(key_feats, ref_feats,
                                            key_sampling_results,
                                            ref_sampling_results)

        asso_targets = self.track_head.get_track_targets(
            gt_match_indices, key_sampling_results, ref_sampling_results)
        loss_track = self.track_head.loss(*match_feats, *asso_targets)

        # Merge the track and RPN losses into the RoI head losses.
        losses.update(loss_track)
        losses.update(rpn_losses)

        return losses

    def simple_test(self, img, img_metas, rescale=False, ref_img=None, **kwargs):
        """Run single-frame inference and link thing segments across frames.

        A frame index ``fid`` is derived as ``iid % 10000``. On the first
        frame of a clip the tracker is rebuilt and the cross-frame memory
        (object features, feature map, mask predictions) is cleared. The
        panoptic prediction of the current frame is then split into thing
        segments, an embedding is extracted for each of them through
        ``_track_forward`` and the tracker assigns track ids.

        Only the first element of the batch is consumed (``img_metas[0]``,
        ``cur_segm_results[0]``).

        Args:
            img (torch.Tensor): Input passed to ``extract_feat``.
            img_metas (list[dict]): Image meta information. ``'filename'`` of
                the first entry selects how the frame id is obtained; for
                filenames containing "city" the key ``'iid'`` is read too.
                The list is also forwarded to the RPN and RoI heads.
            rescale (bool): Not used by this method.
            ref_img: Not used by this method.
            **kwargs: Must contain ``img_id`` whenever the filename does not
                contain "city"; ``img_id[0].item()`` is used as ``iid``.

        Returns:
            tuple: ``(semantic_map, track_maps, None, vis_sem, vis_tracker)``
                where ``semantic_map`` comes from ``get_semantic_seg``,
                ``track_maps`` from ``generate_track_id_maps`` and the last
                two entries are the visualisations produced by the helpers in
                ``scripts.visualizer``.
        """

        # Decide whether this frame starts a new clip. The index of the first
        # frame differs per dataset branch: 1 for "city" / "motchallenge"
        # filenames, 0 otherwise.
        # assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0]
        if "city" in img_metas[0]['filename']:
            iid = img_metas[0]['iid']
            fid = iid % 10000
            is_first = (fid == 1)
        elif "motchallenge" in img_metas[0]['filename']:
            iid = kwargs['img_id'][0].item()
            fid = iid % 10000
            is_first = (fid == 1)
            if is_first:
                print("First detected on {}".format(fid))
        else:
            iid = kwargs['img_id'][0].item()
            fid = iid % 10000
            is_first = (fid == 0)

        if is_first:
            # New clip: start from a fresh tracker and empty linking memory.
            self.init_tracker()
            self.obj_feats_memory = None
            self.x_feats_memory = None
            self.mask_preds_memory = None

        # for current frame
        x = self.extract_feat(img)
        # current frame inference
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results

        if self.link_previous:
            # Condition the RoI head on the previous frame's outputs and then
            # store the current outputs for the next call.
            cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test_with_previous(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                previous_obj_feats=self.obj_feats_memory,
                previous_mask_preds=self.mask_preds_memory,
                previous_x_feats=self.x_feats_memory,
            )
            self.obj_feats_memory = query_output
            self.x_feats_memory = x_feats
            self.mask_preds_memory = scaled_mask_preds
        else:
            cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas)

        # for tracking part
        _, segm_result, mask_preds, panoptic_result, _ = cur_segm_results[0]
        panoptic_seg, segments_info = panoptic_result

        if self.semantic_filter:
            # Build a per-pixel mask of locations whose arg-max semantic
            # channel index is below ``num_thing_classes``.
            seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear', align_corners=False)
            seg_preds = seg_preds.sigmoid()
            seg_out = seg_preds.argmax(1)
            semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32)
        else:
            # Multiplying by 1. leaves the thing masks untouched.
            semantic_thing = 1.

        # get sorted tracking thing ids, labels, masks, score for tracking
        things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \
            self.get_things_id_for_tracking(panoptic_seg, segments_info)
        things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long()
        if len(things_labels_for_tracking) > 0:
            # One row per thing segment; column 4 holds the segment score and
            # columns 0-3 are filled with the mask box further below.
            things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5),
                                                   dtype=torch.float, device=x_feats.device)
            things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking,
                                                          device=things_bbox_for_tracking.device)

            # Stack the per-segment masks into a single float tensor.
            thing_masks_for_tracking_final = []
            for mask in thing_masks_for_tracking:
                thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to(
                    x_feats.device).float())
            thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0)
            thing_masks_for_tracking = thing_masks_for_tracking_final
            thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing

        if len(things_labels_for_tracking) == 0:
            track_feats = None
        else:
            # tracking embedding features
            track_feats = self._track_forward(x, thing_masks_for_tracking_with_semantic_filter)

        if track_feats is not None:
            # assert len(things_id_for_tracking) == len(things_labels_for_tracking)
            things_bbox_for_tracking[:, :4] = torch.tensor(tensor_mask2box(thing_masks_for_tracking_with_semantic_filter),
                                                           device=things_bbox_for_tracking.device)
            bboxes, labels, ids = self.tracker.match(
                bboxes=things_bbox_for_tracking,
                labels=things_labels_for_tracking,
                track_feats=track_feats,
                frame_id=fid)
            # Shift ids by one so that 0 can act as "no track" in the id map.
            # NOTE(review): after the shift only ids that were -2 can equal
            # -1, so the next line looks like a no-op if the tracker marks
            # unmatched detections with -1 -- confirm the intent.
            ids = ids + 1
            ids[ids == -1] = 0
        else:
            ids = []

        track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg)

        semantic_map = self.get_semantic_seg(panoptic_seg, segments_info)

        # Visualisation helpers are imported lazily, on every call.
        from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img
        vis_tracker = trackmap2rgb(track_maps)
        vis_sem = cityscapes_cat2rgb(semantic_map)
        if len(things_labels_for_tracking):
            vis_tracker = draw_bbox_on_img(vis_tracker, things_bbox_for_tracking.cpu().numpy())

        # Visualization end
        return semantic_map, track_maps, None, vis_sem, vis_tracker


    def _track_forward(self, x, mask_pred):
        """Compute tracking embeddings for the regions given by ``mask_pred``.

        Masks are converted to boxes, the boxes to RoIs, RoI features are
        extracted from the leading feature levels of ``x`` and finally fed
        through the track head. Used in both training and testing.

        Args:
            x: Multi-level features; only the first
                ``track_roi_extractor.num_inputs`` levels are used.
            mask_pred: Masks handed to ``batch_mask2boxlist``. Outside of
                training mode the value is wrapped into a one-element list
                first.

        Returns:
            Output of ``self.track_head`` on the extracted RoI features.
        """
        per_image_masks = mask_pred if self.training else [mask_pred]

        # Negative coordinates are clipped to zero before RoI extraction.
        rois = bboxlist2roi(batch_mask2boxlist(per_image_masks)).clamp(min=0.0)

        roi_feats = self.track_roi_extractor(
            x[:self.track_roi_extractor.num_inputs], rois)
        return self.track_head(roi_feats)

    def forward_dummy(self, img):
        """Run a loss-free forward pass, e.g. for FLOPs counting.

        See `mmdetection/tools/get_flops.py`.

        Args:
            img: Batch of images; ``len(img)`` fixes the number of dummy
                meta dicts that are generated.

        Returns:
            Whatever ``self.roi_head.forward_dummy`` returns.
        """
        feats = self.extract_feat(img)

        # The heads only need an image shape here, so fabricate one per image.
        metas = [dict(img_shape=(800, 1333, 3)) for _ in range(len(img))]

        (proposal_feats, x_feats, _mask_preds, _cls_scores,
         _seg_preds) = self.rpn_head.simple_test_rpn(feats, metas)
        return self.roi_head.forward_dummy(x_feats, proposal_feats, metas)

    def extract_feat(self, img):
        """Return backbone features, passed through the neck when one exists."""
        feats = self.backbone(img)
        return self.neck(feats) if self.with_neck else feats

    @property
    def with_rpn(self):
        """bool: True when an ``rpn_head`` attribute exists and is not None."""
        return getattr(self, 'rpn_head', None) is not None

    @property
    def with_roi_head(self):
        """bool: True when a ``roi_head`` attribute exists and is not None."""
        return getattr(self, 'roi_head', None) is not None

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Placeholder for test-time augmentation.

        No augmentation logic is implemented: all arguments are ignored and
        ``None`` is returned.
        """
        return None

    def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
        """Collect the thing segments of a panoptic prediction.

        Args:
            panoptic_seg: Panoptic id map; compared element-wise against each
                segment's ``"id"`` to obtain its mask.
            seg_infos (list[dict]): Segment descriptions. Entries whose
                ``'isthing'`` equals True must provide ``"id"``,
                ``"instance_id"``, ``'category_id'`` and ``'score'``.

        Returns:
            tuple[list, list, list, list]: Instance ids, category ids, masks
                and scores of the thing segments, in input order.
        """
        things = [seg for seg in seg_infos if seg['isthing'] == True]

        idxs = [seg["instance_id"] for seg in things]
        labels = [seg['category_id'] for seg in things]
        masks = [panoptic_seg == seg["id"] for seg in things]
        score = [seg['score'] for seg in things]
        return idxs, labels, masks, score


    def pack_things_object(self, object_feats, ref_object_feats):
        """Keep the thing part of key and reference object features.

        The two trailing dimensions of both inputs are squeezed, then dim 1 is
        split into ``[roi_head.num_proposals, num_stuff_classes]`` and only
        the first chunk is returned.

        Returns:
            tuple: ``(thing_object_feats, ref_thing_object_feats)``.
        """
        chunk_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]

        key_flat = object_feats.squeeze(-1).squeeze(-1)
        ref_flat = ref_object_feats.squeeze(-1).squeeze(-1)

        key_things, _ = torch.split(key_flat, chunk_sizes, dim=1)
        ref_things, _ = torch.split(ref_flat, chunk_sizes, dim=1)
        return key_things, ref_things

    def pack_things_masks(self, mask_pred, ref_mask_pred):
        """Keep the thing part of key and reference mask predictions.

        Dim 1 of both inputs is split into
        ``[roi_head.num_proposals, num_stuff_classes]`` and only the first
        chunk is returned.

        Returns:
            tuple: ``(thing_mask_pred, ref_thing_mask_pred)``.
        """
        chunk_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
        key_things, _ = torch.split(mask_pred, chunk_sizes, dim=1)
        ref_things, _ = torch.split(ref_mask_pred, chunk_sizes, dim=1)
        return key_things, ref_things

    def get_semantic_seg(self, panoptic_seg, segments_info):
        """Turn a panoptic prediction into a semantic label map.

        Label mapping per segment:

        * thing, ``kitti_step``: ``[11, 13][category_id]``
        * thing, otherwise: ``category_id + 11``
        * stuff, ``kitti_step``: ``category_id - 1``, then shifted upwards so
          that labels 11 and 13 (taken by the things) are skipped
        * stuff, otherwise: ``category_id - 1``

        Args:
            panoptic_seg: Panoptic id map; its ``shape`` defines the output
                shape and it is compared against each segment's ``"id"``.
            segments_info (list[dict]): Entries with ``'isthing'``, ``"id"``
                and ``"category_id"``.

        Returns:
            np.ndarray: Array from ``np.zeros(panoptic_seg.shape)`` filled
                with the mapped labels; pixels covered by no segment stay 0.
        """
        kitti_thing_labels = [11, 13]
        sem_map = np.zeros(panoptic_seg.shape)

        for seg in segments_info:
            if seg['isthing'] == True:
                if self.kitti_step:
                    label = kitti_thing_labels[seg["category_id"]]
                else:
                    label = seg["category_id"] + 11
            elif self.kitti_step:
                # for stuff (0- n-1): make room for the thing labels
                label = seg["category_id"] - 1
                shift = 0
                for thing_label in kitti_thing_labels:
                    if label + shift >= thing_label:
                        shift += 1
                label += shift
            else:
                label = seg["category_id"] - 1

            sem_map[panoptic_seg == seg["id"]] = label
        return sem_map

    def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
        """Paint track ids into a map shaped like the panoptic map.

        Args:
            ids: Sequence of track ids, one per mask.
            masks: Stacked masks indexed along the first dimension; must
                offer ``.bool()`` and is not touched when ``ids`` is empty.
            panopitc_seg_maps: Only its ``shape`` is used.

        Returns:
            np.ndarray: Zero-initialised array in which the pixels of the
                i-th mask hold ``ids[i]``; later masks overwrite earlier ones.
        """
        track_map = np.zeros(panopitc_seg_maps.shape)
        if len(ids) == 0:
            return track_map

        # assert len(things_mask_results) == len(track_results)
        bool_masks = masks.bool()
        for pos, track_id in enumerate(ids):
            track_map[bool_masks[pos].cpu().numpy()] = track_id
        return track_map

================================================
FILE: knet/video/knet_quansi_dense_roi_gt_box_joint_train.py
================================================
import warnings
import numpy as np
import torch
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import BaseDetector
from mmdet.models.builder import build_head, build_neck, build_backbone, build_roi_extractor
from mmdet.core import build_assigner, build_sampler
from knet.video.qdtrack.builder import build_tracker
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes, sem2ins_masks_kitti_step
from unitrack.mask import tensor_mask2box
from unitrack.utils.mask import mask2box, batch_mask2boxlist, bboxlist2roi

# RoI box based Video K-Net baseline.
@DETECTORS.register_module()
class VideoKNetQuansiTrackROIGTBoxJointTrain(BaseDetector):
    """
        Simple Extension of KNet to Video KNet by the implementation of VPSFuse Net.
    """
    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 track_head=None,
                 extra_neck=None,
                 track_localization_fpn=None,
                 tracker=None,
                 train_cfg=None,
                 test_cfg=None,
                 track_train_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cityscapes=False,
                 kitti_step=False,
                 freeze_detector=False,
                 semantic_filter=False,
                 # linking parameters
                 link_previous=False,
                 bbox_roi_extractor=dict(
                     type='SingleRoIExtractor',
                     roi_layer=dict(
                         type='RoIAlign', output_size=7, sampling_ratio=2),
                     out_channels=256,
                     featmap_strides=[4, 8, 16, 32]),
                 **kwargs):
        """Build the sub-modules of the detector from their configs.

        Args:
            backbone: Config handed to ``build_backbone``.
            neck: Optional config handed to ``build_neck``.
            rpn_head: Optional config for the RPN head. It is copied and
                extended with ``train_cfg.rpn`` / ``test_cfg.rpn``.
                NOTE(review): ``self.rpn_head.num_proposals`` is read
                unconditionally below, so ``None`` does not appear to be
                usable in practice.
            roi_head: Optional config for the RoI head. It is updated in place
                with ``train_cfg.rcnn`` / ``test_cfg.rcnn`` and ``pretrained``.
            track_head: Optional config for the track head. When given, the
                track assigner/sampler and the track RoI extractor are built
                as well.
            extra_neck: Optional config for an additional neck.
            track_localization_fpn: Optional neck config, only built when
                ``track_head`` is given.
            tracker: Optional tracker config; only stored here and turned into
                a tracker by ``init_tracker``.
            train_cfg: Training config; ``.rpn`` and ``.rcnn`` are read when
                it is not None.
            test_cfg: Testing config; ``.rpn`` / ``.rcnn`` are read whenever
                the corresponding head is configured.
            track_train_cfg: Config providing ``.assigner`` and ``.sampler``
                for the tracking branch.
            pretrained: Deprecated; triggers a warning and is copied onto the
                backbone config.
            init_cfg: Forwarded to the base class constructor.
            num_thing_classes (int): Number of thing classes.
            num_stuff_classes (int): Number of stuff classes.
            mask_assign_stride (int): The padded input size is integer-divided
                by this value to obtain the ground-truth mask size.
            ignore_label (int): Value written into padded regions of the
                semantic ground truth.
            thing_label_in_seg: Forwarded to ``sem2ins_masks``.
            cityscapes (bool): Use the Cityscapes semantic-to-instance helper.
            kitti_step (bool): Use the KITTI-STEP semantic-to-instance helper.
            freeze_detector (bool): Freeze the RPN and RoI heads.
            semantic_filter (bool): Stored flag, read by ``simple_test``.
            link_previous (bool): Stored flag selecting the
                ``*_with_previous`` RoI head entry points.
            bbox_roi_extractor (dict): Config of the RoI extractor used by the
                tracking branch.
            **kwargs: Accepted but not used in this constructor.
        """
        super(VideoKNetQuansiTrackROIGTBoxJointTrain, self).__init__(init_cfg)

        if pretrained:
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            backbone.pretrained = pretrained
        self.backbone = build_backbone(backbone)

        if neck is not None:
            self.neck = build_neck(neck)

        if extra_neck is not None:
            self.extra_neck = build_neck(extra_neck)

        if rpn_head is not None:
            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
            # Work on a copy so the caller's rpn_head config is not modified.
            rpn_head_ = rpn_head.copy()
            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
            self.rpn_head = build_head(rpn_head_)

        if roi_head is not None:
            # update train and test cfg here for now
            # TODO: refactor assigner & sampler
            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
            # Unlike rpn_head, the roi_head config is updated in place.
            roi_head.update(train_cfg=rcnn_train_cfg)
            roi_head.update(test_cfg=test_cfg.rcnn)
            roi_head.pretrained = pretrained
            self.roi_head = build_head(roi_head)

        if track_head is not None:
            self.track_train_cfg = track_train_cfg
            self.track_head = build_head(track_head)
            self.init_track_assigner_sampler()
            if track_localization_fpn is not None:
                self.track_localization_fpn = build_neck(track_localization_fpn)

            self.track_roi_extractor = build_roi_extractor(
                bbox_roi_extractor)

        if tracker is not None:
            # The tracker object itself is created later by init_tracker().
            self.tracker_cfg = tracker

        if freeze_detector:
           self._freeze_detector()

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.num_proposals = self.rpn_head.num_proposals
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation
        self.kitti_step = kitti_step  # whether to train the kitti step panoptic segmentation

        self.semantic_filter = semantic_filter
        self.link_previous = link_previous

    def init_tracker(self):
        """(Re)create ``self.tracker`` from the stored tracker config."""
        tracker_cfg = self.tracker_cfg
        self.tracker = build_tracker(tracker_cfg)

    def _freeze_detector(self):

        self.detector = [
            self.rpn_head, self.roi_head
        ]
        for model in self.detector:
            model.eval()
            for param in model.parameters():
                param.requires_grad = False

    def init_track_assigner_sampler(self):
        """Build the assigner and sampler of the tracking branch.

        Both are created from ``self.track_train_cfg`` (``.assigner`` and
        ``.sampler``); the sampler receives this detector as ``context``.
        The two ``track_share_*`` flags are set to False.
        """
        track_cfg = self.track_train_cfg

        self.track_roi_assigner = build_assigner(track_cfg.assigner)
        self.track_share_assigner = False

        self.track_roi_sampler = build_sampler(track_cfg.sampler, context=self)
        self.track_share_sampler = False

    def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad ground-truth masks to the batch shape and resize them.

        Instance masks are converted to float tensors, zero-padded on the
        right/bottom to ``img_metas[0]['batch_input_shape']`` and bilinearly
        resized to that shape integer-divided by ``mask_assign_stride``.
        When semantic ground truth is given, its padded area is overwritten
        with ``ignore_label`` (in place), it is converted by the
        dataset-specific ``sem2ins_masks*`` helper and resized the same way.

        Args:
            img_metas (list[dict]): Per-image meta dicts; ``'img_shape'`` is
                read for every image and ``'batch_input_shape'`` from the
                first one.
            gt_masks (list): Per-image mask containers offering ``width``,
                ``height`` and ``to_tensor(dtype, device)``.
            gt_labels (list[Tensor]): Only ``gt_labels[0].device`` is used.
            gt_semantic_seg (Tensor | None): Batched semantic ground truth,
                modified in place.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The last two
                are ``None`` when ``gt_semantic_seg`` is ``None`` and at least
                one image was processed.
        """
        # batch_input_shape shoud be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        stride = self.mask_assign_stride
        assign_hw = (pad_H // stride, pad_W // stride)

        def _resize(masks, like):
            # An empty stack cannot be interpolated, so return zeros whose
            # leading size follows `like`, exactly as the inline code did.
            if masks.shape[0] == 0:
                return like.new_zeros((like.size(0), ) + assign_hw)
            return F.interpolate(
                masks[None], assign_hw, mode='bilinear',
                align_corners=False)[0]

        resized_masks = []
        resized_sem_seg = []
        sem_classes = []

        for idx, inst_masks in enumerate(gt_masks):
            inst_tensor = inst_masks.to_tensor(torch.float, gt_labels[0].device)
            # gt_masks and gt_semantic_seg are not padded when forming batch
            if inst_masks.width != pad_W or inst_masks.height != pad_H:
                padding = (0, pad_W - inst_masks.width,
                           0, pad_H - inst_masks.height)
                inst_tensor = F.pad(inst_tensor, padding, value=0)

            if gt_semantic_seg is None:
                resized_sem_seg = None
                sem_classes = None
            else:
                # gt_semantic seg is padded by zero when forming a batch
                # need to convert them from 0 to ignore
                gt_semantic_seg[
                    idx, :, img_metas[idx]['img_shape'][0]:, :] = self.ignore_label
                gt_semantic_seg[
                    idx, :, :, img_metas[idx]['img_shape'][1]:] = self.ignore_label

                if self.cityscapes:
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes)
                elif self.kitti_step:
                    sem_labels, sem_seg = sem2ins_masks_kitti_step(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=2,
                        thing_label_in_seg=(11, 13))
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                resized_sem_seg.append(_resize(sem_seg, inst_tensor))
                sem_classes.append(sem_labels)

            resized_masks.append(_resize(inst_tensor, inst_tensor))

        return resized_masks, sem_classes, resized_sem_seg

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      ref_gt_instance_ids=None,
                      proposals=None,
                      **kwargs):
        """Compute segmentation and tracking losses for a key/reference pair.

        Both frames go through the backbone, the RPN head and the RoI head.
        The reference frame always uses ``roi_head.forward_train``; the key
        frame uses ``forward_train_with_previous`` (fed with the reference
        outputs) when ``link_previous`` is set. Positive samples of both
        frames are then embedded with ``_track_forward`` and matched by the
        track head.

        Args:
            img (Tensor): Key images; the last two dimensions of ``img[0]``
                define the batch input shape.
            img_metas (list[dict]): Meta dict per key image. The key
                ``'batch_input_shape'`` is written into each dict.
            gt_bboxes (list[Tensor], optional): Only forwarded to the RoI
                head of the key frame.
            gt_labels (list[Tensor]): Class indices per key image.
            gt_bboxes_ignore (list[Tensor], optional): Only forwarded to the
                RoI head of the key frame.
            gt_masks (list): Instance masks per key image. Required.
            gt_semantic_seg (Tensor, optional): Semantic ground truth of the
                key images, see ``preprocess_gt_masks``.
            gt_instance_ids (list[Tensor]): Instance ids per key image.
                Required.
            ref_img (Tensor): Reference images with a singleton dimension at
                index 1, which is squeezed away.
            ref_img_metas (list[list[dict]]): Per key image a list of
                reference meta dicts; only the first entry of each inner list
                is used and receives ``'batch_input_shape'``.
            ref_gt_bboxes_ignore: Not used.
            ref_gt_labels (list[Tensor]): Per image a 2-D tensor whose column
                1 holds the class index.
            ref_gt_bboxes: Not used.
            ref_gt_masks (list): Per image a sequence whose first element is
                the mask container of the reference frame.
            ref_gt_semantic_seg (Tensor): Reference semantic ground truth with
                a singleton dimension at index 1.
            ref_gt_instance_ids (list[Tensor]): Per image a 2-D tensor whose
                column 1 holds the instance id.
            proposals: Must be ``None``; external proposals are unsupported.
            **kwargs: Not used.

        Returns:
            dict[str, Tensor]: Key-frame RoI losses updated with the renamed
                reference RPN losses, the key RPN losses, the renamed
                reference RoI losses and the tracking losses.
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None
        assert gt_instance_ids is not None

        # ----- unpack the reference-frame inputs -----
        ref_img = ref_img.squeeze(1)  # (b,3,h,w)
        img_h, img_w = batch_input_shape

        ref_masks_gt = [ref_gt_mask[0] for ref_gt_mask in ref_gt_masks]
        # Column 1 carries the payload of the reference annotations.
        ref_gt_labels = [
            ref_gt_label[:, 1].long() for ref_gt_label in ref_gt_labels
        ]
        ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)
        ref_gt_instance_id_list = [
            ref_gt_instance_id[:, 1].long()
            for ref_gt_instance_id in ref_gt_instance_ids
        ]

        # Flatten list[list[dict]] into one meta dict per reference image.
        ref_img_metas_new = []
        for ref_img_meta in ref_img_metas:
            ref_img_meta[0]['batch_input_shape'] = batch_input_shape
            ref_img_metas_new.append(ref_img_meta[0])

        # For every key instance, the position of the same instance id in the
        # reference frame, or -1 when it does not appear there.
        gt_match_indices = []
        for i in range(len(ref_gt_instance_id_list)):
            ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist()
            key_ids = gt_instance_ids[i].cpu().data.numpy().tolist()
            gt_pids = [
                ref_ids.index(key_id) if key_id in ref_ids else -1
                for key_id in key_ids
            ]
            gt_match_indices.append(
                torch.tensor(gt_pids, dtype=torch.long, device=img.device))

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(
            img_metas, gt_masks, gt_labels, gt_semantic_seg)

        ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(
            ref_img_metas_new, ref_masks_gt, ref_gt_labels,
            ref_semantic_seg_gt)

        x = self.extract_feat(img)
        x_ref = self.extract_feat(ref_img)

        # current frame
        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)

        # reference frame
        ref_rpn_results = self.rpn_head.forward_train(
            x_ref, ref_img_metas_new, ref_gt_masks, ref_gt_labels,
            ref_gt_sem_seg, ref_gt_sem_cls)

        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        (ref_rpn_losses, ref_proposal_feats, ref_x_feats, ref_mask_preds,
         ref_cls_scores) = ref_rpn_results

        # The flattened metas are passed here: every other head call in this
        # method receives one dict per image, not the nested reference list.
        losses_ref, ref_obj_feats, ref_cls_scores, ref_mask_preds, ref_scaled_mask_preds = self.roi_head.forward_train(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
            ref_gt_masks,
            ref_gt_labels,
            gt_sem_seg=ref_gt_sem_seg,
            gt_sem_cls=ref_gt_sem_cls,
            imgs_whwh=None)

        if self.link_previous:
            # Key frame is conditioned on the reference-frame outputs.
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train_with_previous(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None,
                previous_obj_feats=ref_obj_feats,
                previous_mask_preds=ref_scaled_mask_preds,
                previous_x_feats=ref_x_feats,
            )
        else:
            # forward to get the current results
            losses, object_feats, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.forward_train(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                gt_masks,
                gt_labels,
                gt_bboxes_ignore=gt_bboxes_ignore,
                gt_bboxes=gt_bboxes,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls,
                imgs_whwh=None)

        # ===== Tracking Part -==== #
        # assign both key frame and reference frame tracking targets
        key_sampling_results, ref_sampling_results = [], []
        num_imgs = len(img_metas)

        for i in range(num_imgs):
            # Only the first num_proposals predictions and the thing class
            # scores take part in the assignment.
            assign_result = self.track_roi_assigner.assign(
                scaled_mask_preds[i][:self.num_proposals].detach(),
                cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                gt_masks[i], gt_labels[i], img_meta=img_metas[i])
            sampling_result = self.track_roi_sampler.sample(
                assign_result,
                mask_preds[i][:self.num_proposals].detach(),
                gt_masks[i])
            key_sampling_results.append(sampling_result)

            ref_assign_result = self.track_roi_assigner.assign(
                ref_scaled_mask_preds[i][:self.num_proposals].detach(),
                ref_cls_scores[i][:self.num_proposals, :self.num_thing_classes].detach(),
                ref_gt_masks[i], ref_gt_labels[i], img_meta=ref_img_metas_new[i])
            ref_sampling_result = self.track_roi_sampler.sample(
                ref_assign_result,
                ref_mask_preds[i][:self.num_proposals].detach(),
                ref_gt_masks[i])
            ref_sampling_results.append(ref_sampling_result)

        def _full_res_masks(sampling_results):
            # Upsample the matched ground-truth masks to the input resolution
            # and binarise them (sigmoid(v) > 0.5, i.e. v > 0).
            full_res = []
            for res in sampling_results:
                upsampled = F.interpolate(
                    res.pos_gt_masks.unsqueeze(0),
                    size=(img_h, img_w), mode="bilinear",
                    align_corners=False).squeeze(0)
                full_res.append((upsampled.sigmoid() > 0.5).float())
            return full_res

        # roi feature embeddings
        key_feats = self._track_forward(
            x, _full_res_masks(key_sampling_results))
        ref_feats = self._track_forward(
            x_ref, _full_res_masks(ref_sampling_results))

        match_feats = self.track_head.match(key_feats, ref_feats,
                                            key_sampling_results,
                                            ref_sampling_results)

        asso_targets = self.track_head.get_track_targets(
            gt_match_indices, key_sampling_results, ref_sampling_results)
        loss_track = self.track_head.loss(*match_feats, *asso_targets)

        losses_ref = self.add_ref_loss(losses_ref)
        ref_rpn_losses = self.add_ref_rpn_loss(ref_rpn_losses)

        losses.update(ref_rpn_losses)
        losses.update(rpn_losses)
        losses.update(losses_ref)
        losses.update(loss_track)

        return losses

    def simple_test(self, img, img_metas, rescale=False, ref_img=None, **kwargs):
        """Test function without test time augmentation.

        Args:
            imgs (list[torch.Tensor]): List of multiple images
            img_metas (list[dict]): List of image information.
            rescale (bool): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[list[np.ndarray]]: BBox results of each image and classes.
                The outer list corresponds to each image. The inner list
                corresponds to each class.
        """

        # whether is the first frame for such clips
        # assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0]
        if "city" in img_metas[0]['filename']:
            iid = img_metas[0]['iid']
            fid = iid % 10000
            is_first = (fid == 1)
        else:
            iid = kwargs['img_id'][0].item()
            fid = iid % 10000
            is_first = (fid == 0)

        if is_first:
            self.init_tracker()
            self.obj_feats_memory = None
            self.x_feats_memory = None
            self.mask_preds_memory = None

        # for current frame
        x = self.extract_feat(img)
        # current frame inference
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results

        if self.link_previous:
            cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test_with_previous(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                previous_obj_feats=self.obj_feats_memory,
                previous_mask_preds=self.mask_preds_memory,
                previous_x_feats=self.x_feats_memory,
            )
            self.obj_feats_memory = query_output
            self.x_feats_memory = x_feats
            self.mask_preds_memory = scaled_mask_preds
        else:
            cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas)

        # for tracking part
        _, segm_result, mask_preds, panoptic_result = cur_segm_results[0]
        panoptic_seg, segments_info = panoptic_result

        if self.semantic_filter:
            seg_preds = torch.nn.functional.interpolate(seg_preds, panoptic_seg.shape, mode='bilinear', align_corners=False)
            seg_preds = seg_preds.sigmoid()
            seg_out = seg_preds.argmax(1)
            semantic_thing = (seg_out < self.num_thing_classes).to(dtype=torch.float32)
        else:
            semantic_thing = 1.

        # get sorted tracking thing ids, labels, masks, score for tracking
        things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \
            self.get_things_id_for_tracking(panoptic_seg, segments_info)
        things_labels_for_tracking = torch.Tensor(things_labels_for_tracking).to(cls_scores.device).long()
        if len(things_labels_for_tracking) > 0:
            things_bbox_for_tracking = torch.zeros((len(things_score_for_tracking), 5),
                                                   dtype=torch.float, device=x_feats.device)
            things_bbox_for_tracking[:, 4] = torch.tensor(things_score_for_tracking,
                                                          device=things_bbox_for_tracking.device)

            thing_masks_for_tracking_final = []
            for mask in thing_masks_for_tracking:
                thing_masks_for_tracking_final.append(torch.Tensor(mask).unsqueeze(0).to(
                    x_feats.device).float())
            thing_masks_for_tracking_final = torch.cat(thing_masks_for_tracking_final, 0)
            thing_masks_for_tracking = thing_masks_for_tracking_final
            thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing

        if len(things_labels_for_tracking) == 0:
            track_feats = None
        else:
            # tracking embedding features
            track_feats = self._track_forward(x, thing_masks_for_tracking_with_semantic_filter)

        if track_feats is not None:
            # assert len(things_id_for_tracking) == len(things_labels_for_tracking)
            things_bbox_for_tracking[:, :4] = torch.tensor(tensor_mask2box(thing_masks_for_tracking_with_semantic_filter),
                                                           device=things_bbox_for_tracking.device)
            bboxes, labels, ids = self.tracker.match(
                bboxes=things_bbox_for_tracking,
                labels=things_labels_for_tracking,
                track_feats=track_feats,
                frame_id=fid)
            ids = ids + 1
            ids[ids == -1] = 0
        else:
            ids = []

        track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking, panoptic_seg)

        semantic_map = self.get_semantic_seg(panoptic_seg, segments_info)

        from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img
        vis_tracker = trackmap2rgb(track_maps)
        vis_sem = cityscapes_cat2rgb(semantic_map)
        if len(things_labels_for_tracking):
            vis_tracker = draw_bbox_on_img(vis_tracker, things_bbox_for_tracking.cpu().numpy())

        # Visualization end
        return semantic_map, track_maps, None, vis_sem, vis_tracker


    def _track_forward(self, x, mask_pred):
        """Compute tracking embeddings for a set of instance masks.

        Shared by the training and testing paths. The masks are converted to
        boxes, RoI features are pooled from ``x`` for those boxes, and the
        pooled features are passed through ``self.track_head``.

        Args:
            x: Sequence of feature maps; only the first
                ``self.track_roi_extractor.num_inputs`` entries are used.
            mask_pred: Instance masks. Outside training mode the value is
                wrapped in a one-element list before box conversion
                (presumably one entry per image -- confirm against
                ``batch_mask2boxlist``).

        Returns:
            The output of ``self.track_head`` on the pooled RoI features.
        """
        per_image_masks = mask_pred if self.training else [mask_pred]
        rois = bboxlist2roi(batch_mask2boxlist(per_image_masks))
        # Negative values are clipped to zero before pooling.
        rois = rois.clamp(min=0.0)
        num_levels = self.track_roi_extractor.num_inputs
        roi_feats = self.track_roi_extractor(x[:num_levels], rois)
        return self.track_head(roi_feats)

    def forward_dummy(self, img):
        """Run a loss-free forward pass, e.g. for counting network FLOPs.

        See `mmdetection/tools/get_flops.py`

        Args:
            img: Batch of inputs; ``len(img)`` decides how many placeholder
                meta dicts are created.

        Returns:
            Whatever ``self.roi_head.forward_dummy`` returns.
        """
        # backbone (+ neck)
        feats = self.extract_feat(img)
        # Every image gets the same placeholder shape.
        fake_metas = [
            {'img_shape': (800, 1333, 3)} for _ in range(len(img))
        ]
        # rpn: only the proposal and feature outputs are needed afterwards
        proposal_feats, x_feats, _masks, _scores, _segs = \
            self.rpn_head.simple_test_rpn(feats, fake_metas)
        return self.roi_head.forward_dummy(x_feats, proposal_feats,
                                           fake_metas)

    def extract_feat(self, img):
        """Run the backbone, then the neck when one is configured.

        Args:
            img: Input handed directly to ``self.backbone``.

        Returns:
            The neck output if ``self.with_neck`` is truthy, otherwise the
            raw backbone output.
        """
        feats = self.backbone(img)
        if not self.with_neck:
            return feats
        return self.neck(feats)

    @property
    def with_rpn(self):
        """bool: True when an ``rpn_head`` attribute exists and is not None."""
        return getattr(self, 'rpn_head', None) is not None

    @property
    def with_roi_head(self):
        """bool: True when a ``roi_head`` attribute exists and is not None."""
        return getattr(self, 'roi_head', None) is not None

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Placeholder for test-time augmentation, which is not implemented.

        All arguments are accepted and ignored.

        Returns:
            None, always.
        """
        return None

    def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
        """Collect the thing segments of a panoptic result for tracking.

        Args:
            panoptic_seg: Panoptic id map; compared element-wise with each
                segment's ``"id"`` to build that segment's mask.
            seg_infos (list[dict]): Segment records. Entries whose
                ``'isthing'`` equals True must provide ``"id"``,
                ``"instance_id"``, ``'category_id'`` and ``'score'``.

        Returns:
            tuple: ``(idxs, labels, masks, score)`` -- four parallel lists
            with one entry per thing segment, in input order.
        """
        # ``== True`` is kept on purpose so that exactly the same segments
        # are selected as before (plain truthiness would accept more values).
        things = [seg for seg in seg_infos if seg['isthing'] == True]
        masks = [panoptic_seg == seg["id"] for seg in things]
        idxs = [seg["instance_id"] for seg in things]
        labels = [seg['category_id'] for seg in things]
        score = [seg['score'] for seg in things]
        return idxs, labels, masks, score


    def pack_things_object(self, object_feats, ref_object_feats):
        """Keep only the proposal (thing) part of two query-feature tensors.

        Each input has its two trailing dimensions squeezed and is then split
        along dim 1 into ``[num_proposals, num_stuff_classes]`` chunks; the
        first chunk of each is returned.

        Args:
            object_feats: Query features of the key frame.
            ref_object_feats: Query features of the reference frame.

        Returns:
            tuple: ``(thing_object_feats, ref_thing_object_feats)``.
        """
        chunk_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]

        def _thing_part(feats):
            flat = feats.squeeze(-1).squeeze(-1)
            return torch.split(flat, chunk_sizes, dim=1)[0]

        return _thing_part(object_feats), _thing_part(ref_object_feats)

    def pack_things_masks(self, mask_pred, ref_mask_pred):
        """Keep only the proposal (thing) part of two mask-prediction tensors.

        Each input is split along dim 1 into
        ``[num_proposals, num_stuff_classes]`` chunks and the first chunk is
        returned.

        Args:
            mask_pred: Mask predictions of the key frame.
            ref_mask_pred: Mask predictions of the reference frame.

        Returns:
            tuple: ``(thing_mask_pred, ref_thing_mask_pred)``.
        """
        chunk_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
        thing_masks, _ = torch.split(mask_pred, chunk_sizes, dim=1)
        ref_thing_masks, _ = torch.split(ref_mask_pred, chunk_sizes, dim=1)
        return thing_masks, ref_thing_masks

    def get_semantic_seg(self, panoptic_seg, segments_info):
        """Convert a panoptic result into a per-pixel semantic label map.

        Label rules, as implemented below:
            * thing, ``self.kitti_step`` falsy:  ``category_id + 11``
            * thing, ``self.kitti_step`` truthy: ``[11, 13][category_id]``
            * stuff, ``self.kitti_step`` falsy:  ``category_id - 1``
            * stuff, ``self.kitti_step`` truthy: ``category_id - 1``, then
              shifted up once for each of the ids 11 and 13 that it reaches,
              so stuff labels never land on those two ids.

        Args:
            panoptic_seg: Panoptic id map; compared element-wise with each
                segment's ``"id"``.
            segments_info (list[dict]): Records with ``'isthing'``,
                ``"id"`` and ``"category_id"``.

        Returns:
            np.ndarray: Float array with the shape of ``panoptic_seg``;
            pixels not covered by any segment stay 0.
        """
        # Label ids reserved for the two KITTI-STEP thing classes
        # (presumably their Cityscapes ids -- TODO confirm).
        kitti_thing_ids = [11, 13]
        sem_map = np.zeros(panoptic_seg.shape)
        for seg in segments_info:
            is_thing = seg['isthing'] == True
            category = seg["category_id"]
            if is_thing:
                if self.kitti_step:
                    label = kitti_thing_ids[category]
                else:
                    label = category + 11
            elif self.kitti_step:
                # stuff ids are 1-based; skip over the reserved thing ids
                label = category - 1
                shift = 0
                for reserved in kitti_thing_ids:
                    if label + shift >= reserved:
                        shift += 1
                label += shift
            else:
                # for stuff (0 .. n-1)
                label = category - 1
            sem_map[panoptic_seg == seg["id"]] = label
        return sem_map

    def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
        """Paint each instance mask with its track id.

        Args:
            ids: Track ids, one per mask. An empty sequence yields an
                all-zero map and ``masks`` is then not touched.
            masks: Tensor of instance masks indexed along dim 0; converted
                with ``.bool()`` before use.
            panopitc_seg_maps: Only its ``shape`` is used, to size the
                output.

        Returns:
            np.ndarray: Float map in which the pixels of mask ``i`` hold
            ``ids[i]``; where masks overlap the later one wins.
        """
        id_map = np.zeros(panopitc_seg_maps.shape)
        # ``len`` instead of truthiness: the emptiness check must also work
        # when ``ids`` is not a plain list.
        if len(ids) != 0:
            bool_masks = masks.bool()
            for row, track_id in enumerate(ids):
                id_map[bool_masks[row].cpu().numpy()] = track_id
        return id_map

    def add_ref_loss(self, loss_dict):
        """Return a new dict whose keys are ``str(key) + "_ref"``.

        Args:
            loss_dict (dict): Mapping of loss names to values.

        Returns:
            dict: Same values, renamed keys; the input is not modified.
        """
        return {str(name) + "_ref": value for name, value in loss_dict.items()}

    def add_ref_rpn_loss(self, loss_dict):
        """Return a new dict whose keys are ``str(key) + "_ref_rpn"``.

        Args:
            loss_dict (dict): Mapping of loss names to values.

        Returns:
            dict: Same values, renamed keys; the input is not modified.
        """
        return {
            str(name) + "_ref_rpn": value
            for name, value in loss_dict.items()
        }

================================================
FILE: knet/video/knet_track_head.py
================================================
import warnings
import numpy as np
import torch
import torch.nn.functional as F
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import BaseDetector
from mmdet.models.builder import build_head, build_neck, build_backbone
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes


@DETECTORS.register_module()
class VideoKNetFuseTrack(BaseDetector):
    """
        Simple Extension of KNet to Video KNet by the implementation of VPSFuse Net.
    """
    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 track_head=None,
                 extra_neck=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cityscapes=False,
                 **kwargs):
        """Build the detector's sub-modules from their configs.

        Args:
            backbone: Config passed to ``build_backbone``.
            neck: Optional config passed to ``build_neck``; stored as
                ``self.neck``.
            rpn_head: Optional head config. A copy is updated with
                ``train_cfg.rpn`` / ``test_cfg.rpn`` before ``build_head``.
            roi_head: Optional head config. It is updated IN PLACE with
                ``train_cfg.rcnn`` / ``test_cfg.rcnn`` and ``pretrained``
                before ``build_head``.
            track_head: Optional config passed to ``build_head``.
            extra_neck: Optional config passed to ``build_neck``; stored as
                ``self.extra_neck``.
            train_cfg: Training config; may be None.
            test_cfg: Testing config. ``test_cfg.rpn`` / ``test_cfg.rcnn``
                are read unconditionally whenever ``rpn_head`` /
                ``roi_head`` is given, so it must not be None in that case.
            pretrained: Deprecated. When truthy a warning is issued and the
                value is written to ``backbone.pretrained``.
            init_cfg: Forwarded to the base-class constructor.
            num_thing_classes: Stored; used as ``label_shift`` in
                ``preprocess_gt_masks``.
            num_stuff_classes: Stored; used as the second split size in
                ``pack_things_object``.
            mask_assign_stride: Stored; divides the padded input size in
                ``preprocess_gt_masks``.
            ignore_label: Stored; written into the padded area of the
                semantic ground truth in ``preprocess_gt_masks``.
            thing_label_in_seg: Stored; forwarded to ``sem2ins_masks``.
            cityscapes: Stored; selects ``sem2ins_masks_cityscapes`` over
                ``sem2ins_masks`` in ``preprocess_gt_masks``.
            **kwargs: Accepted and ignored.
        """
        super(VideoKNetFuseTrack, self).__init__(init_cfg)

        if pretrained:
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            backbone.pretrained = pretrained
        self.backbone = build_backbone(backbone)

        # neck / extra_neck attributes only exist when a config was given
        if neck is not None:
            self.neck = build_neck(neck)

        if extra_neck is not None:
            self.extra_neck = build_neck(extra_neck)

        if rpn_head is not None:
            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
            # work on a copy so the caller's rpn_head config stays untouched
            rpn_head_ = rpn_head.copy()
            # NOTE(review): test_cfg is dereferenced without a None check
            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
            self.rpn_head = build_head(rpn_head_)

        if roi_head is not None:
            # update train and test cfg here for now
            # TODO: refactor assigner & sampler
            # NOTE(review): unlike rpn_head, roi_head is modified in place,
            # and test_cfg is again dereferenced without a None check
            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
            roi_head.update(train_cfg=rcnn_train_cfg)
            roi_head.update(test_cfg=test_cfg.rcnn)
            roi_head.pretrained = pretrained
            self.roi_head = build_head(roi_head)

        if track_head is not None:
            self.track_head = build_head(track_head)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation

    def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad and downsample ground-truth masks to the mask-assignment size.

        Instance masks are zero-padded to ``batch_input_shape`` and resized
        bilinearly to ``batch_input_shape // mask_assign_stride``. When a
        semantic map is given, its padded border is overwritten IN PLACE with
        ``self.ignore_label`` and it is converted to per-class masks that are
        resized the same way.

        Args:
            img_metas (list[dict]): Per-image meta. ``'batch_input_shape'``
                is read from the first entry, ``'img_shape'`` from each.
            gt_masks: Per-image mask objects providing ``to_tensor``,
                ``width`` and ``height``.
            gt_labels (list[Tensor]): Only ``gt_labels[0].device`` is used,
                as the target device of the mask tensors.
            gt_semantic_seg: Batched semantic map, or None.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The first
            is a list of resized mask tensors; the other two are lists, or
            None when ``gt_semantic_seg`` is None and at least one image was
            processed.
        """
        # gt_masks and gt_semantic_seg are not padded when forming batch;
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_size = (pad_H // self.mask_assign_stride,
                       pad_W // self.mask_assign_stride)

        def _to_assign_size(stack, like):
            """Resize ``stack``; an empty stack becomes zeros sized by ``like``."""
            if stack.shape[0] == 0:
                return like.new_zeros((like.size(0),) + assign_size)
            # downsample to 1/stride resolution
            return F.interpolate(
                stack[None], assign_size, mode='bilinear',
                align_corners=False)[0]

        resized_masks, sem_segs, sem_classes = [], [], []
        for idx, gt_mask in enumerate(gt_masks):
            mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
            if not (gt_mask.width == pad_W and gt_mask.height == pad_H):
                # pad on the right and bottom only
                padding = (0, pad_W - gt_mask.width,
                           0, pad_H - gt_mask.height)
                mask_tensor = F.pad(mask_tensor, padding, value=0)

            if gt_semantic_seg is None:
                sem_segs = None
                sem_classes = None
            else:
                # gt_semantic seg is padded by zero when forming a batch
                # need to convert them from 0 to ignore
                img_shape = img_metas[idx]['img_shape']
                gt_semantic_seg[idx, :, img_shape[0]:, :] = self.ignore_label
                gt_semantic_seg[idx, :, :, img_shape[1]:] = self.ignore_label
                if self.cityscapes:
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes)
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)
                sem_segs.append(_to_assign_size(sem_seg, mask_tensor))
                sem_classes.append(sem_labels)

            resized_masks.append(_to_assign_size(mask_tensor, mask_tensor))

        return resized_masks, sem_classes, sem_segs

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      ref_gt_instance_ids=None,
                      proposals=None,
                      **kwargs):
        """Compute the training losses for a key frame and a reference frame.

        Both frames go through the backbone/neck, the RPN head and the RoI
        head. The proposal (thing) query features of the two frames are then
        matched by ``self.track_head`` to obtain the tracking loss.

        Args:
            img (Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): list of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                :class:`mmdet.datasets.pipelines.Collect`.
                A 'batch_input_shape' entry is written into every dict here.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
                Only forwarded to the RoI head.
            gt_labels (list[Tensor]): class indices corresponding to each box
            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
                boxes can be ignored when computing the loss.
            gt_masks: Ground-truth instance masks of each image. Required
                (asserted to be not None).
            gt_semantic_seg: Semantic ground truth of the key frames;
                forwarded to ``preprocess_gt_masks``.
            gt_instance_ids: Per-image instance ids of the key-frame objects.
                Required (asserted to be not None).
            proposals: Must be None; external proposals are not supported.

            # This is for video only:
            ref_img (Tensor): Reference images. Dim 1 is squeezed, so the
                code expects a single reference frame per key image.
                NOTE(review): earlier documentation described the shape as
                (N, 2, C, H, W) with two reference images, which
                ``squeeze(1)`` would leave unchanged -- confirm with the data
                pipeline.

            ref_img_metas (list[list[dict]]): Per-sample list of reference
                image info; only element 0 of each inner list is used.
                See `mmtrack/datasets/pipelines/formatting.py:VideoCollect`.

            ref_gt_bboxes (list[Tensor]): Reference ground truth bboxes,
                presumably with shape (num_all_ref_gts, 5) in
                [ref_img_id, tl_x, tl_y, br_x, br_y] format. Only forwarded
                to the RoI head.

            ref_gt_labels (list[Tensor]): Per-sample tensor whose column 1
                holds the class index of each reference box (column 0 is
                presumably ref_img_id).

            ref_gt_masks: Per-sample sequence; element 0 holds the masks of
                the reference frame.
            ref_gt_semantic_seg: Reference semantic ground truth; dim 1 is
                squeezed.
            ref_gt_instance_ids: Per-sample tensor whose column 1 holds the
                instance id of each reference object.
            ref_gt_bboxes_ignore: Only forwarded to the RoI head.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None
        assert gt_instance_ids is not None

        # preprocess the reference images
        ref_img = ref_img.squeeze(1)  # (b,3,h,w)
        ref_masks_gt = []
        for ref_gt_mask in ref_gt_masks:
            ref_masks_gt.append(ref_gt_mask[0])

        # column 1 of each reference label tensor is the class index
        ref_labels_gt = []
        for ref_gt_label in ref_gt_labels:
            ref_labels_gt.append(ref_gt_label[:, 1].long())
        ref_gt_labels = ref_labels_gt

        ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)

        # column 1 of each reference instance-id tensor is the instance id
        ref_gt_instance_id_list = []
        for ref_gt_instance_id in ref_gt_instance_ids:
            ref_gt_instance_id_list.append(ref_gt_instance_id[:,1].long())

        # flatten list[list[dict]] to list[dict], keeping the first entry
        ref_img_metas_new = []
        for ref_img_meta in ref_img_metas:
            ref_img_meta[0]['batch_input_shape'] = batch_input_shape
            ref_img_metas_new.append(ref_img_meta[0])

        # gt_pids: for every key-frame object, the 1-based position of the
        # same instance id among the reference objects, or 0 when absent
        gt_pids_list =[]
        for i in range(len(ref_gt_instance_id_list)):
            ref_ids = ref_gt_instance_id_list[i].cpu().data.numpy().tolist()
            gt_ids = gt_instance_ids[i].cpu().data.numpy().tolist()
            gt_pids = [ref_ids.index(i) + 1 if i in ref_ids else 0 for i in gt_ids]
            gt_pids_list.append(torch.LongTensor([gt_pids]).to(img.device)[0])
        gt_pids = gt_pids_list

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg)

        ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new,
                                                                    ref_masks_gt, ref_labels_gt, ref_semantic_seg_gt)

        # shared backbone / rpn head for the key and the reference frame
        x = self.extract_feat(img)
        x_ref = self.extract_feat(ref_img)
        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)

        ref_rpn_results = self.rpn_head.forward_train(x_ref, ref_img_metas_new, ref_gt_masks,
                                                  ref_labels_gt, ref_gt_sem_seg,
                                                  ref_gt_sem_cls)

        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        (ref_rpn_losses, ref_proposal_feats, ref_x_feats, ref_mask_preds,
         ref_cls_scores) = ref_rpn_results

        # only the key-frame call receives gt_pids
        losses, sample_results, object_feats = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_pids=gt_pids,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

        # NOTE(review): this passes the nested ``ref_img_metas`` while the
        # RPN call above uses the flattened ``ref_img_metas_new`` -- confirm
        # which form the RoI head expects.
        ref_losses, ref_sample_results, ref_object_feats = self.roi_head.forward_train(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas,
            ref_gt_masks,
            ref_gt_labels,
            gt_bboxes=ref_gt_bboxes,
            gt_bboxes_ignore=ref_gt_bboxes_ignore,
            gt_sem_seg=ref_gt_sem_seg,
            gt_sem_cls=ref_gt_sem_cls,
            imgs_whwh=None)
        proposals_nums = [self.roi_head.num_proposals] * img.size()[0]
        ref_proposals_nums = proposals_nums

        # keep the proposal queries only, then match key vs. reference
        object_feats, ref_object_feats = self.pack_things_object(object_feats, ref_object_feats)
        match_score = self.track_head(object_feats, ref_object_feats, proposals_nums, ref_proposals_nums)
        track_loss = self.track_head.loss(match_score, sample_results)

        # format the loss: both reference dicts get the "_ref" key suffix
        ref_rpn_losses = self.add_ref_rpn_loss(ref_rpn_losses)
        ref_losses = self.add_ref_rpn_loss(ref_losses)

        # later updates overwrite earlier entries that share a key
        losses.update(ref_rpn_losses)
        losses.update(ref_losses)
        losses.update(track_loss)
        losses.update(rpn_losses)

        return losses

    def simple_test(self, img, img_metas, rescale=False, ref_img=None):
        """Test function without test time augmentation.

        NOTE(review): this method uses ``self.combine``,
        ``self.track_roi_head`` and ``self.tracker``, none of which is
        assigned in this class's ``__init__`` as visible in this file; unless
        they are provided elsewhere the method fails with AttributeError.

        Args:
            img (torch.Tensor): Current frame(s), passed to ``extract_feat``.
            img_metas (list[dict]): List of image information. The first
                entry must contain 'iid' and a 'filename' containing 'city'.
            rescale (bool): Whether to rescale the results.
                Defaults to False.
            ref_img: Optional reference input; element 0 is used. It is
                needed on every frame except the first one of a clip.

        Returns:
            tuple: ``(cur_segm_results, track_maps, sseg_results)`` -- the
            RoI-head results of the current frame, the track-id map from
            ``generate_track_id_maps`` and the semantic map from
            ``pack_stuff_things_result``.
        """

        if ref_img is not None:
            ref_img = ref_img[0]
        # whether is the first frame for such clips
        assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0]
        iid = img_metas[0]['iid']
        # frame index inside the clip; index 1 is treated as the first frame
        fid = iid % 10000
        is_first = (fid == 1)

        # for current frame
        x = self.extract_feat(img)
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results

        if not is_first:
            # NOTE(review): a None ``ref_img`` reaches extract_feat unchecked
            ref_x = self.extract_feat(ref_img)
            # the current frame's img_metas are reused for the reference
            ref_rpn_results = self.rpn_head.simple_test_rpn(ref_x, img_metas)
            (ref_proposal_feats, ref_x_feats, ref_mask_preds, ref_cls_scores,
             ref_seg_preds) = ref_rpn_results
            x_fuse = self.combine(ref_x_feats + x_feats)

        cur_segm_results, cur_object_query = self.roi_head.simple_test(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            imgs_whwh=None,
            rescale=rescale)

        # only the first image of the batch is processed from here on
        bbox_result, segm_result, panoptic_result = cur_segm_results[0]

        panoptic_seg, segments_info = panoptic_result

        cur_results, sseg_results = self.pack_stuff_things_result(panoptic_seg, segments_info)

        if is_first:
            self.track_query = cur_object_query

        if not is_first:
            # decode the fused features with the queries kept from the
            # previous call
            track_seg_results = self.track_roi_head.simple_test(
                    x_fuse,
                    self.track_query,
                    ref_mask_preds,
                    ref_cls_scores,
                    img_metas,
                    imgs_whwh=None,
                    rescale=rescale
            )
            bbox_result, segm_result, panoptic_result = track_seg_results[0]
            track_panoptic_seg, track_segments_info = panoptic_result
            track_results, ref_sseg_results = self.pack_stuff_things_result(track_panoptic_seg, track_segments_info)

            # update the tracking query
            self.track_query = cur_object_query

        if is_first:
            # a new clip starts: reset the tracker state
            self.tracker.reset_all()
            init_track_results = self.tracker.init_track(cur_results)
            track_maps = self.generate_track_id_maps(init_track_results, panoptic_seg)

        elif not is_first:
            results = self.tracker.step(cur_results, track_results)
            track_maps = self.generate_track_id_maps(results, panoptic_seg)

        return cur_segm_results, track_maps, sseg_results

    def forward_dummy(self, img):
        """Forward pass without losses, used for computing network flops.

        See `mmdetection/tools/get_flops.py`

        Args:
            img: Batch of inputs; ``len(img)`` fixes the number of
                placeholder meta dicts.

        Returns:
            The result of ``self.roi_head.forward_dummy``.
        """
        # backbone (+ neck)
        feats = self.extract_feat(img)
        # one identical placeholder meta per image
        placeholder_metas = [
            {'img_shape': (800, 1333, 3)} for _ in range(len(img))
        ]
        # rpn: mask / class / semantic predictions are not needed here
        rpn_out = self.rpn_head.simple_test_rpn(feats, placeholder_metas)
        proposal_feats, x_feats, _, _, _ = rpn_out
        return self.roi_head.forward_dummy(x_feats, proposal_feats,
                                           placeholder_metas)

    def extract_feat(self, img):
        """Extract features with the backbone and, if present, the neck.

        Args:
            img: Input handed directly to ``self.backbone``.

        Returns:
            ``self.neck(backbone_out)`` when ``self.with_neck`` is truthy,
            else the backbone output itself.
        """
        backbone_out = self.backbone(img)
        return self.neck(backbone_out) if self.with_neck else backbone_out

    @property
    def with_rpn(self):
        """bool: whether an ``rpn_head`` attribute is set and not None."""
        head = getattr(self, 'rpn_head', None)
        return head is not None

    @property
    def with_roi_head(self):
        """bool: whether a ``roi_head`` attribute is set and not None."""
        head = getattr(self, 'roi_head', None)
        return head is not None

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Test with augmentations -- not implemented for this detector.

        Every argument is accepted and ignored.

        Returns:
            None, always.
        """
        return None

    def pack_things_object(self, object_feats, ref_object_feats):
        """Select the proposal (thing) queries of the key and reference frame.

        The two trailing dimensions of each input are squeezed, the result is
        split along dim 1 into ``[num_proposals, num_stuff_classes]`` chunks
        and the first chunk is kept.

        Args:
            object_feats: Query features of the key frame.
            ref_object_feats: Query features of the reference frame.

        Returns:
            tuple: ``(thing_object_feats, ref_thing_object_feats)``.
        """
        split_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
        key_flat = object_feats.squeeze(-1).squeeze(-1)
        ref_flat = ref_object_feats.squeeze(-1).squeeze(-1)
        key_things, _ = torch.split(key_flat, split_sizes, dim=1)
        ref_things, _ = torch.split(ref_flat, split_sizes, dim=1)
        return key_things, ref_things

    def add_track_loss(self, loss_dict):
        """Return a new dict whose keys are ``str(key) + "_track"``.

        Args:
            loss_dict (dict): Mapping of loss names to values.

        Returns:
            dict: Same values, renamed keys; the input is not modified.
        """
        return {str(name) + "_track": value for name, value in loss_dict.items()}

    def add_ref_rpn_loss(self, loss_dict):
        """Return a new dict whose keys are ``str(key) + "_ref"``.

        Args:
            loss_dict (dict): Mapping of loss names to values.

        Returns:
            dict: Same values, renamed keys; the input is not modified.
        """
        return {str(name) + "_ref": value for name, value in loss_dict.items()}

    def pack_stuff_things_result(self, panoptic_seg, segments_info):
        """Split a panoptic result into thing masks/scores and a semantic map.

        Args:
            panoptic_seg: Panoptic id map; compared element-wise with each
                segment's ``"id"``.
            segments_info (list[dict]): Records with ``'isthing'``, ``"id"``
                and ``"category_id"``; thing records also need ``"score"``.

        Returns:
            tuple: ``(results, semantic_seg)``. ``results["masks"]`` stacks
            one mask per thing segment and ``results["scores"]`` holds one
            score per thing segment, both via ``np.array``. ``semantic_seg``
            is a float array shaped like ``panoptic_seg`` holding
            ``category_id + 11`` for things, ``category_id - 1`` for stuff
            and 0 elsewhere.
        """
        thing_masks, thing_scores = [], []
        sem_map = np.zeros(panoptic_seg.shape)
        for seg in segments_info:
            is_thing = seg['isthing'] == True
            region = panoptic_seg == seg["id"]
            if is_thing:
                thing_masks.append(region)
                thing_scores.append(seg["score"])
                # for things to shift the labels
                sem_map[region] = seg["category_id"] + 11
            else:
                # for stuff (0 .. n-1)
                sem_map[region] = seg["category_id"] - 1

        packed = {
            "masks": np.array(thing_masks),    # one mask per thing
            "scores": np.array(thing_scores),  # one score per thing
        }
        return packed, sem_map

    def generate_track_id_maps(self, track_results, panopitc_seg_maps):
        """Paint every tracked mask with its tracking id.

        Args:
            track_results: Iterable of dicts with ``"tracking_id"`` and
                ``"mask"`` entries; the mask is used to index the output.
            panopitc_seg_maps: Only its ``shape`` is used, to size the
                output.

        Returns:
            np.ndarray: Float map, 0 where no track applies; where masks
            overlap the later track wins.
        """
        id_map = np.zeros(panopitc_seg_maps.shape)
        for entry in track_results:
            track_id, region = entry["tracking_id"], entry["mask"]
            id_map[region] = track_id
        return id_map

================================================
FILE: knet/video/knet_track_head_roi_align.py
================================================
import warnings
import numpy as np
import torch
import torch.nn.functional as F
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import BaseDetector
from mmdet.models.builder import build_head, build_neck, build_backbone
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes


@DETECTORS.register_module()
class VideoKNetFuseROITrack(BaseDetector):
    """
        Simple Extension of KNet to Video KNet by the implementation of VPSFuse Net.
    """
    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 track_head=None,
                 extra_neck=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cityscapes=False,
                 **kwargs):
        """Build the sub-modules of the detector from their configs.

        Args:
            backbone (dict): Config passed to ``build_backbone``.
            neck (dict, optional): Config passed to ``build_neck``.
            rpn_head (dict, optional): Head config; a copy is extended with
                the ``rpn`` entries of ``train_cfg`` / ``test_cfg``.
            roi_head (dict, optional): Head config; updated in place with
                the ``rcnn`` entries of ``train_cfg`` / ``test_cfg``.
            track_head (dict, optional): Config passed to ``build_head``.
            extra_neck (dict, optional): Config passed to ``build_neck`` and
                stored as ``self.extra_neck``.
            train_cfg (optional): Training config exposing ``rpn``/``rcnn``.
            test_cfg (optional): Testing config exposing ``rpn``/``rcnn``.
            pretrained (str, optional): Deprecated; forwarded to the backbone
                and RoI head configs.
            init_cfg (optional): Forwarded to the base class constructor.
            num_thing_classes (int): Number of thing classes.
            num_stuff_classes (int): Number of stuff classes.
            mask_assign_stride (int): Down-sampling factor applied to the
                ground-truth masks in ``preprocess_gt_masks``.
            ignore_label (int): Label written into padded semantic regions.
            thing_label_in_seg (int): Forwarded to ``sem2ins_masks``.
            cityscapes (bool): Select ``sem2ins_masks_cityscapes`` when
                converting semantic maps.
            **kwargs: Accepted for config compatibility and ignored.
        """
        super(VideoKNetFuseROITrack, self).__init__(init_cfg)

        if pretrained:
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            backbone.pretrained = pretrained
        self.backbone = build_backbone(backbone)

        if neck is not None:
            self.neck = build_neck(neck)

        if extra_neck is not None:
            self.extra_neck = build_neck(extra_neck)

        if rpn_head is not None:
            # Both configs default to None, so guard both the same way.
            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
            rpn_test_cfg = test_cfg.rpn if test_cfg is not None else None
            rpn_head_ = rpn_head.copy()
            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=rpn_test_cfg)
            self.rpn_head = build_head(rpn_head_)

        if roi_head is not None:
            # update train and test cfg here for now
            # TODO: refactor assigner & sampler
            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
            rcnn_test_cfg = test_cfg.rcnn if test_cfg is not None else None
            roi_head.update(train_cfg=rcnn_train_cfg)
            roi_head.update(test_cfg=rcnn_test_cfg)
            roi_head.pretrained = pretrained
            self.roi_head = build_head(roi_head)

        if track_head is not None:
            self.track_head = build_head(track_head)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation

    def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad and down-sample ground-truth masks to the assignment size.

        Instance masks are zero-padded (right/bottom) to the batch input
        shape and resized bilinearly to
        ``batch_input_shape // mask_assign_stride``. When a semantic map is
        given, its padded border is overwritten in place with
        ``ignore_label`` before it is converted into per-class masks.

        Args:
            img_metas (list[dict]): Per-image metas. ``'batch_input_shape'``
                is read from the first entry; ``'img_shape'`` is read from
                every entry when ``gt_semantic_seg`` is given.
            gt_masks (list): Per-image mask containers exposing ``width``,
                ``height`` and ``to_tensor(dtype, device)``.
            gt_labels (list[Tensor]): Only the device of the first entry is
                used.
            gt_semantic_seg (Tensor | None): Batched semantic map indexed as
                ``[i, :, h, w]``; modified in place.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The first
                item is a list with one resized mask stack per image. The
                other two are per-image lists, or ``None`` when
                ``gt_semantic_seg`` is ``None`` and at least one image was
                processed.
        """
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_size = (pad_H // self.mask_assign_stride,
                       pad_W // self.mask_assign_stride)

        def _resize(stack):
            # downsample to 1/mask_assign_stride resolution
            return F.interpolate(
                stack.unsqueeze(0), assign_size,
                mode='bilinear', align_corners=False).squeeze(0)

        gt_masks_tensor, gt_sem_seg, gt_sem_cls = [], [], []
        for idx, inst_masks in enumerate(gt_masks):
            # gt_masks and gt_semantic_seg are not padded when forming batch
            mask_tensor = inst_masks.to_tensor(torch.float, gt_labels[0].device)
            if not (inst_masks.width == pad_W and inst_masks.height == pad_H):
                padding = (0, pad_W - inst_masks.width,
                           0, pad_H - inst_masks.height)
                mask_tensor = F.pad(mask_tensor, padding, value=0)

            if gt_semantic_seg is None:
                gt_sem_seg = None
                gt_sem_cls = None
            else:
                # gt_semantic_seg is padded by zero when forming a batch,
                # so the padded area has to be switched to the ignore label
                img_shape = img_metas[idx]['img_shape']
                gt_semantic_seg[idx, :, img_shape[0]:, :] = self.ignore_label
                gt_semantic_seg[idx, :, :, img_shape[1]:] = self.ignore_label
                if self.cityscapes:
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes)
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                if sem_seg.shape[0] == 0:
                    sem_resized = mask_tensor.new_zeros(
                        (mask_tensor.size(0), *assign_size))
                else:
                    sem_resized = _resize(sem_seg)
                gt_sem_seg.append(sem_resized)
                gt_sem_cls.append(sem_labels)

            if mask_tensor.shape[0] == 0:
                inst_resized = mask_tensor.new_zeros(
                    (mask_tensor.size(0), *assign_size))
            else:
                inst_resized = _resize(mask_tensor)
            gt_masks_tensor.append(inst_resized)

        return gt_masks_tensor, gt_sem_cls, gt_sem_seg

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      ref_gt_instance_ids=None,
                      proposals=None,
                      **kwargs):
        """Compute the training losses for a key frame and its reference frame.

        Both frames go through the kernel (RPN) head and the RoI head; the
        track head then matches the thing masks of the two frames. Losses of
        the reference frame are stored with a ``_ref`` suffix.

        Args:
            img (Tensor): Key-frame images of shape (N, C, H, W).
            img_metas (list[dict]): Per-image meta dicts;
                ``'batch_input_shape'`` is written into each of them.
            gt_bboxes (list[Tensor], optional): Forwarded to the RoI head.
            gt_labels (list[Tensor]): Class indices of the key-frame masks.
            gt_bboxes_ignore (list[Tensor], optional): Forwarded to the RoI
                head.
            gt_masks (list): Key-frame instance masks; required.
            gt_semantic_seg (Tensor, optional): Key-frame semantic maps.
            gt_instance_ids (list[Tensor]): Instance ids of the key-frame
                masks; required.
            ref_img (Tensor): Reference images; dim 1 is squeezed, so
                presumably (N, 1, C, H, W) - TODO confirm.
            ref_img_metas (list[list[dict]]): Per-image lists of reference
                metas; only the first dict of each list is used.
            ref_gt_bboxes_ignore (optional): Forwarded to the RoI head.
            ref_gt_labels (list[Tensor]): Column 1 of each tensor is used as
                the class index.
            ref_gt_bboxes (optional): Forwarded to the RoI head.
            ref_gt_masks (list): Per-image sequences of reference masks; only
                the first element of each sequence is used.
            ref_gt_semantic_seg (Tensor): Reference semantic maps; dim 1 is
                squeezed.
            ref_gt_instance_ids (list[Tensor]): Column 1 of each tensor is
                used as the instance id.
            proposals: Must be ``None``; external proposals are unsupported.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None
        assert gt_instance_ids is not None

        # preprocess the reference images
        ref_img = ref_img.squeeze(1)  # (b,3,h,w)
        ref_masks_gt = [ref_gt_mask[0] for ref_gt_mask in ref_gt_masks]
        ref_labels_gt = [
            ref_gt_label[:, 1].long() for ref_gt_label in ref_gt_labels
        ]
        ref_gt_labels = ref_labels_gt
        ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)
        ref_gt_instance_id_list = [
            ref_gt_instance_id[:, 1].long()
            for ref_gt_instance_id in ref_gt_instance_ids
        ]

        # Flatten list[list[dict]] to list[dict] (first reference frame only).
        ref_img_metas_new = []
        for ref_img_meta in ref_img_metas:
            ref_img_meta[0]['batch_input_shape'] = batch_input_shape
            ref_img_metas_new.append(ref_img_meta[0])

        # For every key-frame instance store 1 + index of the reference
        # instance with the same id, or 0 when it has no match.
        gt_pids = []
        for ref_ids, key_ids in zip(ref_gt_instance_id_list, gt_instance_ids):
            first_pos = {}
            for pos, ref_id in enumerate(ref_ids.tolist()):
                # keep the first occurrence, like list.index would
                first_pos.setdefault(ref_id, pos)
            pids = [first_pos.get(key_id, -1) + 1 for key_id in key_ids.tolist()]
            gt_pids.append(
                torch.tensor(pids, dtype=torch.long, device=img.device))

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(
            img_metas, gt_masks, gt_labels, gt_semantic_seg)

        ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(
            ref_img_metas_new, ref_masks_gt, ref_labels_gt, ref_semantic_seg_gt)

        x = self.extract_feat(img)
        x_ref = self.extract_feat(ref_img)
        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)

        ref_rpn_results = self.rpn_head.forward_train(
            x_ref, ref_img_metas_new, ref_gt_masks, ref_labels_gt,
            ref_gt_sem_seg, ref_gt_sem_cls)

        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        (ref_rpn_losses, ref_proposal_feats, ref_x_feats, ref_mask_preds,
         ref_cls_scores) = ref_rpn_results

        losses, sample_results, _, mask_preds = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_pids=gt_pids,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

        # Pass the flattened metas (list[dict]), matching the key-frame call
        # and the reference RPN call above.
        ref_losses, _, _, ref_mask_preds = self.roi_head.forward_train(
            ref_x_feats,
            ref_proposal_feats,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
            ref_gt_masks,
            ref_gt_labels,
            gt_bboxes=ref_gt_bboxes,
            gt_bboxes_ignore=ref_gt_bboxes_ignore,
            gt_sem_seg=ref_gt_sem_seg,
            gt_sem_cls=ref_gt_sem_cls,
            imgs_whwh=None)
        proposals_nums = [self.roi_head.num_proposals] * img.size()[0]
        ref_proposals_nums = proposals_nums

        thing_mask_preds, ref_thing_mask_preds = self.pack_things_masks(
            mask_preds, ref_mask_preds)
        match_score = self.track_head(x, x_ref, thing_mask_preds,
                                      ref_thing_mask_preds, proposals_nums,
                                      ref_proposals_nums)

        track_loss = self.track_head.loss(match_score, sample_results)

        # format the loss
        ref_rpn_losses = self.add_ref_rpn_loss(ref_rpn_losses)
        ref_losses = self.add_ref_rpn_loss(ref_losses)

        losses.update(ref_rpn_losses)
        losses.update(ref_losses)
        losses.update(track_loss)
        losses.update(rpn_losses)

        return losses

    def simple_test(self, img, img_metas, rescale=False, ref_img=None):
        """Test function without test time augmentation.

        Segments the current frame and associates its thing masks with the
        tracker state. The first frame of a clip (``iid % 10000 == 1``)
        resets the tracker; later frames additionally process ``ref_img``.

        NOTE(review): this method uses ``self.combine``,
        ``self.track_roi_head`` and ``self.tracker``; none of them is set in
        this class's ``__init__`` - confirm where they are expected to come
        from before relying on this code path.

        Args:
            img (Tensor): Current frame.
            img_metas (list[dict]): List of image information; the first
                entry must contain ``'iid'`` and a ``'filename'`` containing
                ``'city'``.
            rescale (bool): Whether to rescale the results.
                Defaults to False.
            ref_img (optional): Reference frame container; its first element
                is used.

        Returns:
            tuple: ``(cur_segm_results, track_maps, sseg_results)`` - the RoI
                head results of the current frame, the tracking-id map and
                the semantic map built by ``pack_stuff_things_result``.
        """

        if ref_img is not None:
            ref_img = ref_img[0]
        # whether is the first frame for such clips
        assert 'city' in img_metas[0]['filename'] and 'iid' in img_metas[0]
        iid = img_metas[0]['iid']
        # the frame index inside the clip is stored in the low four digits
        fid = iid % 10000
        is_first = (fid == 1)

        # for current frame
        x = self.extract_feat(img)
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results

        if not is_first:
            # NOTE(review): ref_img stays None when the caller did not pass
            # one; the reference frame reuses the key frame's img_metas.
            ref_x = self.extract_feat(ref_img)
            ref_rpn_results = self.rpn_head.simple_test_rpn(ref_x, img_metas)
            (ref_proposal_feats, ref_x_feats, ref_mask_preds, ref_cls_scores,
             ref_seg_preds) = ref_rpn_results
            x_fuse = self.combine(ref_x_feats + x_feats)

        cur_segm_results, cur_object_query = self.roi_head.simple_test(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            imgs_whwh=None,
            rescale=rescale)

        # only the first image of the batch is unpacked
        bbox_result, segm_result, panoptic_result = cur_segm_results[0]

        panoptic_seg, segments_info = panoptic_result

        cur_results, sseg_results = self.pack_stuff_things_result(panoptic_seg, segments_info)

        if is_first:
            # remember the object queries of the first frame for the next one
            self.track_query = cur_object_query

        if not is_first:
            track_seg_results = self.track_roi_head.simple_test(
                    x_fuse,
                    self.track_query,
                    ref_mask_preds,
                    ref_cls_scores,
                    img_metas,
                    imgs_whwh=None,
                    rescale=rescale
            )
            bbox_result, segm_result, panoptic_result = track_seg_results[0]
            track_panoptic_seg, track_segments_info = panoptic_result
            track_results, ref_sseg_results = self.pack_stuff_things_result(track_panoptic_seg, track_segments_info)

            # update the tracking query
            self.track_query = cur_object_query

        if is_first:
            # start a fresh set of tracks for the new clip
            self.tracker.reset_all()
            init_track_results = self.tracker.init_track(cur_results)
            track_maps = self.generate_track_id_maps(init_track_results, panoptic_seg)

        elif not is_first:
            results = self.tracker.step(cur_results, track_results)
            track_maps = self.generate_track_id_maps(results, panoptic_seg)

        return cur_segm_results, track_maps, sseg_results

    def forward_dummy(self, img):
        """Run backbone, kernel head and RoI head with placeholder metas.

        Used for computing network flops, see
        `mmdetection/tools/get_flops.py`.
        """
        # backbone
        feats = self.extract_feat(img)
        # rpn: one placeholder meta dict per input image
        dummy_img_metas = [
            dict(img_shape=(800, 1333, 3)) for _ in range(len(img))
        ]
        proposal_feats, x_feats, _mask_preds, _cls_scores, _seg_preds = \
            self.rpn_head.simple_test_rpn(feats, dummy_img_metas)
        return self.roi_head.forward_dummy(x_feats, proposal_feats,
                                           dummy_img_metas)

    def extract_feat(self, img):
        """Run the backbone and, when ``with_neck`` is set, the neck."""
        feats = self.backbone(img)
        return self.neck(feats) if self.with_neck else feats

    @property
    def with_rpn(self):
        """bool: True when an ``rpn_head`` attribute exists and is not None."""
        return getattr(self, 'rpn_head', None) is not None

    @property
    def with_roi_head(self):
        """bool: True when a ``roi_head`` attribute exists and is not None."""
        return getattr(self, 'roi_head', None) is not None

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Placeholder for testing with augmentations; not implemented.

        All arguments are accepted and ignored, and ``None`` is returned.
        """
        return None

    def pack_things_object(self, object_feats, ref_object_feats):
        """Keep only the thing part of the object features of both frames.

        The two trailing dimensions are squeezed, then dim 1 is split into
        ``[roi_head.num_proposals, num_stuff_classes]`` and the first chunk
        (the proposals) is returned for each input.
        """
        flat_feats = object_feats.squeeze(-1).squeeze(-1)
        flat_ref_feats = ref_object_feats.squeeze(-1).squeeze(-1)

        def _thing_chunk(feats):
            chunk_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
            return torch.split(feats, chunk_sizes, dim=1)[0]

        return _thing_chunk(flat_feats), _thing_chunk(flat_ref_feats)

    def pack_things_masks(self, mask_pred, ref_mask_pred):
        """Keep only the thing part of the mask predictions of both frames.

        Dim 1 of each input is split into
        ``[roi_head.num_proposals, num_stuff_classes]`` and the first chunk
        (the proposals) is returned.
        """
        def _thing_chunk(masks):
            chunk_sizes = [self.roi_head.num_proposals, self.num_stuff_classes]
            return torch.split(masks, chunk_sizes, dim=1)[0]

        thing_masks = _thing_chunk(mask_pred)
        ref_thing_masks = _thing_chunk(ref_mask_pred)
        return thing_masks, ref_thing_masks

    def add_track_loss(self, loss_dict):
        """Return a copy of ``loss_dict`` whose keys carry a ``_track`` suffix.

        Args:
            loss_dict (dict): Mapping from loss name to loss value.

        Returns:
            dict: New dict keyed by ``str(key) + "_track"`` with the original
                values; ``loss_dict`` itself is left untouched.
        """
        return {str(name) + "_track": value for name, value in loss_dict.items()}

    def add_ref_rpn_loss(self, loss_dict):
        """Return a copy of ``loss_dict`` whose keys carry a ``_ref`` suffix.

        Args:
            loss_dict (dict): Mapping from loss name to loss value.

        Returns:
            dict: New dict keyed by ``str(key) + "_ref"`` with the original
                values; ``loss_dict`` itself is left untouched.
        """
        return {str(name) + "_ref": value for name, value in loss_dict.items()}

    def pack_stuff_things_result(self, panoptic_seg, segments_info):
        """Split a panoptic prediction into thing instances and a semantic map.

        Args:
            panoptic_seg: Id map whose entries are compared against
                ``segment["id"]``; its ``shape`` defines the output shape.
            segments_info (list[dict]): One dict per segment with the keys
                ``"id"``, ``"isthing"`` and ``"category_id"``; thing segments
                additionally need ``"score"``.

        Returns:
            tuple: ``(results, semantic_seg)``.
                ``results["masks"]`` stacks one mask per thing segment
                (N, *panoptic_seg.shape) and ``results["scores"]`` holds the
                matching scores (N,); both are empty arrays when there is no
                thing segment. ``semantic_seg`` is a float array shaped like
                ``panoptic_seg``; positions not covered by any listed segment
                keep the initial value 0.
        """
        masks = []
        scores = []
        semantic_seg = np.zeros(panoptic_seg.shape)
        for segment in segments_info:
            # Build the segment mask once and reuse it for every output.
            segment_mask = panoptic_seg == segment["id"]
            if segment['isthing']:
                masks.append(segment_mask)
                scores.append(segment["score"])
                # Thing labels are shifted by a hard-coded offset of 11.
                # NOTE(review): presumably the number of stuff classes of the
                # target dataset - confirm before using with other datasets.
                semantic_seg[segment_mask] = segment["category_id"] + 11
            else:
                # Stuff category ids are shifted down by one.
                semantic_seg[segment_mask] = segment["category_id"] - 1

        results = {
            "masks": np.array(masks),  # (N, *panoptic_seg.shape)
            "scores": np.array(scores),  # (N,)
        }
        return results, semantic_seg

    def generate_track_id_maps(self, track_results, panopitc_seg_maps):
        """Paint every track's id onto a zero-initialised map.

        Args:
            track_results (iterable[dict]): Tracks providing the keys
                ``"tracking_id"`` and ``"mask"``; the mask is used to index
                the output array.
            panopitc_seg_maps: Object whose ``shape`` defines the output shape.

        Returns:
            np.ndarray: Float array of that shape. Positions selected by a
                track's mask hold its tracking id (later tracks overwrite
                earlier ones); all other positions stay 0.
        """
        id_map = np.zeros(panopitc_seg_maps.shape)
        for track in track_results:
            track_id, track_mask = track["tracking_id"], track["mask"]
            id_map[track_mask] = track_id
        return id_map

================================================
FILE: knet/video/knet_uni_track.py
================================================
import warnings
import numpy as np
import torch
import torch.nn.functional as F
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import BaseDetector
from mmdet.models.builder import build_head, build_neck, build_backbone
from knet.det.utils import sem2ins_masks, sem2ins_masks_cityscapes
from unitrack.mask import MaskAssociationTracker


@DETECTORS.register_module()
class VideoKNetUniTrack(BaseDetector):
    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 track_roi_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 kitti_step=False,
                 cityscapes=False,
                 uni_tracker_cfg=None,
                 **kwargs):
        """Build the sub-modules of the detector and its mask tracker.

        Args:
            backbone (dict): Config passed to ``build_backbone``.
            neck (dict, optional): Config passed to ``build_neck``.
            rpn_head (dict, optional): Head config; a copy is extended with
                the ``rpn`` entries of ``train_cfg`` / ``test_cfg``.
            roi_head (dict, optional): Head config; updated in place with
                the ``rcnn`` entries of ``train_cfg`` / ``test_cfg``.
            track_roi_head (optional): Accepted but not used here.
                NOTE(review): ``forward_train`` reads
                ``self.track_roi_head``, which this constructor never sets -
                confirm the intended wiring.
            train_cfg (optional): Training config exposing ``rpn``/``rcnn``.
            test_cfg (optional): Testing config exposing ``rpn``/``rcnn``.
            pretrained (str, optional): Deprecated; forwarded to the backbone
                and RoI head configs.
            init_cfg (optional): Forwarded to the base class constructor.
            num_thing_classes (int): Number of thing classes.
            num_stuff_classes (int): Number of stuff classes.
            mask_assign_stride (int): Down-sampling factor applied to the
                ground-truth masks in ``preprocess_gt_masks``.
            ignore_label (int): Label written into padded semantic regions.
            thing_label_in_seg (int): Forwarded to ``sem2ins_masks``.
            kitti_step (bool): Stored as ``self.kitti_step``.
            cityscapes (bool): Select ``sem2ins_masks_cityscapes`` when
                converting semantic maps.
            uni_tracker_cfg (optional): Passed to ``MaskAssociationTracker``.
            **kwargs: Accepted for config compatibility and ignored.
        """
        super(VideoKNetUniTrack, self).__init__(init_cfg)

        if pretrained:
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            backbone.pretrained = pretrained
        self.backbone = build_backbone(backbone)

        if neck is not None:
            self.neck = build_neck(neck)

        if rpn_head is not None:
            # Both configs default to None, so guard both the same way.
            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
            rpn_test_cfg = test_cfg.rpn if test_cfg is not None else None
            rpn_head_ = rpn_head.copy()
            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=rpn_test_cfg)
            self.rpn_head = build_head(rpn_head_)

        if roi_head is not None:
            # update train and test cfg here for now
            # TODO: refactor assigner & sampler
            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
            rcnn_test_cfg = test_cfg.rcnn if test_cfg is not None else None
            roi_head.update(train_cfg=rcnn_train_cfg)
            roi_head.update(test_cfg=rcnn_test_cfg)
            roi_head.pretrained = pretrained
            self.roi_head = build_head(roi_head)

        self.tracker = MaskAssociationTracker(uni_tracker_cfg)
        self.img0 = None
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.ignore_label = ignore_label
        self.cityscapes = cityscapes  # whether to train the cityscape panoptic segmentation
        self.kitti_step = kitti_step  # whether to use kitti step dataset

    def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad and down-sample ground-truth masks to the assignment size.

        Instance masks are zero-padded (right/bottom) to the batch input
        shape and resized bilinearly to
        ``batch_input_shape // mask_assign_stride``. When a semantic map is
        given, its padded border is overwritten in place with
        ``ignore_label`` before it is converted into per-class masks.

        Args:
            img_metas (list[dict]): Per-image metas. ``'batch_input_shape'``
                is read from the first entry; ``'img_shape'`` is read from
                every entry when ``gt_semantic_seg`` is given.
            gt_masks (list): Per-image mask containers exposing ``width``,
                ``height`` and ``to_tensor(dtype, device)``.
            gt_labels (list[Tensor]): Only the device of the first entry is
                used.
            gt_semantic_seg (Tensor | None): Batched semantic map indexed as
                ``[i, :, h, w]``; modified in place.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_cls, gt_sem_seg)``. The first
                item is a list with one resized mask stack per image. The
                other two are per-image lists, or ``None`` when
                ``gt_semantic_seg`` is ``None`` and at least one image was
                processed.
        """
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_size = (pad_H // self.mask_assign_stride,
                       pad_W // self.mask_assign_stride)

        def _resize(stack):
            # downsample to 1/mask_assign_stride resolution
            return F.interpolate(
                stack.unsqueeze(0), assign_size,
                mode='bilinear', align_corners=False).squeeze(0)

        gt_masks_tensor, gt_sem_seg, gt_sem_cls = [], [], []
        for idx, inst_masks in enumerate(gt_masks):
            # gt_masks and gt_semantic_seg are not padded when forming batch
            mask_tensor = inst_masks.to_tensor(torch.float, gt_labels[0].device)
            if not (inst_masks.width == pad_W and inst_masks.height == pad_H):
                padding = (0, pad_W - inst_masks.width,
                           0, pad_H - inst_masks.height)
                mask_tensor = F.pad(mask_tensor, padding, value=0)

            if gt_semantic_seg is None:
                gt_sem_seg = None
                gt_sem_cls = None
            else:
                # gt_semantic_seg is padded by zero when forming a batch,
                # so the padded area has to be switched to the ignore label
                img_shape = img_metas[idx]['img_shape']
                gt_semantic_seg[idx, :, img_shape[0]:, :] = self.ignore_label
                gt_semantic_seg[idx, :, :, img_shape[1]:] = self.ignore_label
                if self.cityscapes:
                    sem_labels, sem_seg = sem2ins_masks_cityscapes(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes)
                else:
                    sem_labels, sem_seg = sem2ins_masks(
                        gt_semantic_seg[idx],
                        ignore_label=self.ignore_label,
                        label_shift=self.num_thing_classes,
                        thing_label_in_seg=self.thing_label_in_seg)

                if sem_seg.shape[0] == 0:
                    sem_resized = mask_tensor.new_zeros(
                        (mask_tensor.size(0), *assign_size))
                else:
                    sem_resized = _resize(sem_seg)
                gt_sem_seg.append(sem_resized)
                gt_sem_cls.append(sem_labels)

            if mask_tensor.shape[0] == 0:
                inst_resized = mask_tensor.new_zeros(
                    (mask_tensor.size(0), *assign_size))
            else:
                inst_resized = _resize(mask_tensor)
            gt_masks_tensor.append(inst_resized)

        return gt_masks_tensor, gt_sem_cls, gt_sem_seg

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_labels=None,
                      ref_gt_masks=None,
                      ref_gt_semantic_seg=None,
                      proposals=None,
                      **kwargs):
        """Compute the training losses for a key frame and its reference frame.

        Both frames go through the kernel (RPN) head. The key frame is then
        refined by ``roi_head`` and its object queries are passed, together
        with the fused features, to ``track_roi_head`` to predict the
        reference frame.

        NOTE(review): ``self.combine`` and ``self.track_roi_head`` are used
        below but are not created in this class's ``__init__`` - confirm
        where they are expected to come from.

        Args:
            img (Tensor): Key-frame images of shape (N, C, H, W).
            img_metas (list[dict]): Per-image meta dicts;
                ``'batch_input_shape'`` is written into each of them.
            gt_bboxes (list[Tensor], optional): Forwarded to the RoI head.
            gt_labels (list[Tensor]): Class indices of the key-frame masks.
            gt_bboxes_ignore (list[Tensor], optional): Forwarded to the RoI
                head.
            gt_masks (list): Key-frame instance masks; required.
            gt_semantic_seg (Tensor, optional): Key-frame semantic maps.
            ref_img (Tensor): Reference images; dim 1 is squeezed, so
                presumably (N, 1, C, H, W) - TODO confirm.
            ref_img_metas (list[list[dict]]): Per-image lists of reference
                metas; only the first dict of each list is used.
            ref_gt_bboxes_ignore (optional): Not used in this method.
            ref_gt_labels (list[Tensor]): Column 1 of each tensor is used as
                the class index.
            ref_gt_masks (list): Per-image sequences of reference masks; only
                the first element of each sequence is used.
            ref_gt_semantic_seg (Tensor): Reference semantic maps; dim 1 is
                squeezed.
            proposals: Must be ``None``; external proposals are unsupported.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None

        # preprocess the reference images
        ref_img = ref_img.squeeze(1)  # (b,3,h,w)
        # keep only the first reference mask container of every image
        ref_masks_gt = []
        for ref_gt_mask in ref_gt_masks:
            ref_masks_gt.append(ref_gt_mask[0])
        # column 1 holds the class index of every reference instance
        ref_labels_gt = []
        for ref_gt_label in ref_gt_labels:
            ref_labels_gt.append(ref_gt_label[:, 1].long())
        ref_gt_labels = ref_labels_gt
        ref_semantic_seg_gt = ref_gt_semantic_seg.squeeze(1)

        # flatten list[list[dict]] to list[dict] (first reference frame only)
        ref_img_metas_new = []
        for ref_img_meta in ref_img_metas:
            ref_img_meta[0]['batch_input_shape'] = batch_input_shape
            ref_img_metas_new.append(ref_img_meta[0])

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks, gt_sem_cls, gt_sem_seg = self.preprocess_gt_masks(img_metas, gt_masks, gt_labels, gt_semantic_seg)

        ref_gt_masks, ref_gt_sem_cls, ref_gt_sem_seg = self.preprocess_gt_masks(ref_img_metas_new,
                                                                    ref_masks_gt, ref_labels_gt, ref_semantic_seg_gt)
        x = self.extract_feat(img)
        x_ref = self.extract_feat(ref_img)
        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)

        ref_rpn_results = self.rpn_head.forward_train(x_ref, ref_img_metas_new, ref_gt_masks,
                                                  ref_labels_gt, ref_gt_sem_seg,
                                                  ref_gt_sem_cls)

        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        (ref_rpn_losses, ref_proposal_feats, ref_x_feats, ref_mask_preds,
         ref_cls_scores) = ref_rpn_results

        # NOTE(review): self.combine is not defined in the visible part of
        # this class.
        x_fuse = self.combine(ref_x_feats + x_feats)

        losses, cur_object_query = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

        # the key-frame object queries are supervised with the reference
        # frame's ground truth
        track_query_loss = self.track_roi_head.forward_train(
            x_fuse,
            cur_object_query,
            ref_mask_preds,
            ref_cls_scores,
            ref_img_metas_new,
            ref_gt_masks,
            ref_gt_labels,
            gt_sem_seg=ref_gt_sem_seg,
            gt_sem_cls=ref_gt_sem_cls,
            imgs_whwh=None
        )

        # suffix the keys so they do not clash with the key-frame losses
        track_query_loss = self.add_track_loss(track_query_loss)
        ref_rpn_losses = self.add_ref_rpn_loss(ref_rpn_losses)
        # single frame loss
        # query track loss for reference frame
        losses.update(ref_rpn_losses)
        losses.update(rpn_losses)
        losses.update(track_query_loss)

        return losses

    def simple_test(self, img, img_metas, rescale=False, ref_img=None):
        """Segment one frame and link its thing masks to tracks.

        Args:
            img: Current frame, forwarded to ``extract_feat`` and to the
                tracker.
            img_metas (list[dict]): Image information. The first entry must
                hold an ``'iid'`` key and a ``'filename'`` containing
                ``'city'``.
            rescale (bool): Not used by this method. Defaults to False.
            ref_img: If given, only its first element is selected; it is not
                used afterwards. Defaults to None.

        Returns:
            tuple: ``(semantic_map, track_maps, None, vis_sem, vis_tracker)``
                where ``semantic_map`` comes from ``get_semantic_seg``,
                ``track_maps`` holds the per-pixel track ids (all zeros when
                no thing mask was found) and the last two entries are the
                colorized versions produced by ``scripts.visualizer``.
        """
        if ref_img is not None:
            ref_img = ref_img[0]

        first_meta = img_metas[0]
        assert 'city' in first_meta['filename'] and 'iid' in first_meta
        # iid % 10000 is the frame id; frame id 1 marks the first frame of
        # a clip
        is_first = (first_meta['iid'] % 10000) == 1

        # single-frame prediction for the current image
        feats = self.extract_feat(img)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = self.rpn_head.simple_test_rpn(feats, img_metas)

        roi_outputs = self.roi_head.simple_test(
            x_feats, proposal_feats, mask_preds, cls_scores, img_metas)
        (cur_segm_results, object_feats, cls_score, mask_preds,
         scaled_mask_preds) = roi_outputs

        _, _, _, panoptic_result = cur_segm_results[0]
        panoptic_seg, segments_info = panoptic_result

        cur_results, _ = self.pack_stuff_things_result(
            panoptic_seg, segments_info)

        if is_first:
            # a new clip starts: remember its first frame and drop old tracks
            self.img0 = img
            self.tracker.reset_all()

        thing_masks = cur_results["masks"]
        if len(thing_masks) == 0:
            track_maps = np.zeros(panoptic_seg.shape)
        else:
            tracked = self.tracker.update(img, self.img0, thing_masks)
            track_maps = self.generate_track_id_maps(tracked, panoptic_seg)

        semantic_map = self.get_semantic_seg(panoptic_seg, segments_info)

        # imported lazily; the import list is kept exactly as before
        from scripts.visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img
        vis_tracker = trackmap2rgb(track_maps)
        vis_sem = cityscapes_cat2rgb(semantic_map)

        return semantic_map, track_maps, None, vis_sem, vis_tracker

    def forward_dummy(self, img):
        """Run backbone, RPN head and RoI head once for FLOPs counting.

        See `mmdetection/tools/get_flops.py`.

        Args:
            img: Batch of input images; ``len(img)`` sets the number of
                dummy meta dicts.

        Returns:
            Whatever ``self.roi_head.forward_dummy`` returns.
        """
        # backbone (+ neck)
        feats = self.extract_feat(img)
        # one placeholder meta per image with a fixed 800x1333x3 shape
        dummy_img_metas = [
            dict(img_shape=(800, 1333, 3)) for _ in range(len(img))
        ]
        # rpn: only the proposal and image features are needed further on
        proposal_feats, x_feats, _, _, _ = self.rpn_head.simple_test_rpn(
            feats, dummy_img_metas)
        # roi_head
        return self.roi_head.forward_dummy(x_feats, proposal_feats,
                                           dummy_img_metas)

    def extract_feat(self, img):
        """Run the backbone and, when ``with_neck`` is set, the neck."""
        feats = self.backbone(img)
        return self.neck(feats) if self.with_neck else feats

    @property
    def with_rpn(self):
        """bool: True when an ``rpn_head`` attribute exists and is not None."""
        return getattr(self, 'rpn_head', None) is not None

    @property
    def with_roi_head(self):
        """bool: True when a ``roi_head`` attribute exists and is not None."""
        return getattr(self, 'roi_head', None) is not None

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Placeholder for test-time augmentation.

        No augmentation is implemented: every argument is ignored and the
        method returns None.
        """
        return None

    def add_track_loss(self, loss_dict):
        """Return a copy of ``loss_dict`` with ``"_track"`` appended to keys.

        Args:
            loss_dict (dict): Mapping from loss name to loss value.

        Returns:
            dict: New mapping ``str(name) + "_track" -> value``.
        """
        return {
            f"{name!s}_track": value
            for name, value in loss_dict.items()
        }

    def add_ref_rpn_loss(self, loss_dict):
        """Return a copy of ``loss_dict`` with ``"_ref"`` appended to keys.

        Args:
            loss_dict (dict): Mapping from loss name to loss value.

        Returns:
            dict: New mapping ``str(name) + "_ref" -> value``.
        """
        return {
            f"{name!s}_ref": value
            for name, value in loss_dict.items()
        }

    def pack_stuff_things_result(self, panoptic_seg, segments_info):
        """Split a panoptic prediction into thing masks and a semantic map.

        Args:
            panoptic_seg: Array of segment ids; each segment is selected with
                ``panoptic_seg == segment["id"]``.
            segments_info (list[dict]): One dict per segment with the keys
                ``"id"``, ``"isthing"`` and ``"category_id"``; thing
                segments additionally need ``"score"``.

        Returns:
            tuple: ``(results, semantic_seg)``. ``results["masks"]`` stacks
                one boolean mask per thing segment and ``results["scores"]``
                holds the matching scores. ``semantic_seg`` has the shape of
                ``panoptic_seg``; thing pixels are set to
                ``category_id + 11`` and stuff pixels to ``category_id - 1``.
        """
        thing_masks = []
        thing_scores = []
        semantic_seg = np.zeros(panoptic_seg.shape)
        for seg in segments_info:
            region = panoptic_seg == seg["id"]
            if seg['isthing'] == True:  # noqa: E712 - exact comparison kept
                thing_masks.append(region)
                thing_scores.append(seg["score"])
                # thing labels are shifted by 11
                semantic_seg[region] = seg["category_id"] + 11
            else:
                # stuff labels become zero-based
                semantic_seg[region] = seg["category_id"] - 1

        results = {
            "masks": np.array(thing_masks),  # one mask per thing segment
            "scores": np.array(thing_scores),  # one score per thing segment
        }
        return results, semantic_seg

    def generate_track_id_maps(self, track_results, panopitc_seg_maps):
        """Paint every track's id onto a zero-initialised map.

        Args:
            track_results: Iterable of objects exposing ``track_id`` and
                ``mask``; ``mask`` is used to index the output map.
            panopitc_seg_maps: Array whose ``shape`` defines the output
                shape.

        Returns:
            np.ndarray: Map of track ids, 0 where no track mask applies.
            Later tracks overwrite earlier ones where masks overlap.
        """
        id_map = np.zeros(panopitc_seg_maps.shape)
        for track in track_results:
            id_map[track.mask] = track.track_id
        return id_map

    def get_semantic_seg(self, panoptic_seg, segments_info):
        """Convert a panoptic prediction into a semantic label map.

        When ``self.kitti_step`` is false, thing segments get
        ``category_id + 11`` and stuff segments ``category_id - 1``.
        When it is true, thing categories 0 and 1 map to labels 11 and 13,
        and the zero-based stuff labels are shifted upwards so that they
        skip those two values.

        Args:
            panoptic_seg: Array of segment ids.
            segments_info (list[dict]): One dict per segment with the keys
                ``"id"``, ``"isthing"`` and ``"category_id"``.

        Returns:
            np.ndarray: Label map with the shape of ``panoptic_seg``; pixels
            not covered by any listed segment stay 0.
        """
        kitti_thing_labels = [11, 13]
        semantic_seg = np.zeros(panoptic_seg.shape)
        for seg in segments_info:
            if seg['isthing'] == True:  # noqa: E712 - exact comparison kept
                if self.kitti_step:
                    label = kitti_thing_labels[seg["category_id"]]
                else:
                    label = seg["category_id"] + 11
            elif self.kitti_step:
                # zero-based stuff label, then bumped once for every thing
                # label it has reached
                label = seg["category_id"] - 1
                shift = 0
                for thing_label in kitti_thing_labels:
                    if label + shift >= thing_label:
                        shift += 1
                label += shift
            else:
                # for stuff (0 .. n-1)
                label = seg["category_id"] - 1
            semantic_seg[panoptic_seg == seg["id"]] = label
        return semantic_seg

================================================
FILE: knet/video/mask_hungarian_assigner.py
================================================
import numpy as np
import torch
from mmdet.core import AssignResult, BaseAssigner, reduce_mean
from mmdet.core.bbox.builder import BBOX_ASSIGNERS
from mmdet.core.bbox.match_costs.builder import MATCH_COST, build_match_cost

try:
    from scipy.optimize import linear_sum_assignment
except ImportError:
    linear_sum_assignment = None


@MATCH_COST.register_module()
class DiceCost(object):
    """Dice-based matching cost between predicted and ground-truth masks.

    Args:
        weight (int | float, optional): Factor the cost is multiplied by.
            Defaults to 1.
        pred_act (bool): Whether to activate the prediction before
            calculating the cost. Defaults to False.
        act_mode (str): With ``pred_act`` enabled, ``'sigmoid'`` applies a
            sigmoid and any other value applies a softmax over dim 0.
            Defaults to ``'sigmoid'``.
        eps (float): Constant added to each squared-sum term of the dice
            denominator. Defaults to 1e-3.
    """

    def __init__(self,
                 weight=1.,
                 pred_act=False,
                 act_mode='sigmoid',
                 eps=1e-3):
        self.weight = weight
        self.pred_act = pred_act
        self.act_mode = act_mode
        self.eps = eps

    # The first parameter was already named ``cls`` although the method was
    # a plain instance method; as a classmethod it can be called both on
    # instances (as ``__call__`` does) and on the class itself.
    @classmethod
    def dice_loss(cls, input, target, eps=1e-3):
        """Pairwise negative dice coefficient.

        Args:
            input (Tensor): Predictions; everything after dim 0 is
                flattened.
            target (Tensor): Targets; everything after dim 0 is flattened
                and cast to float.
            eps (float): Constant added to each squared-sum term.
                Defaults to 1e-3.

        Returns:
            Tensor: Shape ``(input.size(0), target.size(0))`` holding
            ``-2 * <input_i, target_j> / (|input_i|^2 + |target_j|^2)``.
        """
        input = input.reshape(input.size()[0], -1)
        target = target.reshape(target.size()[0], -1).float()
        # einsum saves 10x memory compared to
        # torch.sum(input[:, None] * target[None, ...], -1)
        a = torch.einsum('nh,mh->nm', input, target)
        b = torch.sum(input * input, 1) + eps
        c = torch.sum(target * target, 1) + eps
        d = (2 * a) / (b[:, None] + c[None, ...])
        # the constant 1 of "1 - dice" does not affect the matching, so it
        # is omitted
        return -d

    def __call__(self, mask_preds, gt_masks):
        """Compute the weighted dice cost for every prediction/target pair.

        Args:
            mask_preds (Tensor): Predicted masks, first dim indexes the
                predictions.
            gt_masks (Tensor): Ground-truth masks, first dim indexes the
                targets.

        Returns:
            Tensor: Dice cost scaled by ``weight``, one row per prediction
            and one column per target.
        """
        if self.pred_act and self.act_mode == 'sigmoid':
            mask_preds = mask_preds.sigmoid()
        elif self.pred_act:
            mask_preds = mask_preds.softmax(dim=0)
        dice_cost = self.dice_loss(mask_preds, gt_masks, self.eps)
        return dice_cost * self.weight


@MATCH_COST.register_module()
class MaskCost(object):
    """Pixel-agreement matching cost between predicted and target masks.

    Args:
        weight (int | float, optional): Factor the cost is multiplied by.
            Defaults to 1.
        pred_act (bool): Whether to activate the prediction before
            calculating the cost. Defaults to False.
        act_mode (str): With ``pred_act`` enabled, ``'sigmoid'`` applies a
            sigmoid and any other value applies a softmax over dim 0.
            Defaults to ``'sigmoid'``.
    """

    def __init__(self, weight=1., pred_act=False, act_mode='sigmoid'):
        self.weight = weight
        self.pred_act = pred_act
        self.act_mode = act_mode

    def __call__(self, cls_pred, target):
        """Compute the weighted cost for every prediction/target pair.

        Args:
            cls_pred (Tensor): Predicted masks, indexed as
                ``(prediction, h, w)`` by the einsum below.
            target (Tensor): Target masks with three dims
                ``(target, H, W)``.

        Returns:
            Tensor: One row per prediction and one column per target; the
            negated sum of ``pred * target`` and
            ``(1 - pred) * (1 - target)`` divided by ``H * W`` and scaled
            by ``weight``.
        """
        if self.pred_act:
            if self.act_mode == 'sigmoid':
                cls_pred = cls_pred.sigmoid()
            else:
                cls_pred = cls_pred.softmax(dim=0)

        _, height, width = target.shape
        # einsum avoids materialising flattened copies for a matmul
        fg_agreement = torch.einsum('nhw,mhw->nm', cls_pred, target)
        bg_agreement = torch.einsum('nhw,mhw->nm', 1 - cls_pred, 1 - target)
        cls_cost = -(fg_agreement + bg_agreement) / (height * width)
        return cls_cost * self.weight


@BBOX_ASSIGNERS.register_module()
class MaskHungarianAssigner(BaseAssigner):
    """Computes one-to-one matching between predictions and ground truth.

    The matching cost is the sum of a classification cost, a mask cost, a
    dice cost and an optional boundary cost. The targets don't include the
    no_object, so generally there are more predictions than targets. After
    the matching, the un-matched predictions are treated as backgrounds.
    Thus each prediction is assigned `0` or a positive integer indicating
    the ground truth index:

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        cls_cost (dict): Config of the classification cost, built with
            ``build_match_cost``.
        mask_cost (dict): Config of the mask cost.
        dice_cost (dict): Config of the dice cost.
        boundary_cost (dict | None): Config of the boundary cost; no
            boundary cost is used when None. Defaults to None.
        topk (int): Number of matching rounds. Defaults to 1.
    """

    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 mask_cost=dict(type='SigmoidCost', weight=1.0),
                 dice_cost=dict(),
                 boundary_cost=None,
                 topk=1):
        self.cls_cost = build_match_cost(cls_cost)
        self.mask_cost = build_match_cost(mask_cost)
        self.dice_cost = build_match_cost(dice_cost)
        self.boundary_cost = (
            None if boundary_cost is None
            else build_match_cost(boundary_cost))
        self.topk = topk

    def assign(self,
               bbox_pred,
               cls_pred,
               gt_bboxes,
               gt_labels,
               img_meta=None,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Computes the matching based on the summed costs.

        Every prediction is assigned to a ground truth or to background. In
        `assigned_gt_inds`, -1 means don't care, 0 means negative sample and
        a positive number is the index (1-based) of the assigned gt.

        1. assign every prediction to -1
        2. compute the costs
        3. do Hungarian matching on CPU based on the costs; with
           ``topk > 1`` the matching is repeated ``topk`` times and the rows
           matched in a round are set to 1e10 before the next round
        4. assign all to 0 (background) first, then give every matched
           prediction the index of its gt plus 1 and the gt's label

        Args:
            bbox_pred (Tensor): Predictions handed to the mask, dice and
                boundary costs; dim 0 indexes the predictions (presumably
                mask predictions despite the name - TODO confirm).
            cls_pred (Tensor | None): Classification predictions handed to
                the classification cost; skipped when None.
            gt_bboxes (Tensor): Targets handed to the mask, dice and
                boundary costs; dim 0 indexes the ground truths.
            gt_labels (Tensor): Label of every ground truth.
            img_meta (dict): Not used by this method.
            gt_bboxes_ignore (Tensor, optional): Must be None.
            eps (int | float, optional): Not used by this method.

        Returns:
            :obj:`AssignResult`: The assigned result.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts = gt_bboxes.size(0)
        num_preds = bbox_pred.size(0)

        # 1. everything starts as "don't care"
        gt_inds = bbox_pred.new_full((num_preds, ), -1, dtype=torch.long)
        labels = bbox_pred.new_full((num_preds, ), -1, dtype=torch.long)
        if num_gts == 0 or num_preds == 0:
            if num_gts == 0:
                # no ground truth: every prediction is background
                gt_inds[:] = 0
            return AssignResult(num_gts, gt_inds, None, labels=labels)

        # 2. individual costs; a disabled term contributes a plain 0
        cls_cost = 0
        if self.cls_cost.weight != 0 and cls_pred is not None:
            cls_cost = self.cls_cost(cls_pred, gt_labels)
        reg_cost = 0
        if self.mask_cost.weight != 0:
            reg_cost = self.mask_cost(bbox_pred, gt_bboxes)
        dice_cost = 0
        if self.dice_cost.weight != 0:
            dice_cost = self.dice_cost(bbox_pred, gt_bboxes)
        b_cost = 0
        if self.boundary_cost is not None and self.boundary_cost.weight != 0:
            b_cost = self.boundary_cost(bbox_pred, gt_bboxes)
        cost = cls_cost + reg_cost + dice_cost + b_cost

        # 3. Hungarian matching on CPU
        cost = cost.detach().cpu()
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        if self.topk == 1:
            rows, cols = linear_sum_assignment(cost)
        else:
            row_rounds = []
            col_rounds = []
            for _ in range(self.topk):
                rows, cols = linear_sum_assignment(cost)
                row_rounds.append(rows)
                col_rounds.append(cols)
                # make already matched predictions expensive for later rounds
                cost[rows] = 1e10
            rows = np.concatenate(row_rounds)
            cols = np.concatenate(col_rounds)

        row_idx = torch.from_numpy(rows).to(bbox_pred.device)
        col_idx = torch.from_numpy(cols).to(bbox_pred.device)

        # 4. background everywhere, then foreground for the matched pairs
        gt_inds[:] = 0
        gt_inds[row_idx] = col_idx + 1
        labels[row_idx] = gt_labels[col_idx]
        return AssignResult(num_gts, gt_inds, None, labels=labels)


@BBOX_ASSIGNERS.register_module()
class MaskHungarianAssignerWithEmbed(BaseAssigner):
    """Computes one-to-one matching between predictions and ground truth.

    The matching cost is the sum of a classification cost, a mask cost, a
    dice cost and an optional boundary cost. The targets don't include the
    no_object, so generally there are more predictions than targets. After
    the matching, the un-matched predictions are treated as backgrounds.
    Thus each prediction is assigned `0` or a positive integer indicating
    the ground truth index:

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    ``assign`` additionally accepts an ``embed_pred`` argument, which the
    current implementation does not use.

    Args:
        cls_cost (dict): Config of the classification cost, built with
            ``build_match_cost``.
        mask_cost (dict): Config of the mask cost.
        dice_cost (dict): Config of the dice cost.
        boundary_cost (dict | None): Config of the boundary cost; no
            boundary cost is used when None. Defaults to None.
        topk (int): Number of matching rounds. Defaults to 1.
    """

    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 mask_cost=dict(type='SigmoidCost', weight=1.0),
                 dice_cost=dict(),
                 boundary_cost=None,
                 topk=1):
        self.cls_cost = build_match_cost(cls_cost)
        self.mask_cost = build_match_cost(mask_cost)
        self.dice_cost = build_match_cost(dice_cost)
        self.boundary_cost = (
            None if boundary_cost is None
            else build_match_cost(boundary_cost))
        self.topk = topk

    def assign(self,
               bbox_pred,
               cls_pred,
               gt_bboxes,
               gt_labels,
               embed_pred=None,
               img_meta=None,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Computes the matching based on the summed costs.

        Every prediction is assigned to a ground truth or to background. In
        `assigned_gt_inds`, -1 means don't care, 0 means negative sample and
        a positive number is the index (1-based) of the assigned gt.

        1. assign every prediction to -1
        2. compute the costs
        3. do Hungarian matching on CPU based on the costs; with
           ``topk > 1`` the matching is repeated ``topk`` times and the rows
           matched in a round are set to 1e10 before the next round
        4. assign all to 0 (background) first, then give every matched
           prediction the index of its gt plus 1 and the gt's label

        Args:
            bbox_pred (Tensor): Predictions handed to the mask, dice and
                boundary costs; dim 0 indexes the predictions (presumably
                mask predictions despite the name - TODO confirm).
            cls_pred (Tensor | None): Classification predictions handed to
                the classification cost; skipped when None.
            gt_bboxes (Tensor): Targets handed to the mask, dice and
                boundary costs; dim 0 indexes the ground truths.
            gt_labels (Tensor): Label of every ground truth.
            embed_pred: Not used by this method.
            img_meta (dict): Not used by this method.
            gt_bboxes_ignore (Tensor, optional): Must be None.
            eps (int | float, optional): Not used by this method.

        Returns:
            :obj:`AssignResult`: The assigned result.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts = gt_bboxes.size(0)
        num_preds = bbox_pred.size(0)

        # 1. everything starts as "don't care"
        gt_inds = bbox_pred.new_full((num_preds, ), -1, dtype=torch.long)
        labels = bbox_pred.new_full((num_preds, ), -1, dtype=torch.long)
        if num_gts == 0 or num_preds == 0:
            if num_gts == 0:
                # no ground truth: every prediction is background
                gt_inds[:] = 0
            return AssignResult(num_gts, gt_inds, None, labels=labels)

        # 2. individual costs; a disabled term contributes a plain 0
        cls_cost = 0
        if self.cls_cost.weight != 0 and cls_pred is not None:
            cls_cost = self.cls_cost(cls_pred, gt_labels)
        reg_cost = 0
        if self.mask_cost.weight != 0:
            reg_cost = self.mask_cost(bbox_pred, gt_bboxes)
        dice_cost = 0
        if self.dice_cost.weight != 0:
            dice_cost = self.dice_cost(bbox_pred, gt_bboxes)
        b_cost = 0
        if self.boundary_cost is not None and self.boundary_cost.weight != 0:
            b_cost = self.boundary_cost(bbox_pred, gt_bboxes)
        cost = cls_cost + reg_cost + dice_cost + b_cost

        # 3. Hungarian matching on CPU
        cost = cost.detach().cpu()
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        if self.topk == 1:
            rows, cols = linear_sum_assignment(cost)
        else:
            row_rounds = []
            col_rounds = []
            for _ in range(self.topk):
                rows, cols = linear_sum_assignment(cost)
                row_rounds.append(rows)
                col_rounds.append(cols)
                # make already matched predictions expensive for later rounds
                cost[rows] = 1e10
            rows = np.concatenate(row_rounds)
            cols = np.concatenate(col_rounds)

        row_idx = torch.from_numpy(rows).to(bbox_pred.device)
        col_idx = torch.from_numpy(cols).to(bbox_pred.device)

        # 4. background everywhere, then foreground for the matched pairs
        gt_inds[:] = 0
        gt_inds[row_idx] = col_idx + 1
        labels[row_idx] = gt_labels[col_idx]
        return AssignResult(num_gts, gt_inds, None, labels=labels)


================================================
FILE: knet/video/mask_pseudo_sampler.py
================================================
import torch

from mmdet.core.bbox import BaseSampler, SamplingResult
from mmdet.core.bbox.builder import BBOX_SAMPLERS


class MaskSamplingResult(SamplingResult):
    """Sampling result that stores masks instead of boxes.

    Args:
        pos_inds (Tensor): Indices of the positive samples.
        neg_inds (Tensor): Indices of the negative samples.
        masks (Tensor): All candidate masks; indexed with ``pos_inds`` and
            ``neg_inds``.
        gt_masks (Tensor): Ground-truth masks; ``gt_masks.shape[0]`` is
            stored as ``num_gts``.
        assign_result: Object exposing ``gt_inds`` (1-based gt index per
            candidate) and ``labels`` (may be None).
        gt_flags (Tensor): Indexed with ``pos_inds`` to fill ``pos_is_gt``.
    """

    def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result,
                 gt_flags):
        self.pos_inds = pos_inds
        self.neg_inds = neg_inds
        self.pos_masks = masks[pos_inds]
        self.neg_masks = masks[neg_inds]
        self.pos_is_gt = gt_flags[pos_inds]

        self.num_gts = gt_masks.shape[0]
        # assign_result stores 1-based gt indices; convert to 0-based
        self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1

        if gt_masks.numel() != 0:
            self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :]
        else:
            # indexing an empty gt tensor would fail, so return an empty
            # tensor directly; there must be no positives in this case
            assert self.pos_assigned_gt_inds.numel() == 0
            self.pos_gt_masks = torch.empty_like(gt_masks)

        assigned_labels = assign_result.labels
        self.pos_gt_labels = (
            None if assigned_labels is None else assigned_labels[pos_inds])

    @property
    def masks(self):
        """torch.Tensor: positive masks followed by negative masks"""
        return torch.cat([self.pos_masks, self.neg_masks])

    def __nice__(self):
        """Render ``info`` as text, showing the mask tensors by shape only."""
        summary = dict(self.info)
        for key in ('pos_masks', 'neg_masks'):
            summary[key] = summary.pop(key).shape
        rendered = [
            f"'{key}': {value!r}" for key, value in sorted(summary.items())
        ]
        return '{\n' + '    ' + ',\n    '.join(rendered) + '\n}'

    @property
    def info(self):
        """Returns a dictionary of info about the object."""
        exported = ('pos_inds', 'neg_inds', 'pos_masks', 'neg_masks',
                    'pos_is_gt', 'num_gts', 'pos_assigned_gt_inds')
        return {name: getattr(self, name) for name in exported}

================================================
FILE: knet/video/qdtrack/builder.py
================================================
from mmcv.utils import Registry
from mmcv.cnn import build_model_from_cfg as build

TRACKERS = Registry('tracker')


def build_tracker(cfg):
    """Build a tracker from ``cfg`` using the ``TRACKERS`` registry.

    Args:
        cfg: Tracker config handed to mmcv's ``build_model_from_cfg``.

    Returns:
        The object created by ``build_model_from_cfg``.
    """
    tracker = build(cfg, TRACKERS)
    return tracker


================================================
FILE: knet/video/qdtrack/losses/__init__.py
================================================
from .l2_loss import L2Loss
from .multipos_cross_entropy_loss import MultiPosCrossEntropyLoss

__all__ = ['L2Loss', 'MultiPosCrossEntropyLoss']

================================================
FILE: knet/video/qdtrack/losses/l2_loss.py
================================================
import numpy as np
import torch
import torch.nn as nn
from mmdet.models import LOSSES, weighted_loss


@weighted_loss
def l2_loss(pred, target):
    """Element-wise squared error.

    Args:
        pred (torch.Tensor): The prediction.
        target (torch.Tensor): The learning target of the prediction; must
            be non-empty and have the same size as ``pred``.

    Returns:
        torch.Tensor: ``|pred - target| ** 2`` for every element.
    """
    assert pred.size() == target.size() and target.numel() > 0
    abs_diff = torch.abs(pred - target)
    return abs_diff**2


@LOSSES.register_module()
class L2Loss(nn.Module):
    """L2 loss with optional margins and negative sub-sampling.

    Args:
        neg_pos_ub (int, optional): When positive, upper bound on the ratio
            of negatives to positives; surplus negatives get weight 0.
            Defaults to -1 (disabled).
        pos_margin (float, optional): When positive, subtracted from the
            predictions at positive targets. Defaults to -1 (disabled).
        neg_margin (float, optional): When positive, subtracted from the
            predictions at negative targets. Defaults to -1 (disabled).
        hard_mining (bool, optional): Keep the negatives with the largest
            loss instead of a random subset. Defaults to False.
        reduction (str, optional): The method to reduce the loss.
            Options are "none", "mean" and "sum".
        loss_weight (float, optional): The weight of loss.
    """

    def __init__(self,
                 neg_pos_ub=-1,
                 pos_margin=-1,
                 neg_margin=-1,
                 hard_mining=False,
                 reduction='mean',
                 loss_weight=1.0):
        super(L2Loss, self).__init__()
        self.neg_pos_ub = neg_pos_ub
        self.pos_margin = pos_margin
        self.neg_margin = neg_margin
        self.hard_mining = hard_mining
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): The learning target of the prediction.
            weight (torch.Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Replaced by the value computed in
                ``update_weight``. Defaults to None.
            reduction_override (str, optional): The reduction method used to
                override the original reduction method of the loss.
                Defaults to None.

        Returns:
            torch.Tensor: The loss scaled by ``loss_weight``.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override or self.reduction
        pred, weight, avg_factor = self.update_weight(
            pred, target, weight, avg_factor)
        raw_loss = l2_loss(
            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
        return self.loss_weight * raw_loss

    def update_weight(self, pred, target, weight, avg_factor):
        """Apply margins, clamp predictions and sub-sample negatives.

        Note that ``target``, ``pred`` and ``weight`` are modified in place
        where masks are written.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): Targets; 1 marks positives, 0 negatives.
            weight (torch.Tensor | None): Per-element weights; all ones
                when None.
            avg_factor: Ignored; recomputed from ``weight``.

        Returns:
            tuple: Clamped ``pred``, updated ``weight`` and the number of
            elements with positive weight.
        """
        if weight is None:
            weight = target.new_ones(target.size())
        # elements without weight are neither positive nor negative
        target[weight <= 0] = -1
        pos_mask = target == 1
        neg_mask = target == 0

        if self.pos_margin > 0:
            pred[pos_mask] -= self.pos_margin
        if self.neg_margin > 0:
            pred[neg_mask] -= self.neg_margin
        pred = torch.clamp(pred, min=0, max=1)

        pos_count = int(pos_mask.sum())
        neg_count = int(neg_mask.sum())
        too_many_negatives = (
            self.neg_pos_ub > 0
            and neg_count / (pos_count + 1) > self.neg_pos_ub)
        if too_many_negatives:
            keep_count = pos_count * self.neg_pos_ub
            neg_coords = torch.nonzero(target == 0, as_tuple=False)

            if self.hard_mining:
                # keep the negatives with the highest element-wise loss
                elem_loss = l2_loss(pred, target, reduction='none')
                neg_costs = elem_loss[neg_coords[:, 0],
                                      neg_coords[:, 1]].detach()
                neg_coords = neg_coords[neg_costs.topk(keep_count)[1], :]
            else:
                neg_coords = self.random_choice(neg_coords, keep_count)

            kept_negs = neg_mask.new_zeros(neg_mask.size()).bool()
            kept_negs[neg_coords[:, 0], neg_coords[:, 1]] = True

            # negatives that were not kept no longer contribute
            dropped_negs = torch.logical_xor(neg_mask, kept_negs)
            weight[dropped_negs] = 0

        avg_factor = (weight > 0).sum()
        return pred, weight, avg_factor

    @staticmethod
    def random_choice(gallery, num):
        """Random select some elements from the gallery.

        The indices are shuffled with numpy; for a non-numpy gallery they
        are converted to a long tensor on the gallery's device.
        """
        assert len(gallery) >= num
        if isinstance(gallery, list):
            gallery = np.array(gallery)
        order = np.arange(len(gallery))
        np.random.shuffle(order)
        picked = order[:num]
        if not isinstance(gallery, np.ndarray):
            picked = torch.from_numpy(picked).long().to(gallery.device)
        return gallery[picked]


================================================
FILE: knet/video/qdtrack/losses/multipos_cross_entropy_loss.py
================================================
import torch
import torch.nn as nn
from mmdet.models import LOSSES, weight_reduce_loss


def multi_pos_cross_entropy(pred,
                            label,
                            weight=None,
                            reduction='mean',
                            avg_factor=None):
    """Cross-entropy style loss for rows that may hold several positives.

    For every row the value computed is::

        log(1 + sum_{i in pos} sum_{j in neg} exp(pred[j] - pred[i]))

    where ``pos`` / ``neg`` are the columns with ``label == 1`` /
    ``label == 0``. The sum is evaluated with ``torch.logsumexp`` over all
    (positive, negative) column pairs instead of multiplying two separately
    exponentiated sums.

    Args:
        pred: 2-D score tensor; the pairing is done along dim 1.
        label: Tensor of the same shape as ``pred``. Only the values 1 and 0
            are masked; entries with any other value are not excluded from
            the pairing, so the input is expected to be binary.
        weight: Optional per-row weight, cast to float before use.
        reduction: Reduction mode forwarded to ``weight_reduce_loss``.
        avg_factor: Averaging factor forwarded to ``weight_reduce_loss``.

    Returns:
        The loss as returned by ``weight_reduce_loss``.
    """
    is_pos = (label == 1)
    is_neg = (label == 0)
    num_cols = pred.shape[1]

    pos_scores = pred * is_pos.float()
    neg_scores = pred * is_neg.float()
    # Knock out the entries that must not take part in a pair: a "positive"
    # slot that is really a negative becomes +inf and a "negative" slot that
    # is really a positive becomes -inf, so every invalid difference below
    # evaluates to -inf and contributes exp(-inf) = 0.
    pos_scores[is_neg] = pos_scores[is_neg] + float('inf')
    neg_scores[is_pos] = neg_scores[is_pos] + float('-inf')

    # Lay out all column pairs: entry k holds neg[k % C] - pos[k // C].
    pos_tiled = torch.repeat_interleave(pos_scores, num_cols, dim=1)
    neg_tiled = neg_scores.repeat(1, num_cols)
    pair_diffs = neg_tiled - pos_tiled

    # The appended zero supplies the "1 +" term inside the logarithm.
    padded = torch.nn.functional.pad(pair_diffs, (0, 1), "constant", 0)
    loss = torch.logsumexp(padded, dim=1)

    # apply weights and do the reduction
    if weight is not None:
        weight = weight.float()
    return weight_reduce_loss(
        loss, weight=weight, reduction=reduction, avg_factor=avg_factor)


@LOSSES.register_module()
class MultiPosCrossEntropyLoss(nn.Module):
    """Module wrapper around :func:`multi_pos_cross_entropy`.

    Args:
        reduction: Default reduction mode used by ``forward``.
        loss_weight: Scalar the computed loss is multiplied with.
    """

    def __init__(self, reduction='mean', loss_weight=1.0):
        super().__init__()
        self.loss_weight = loss_weight
        self.reduction = reduction

    def forward(self,
                cls_score,
                label,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        """Compute the weighted multi-positive cross-entropy loss.

        Args:
            cls_score: Score tensor; must have the same size as ``label``.
            label: Target tensor.
            weight: Optional weight forwarded to the loss function.
            avg_factor: Optional averaging factor forwarded to the loss
                function.
            reduction_override: One of ``None``, ``'none'``, ``'mean'`` or
                ``'sum'``; a truthy value replaces ``self.reduction``.
            **kwargs: Extra keyword arguments forwarded unchanged.

        Returns:
            ``self.loss_weight`` times the loss function's result.
        """
        assert cls_score.size() == label.size()
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override or self.reduction
        raw_loss = multi_pos_cross_entropy(
            cls_score,
            label,
            weight,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return self.loss_weight * raw_loss


================================================
FILE: knet/video/qdtrack/track/__init__.py
================================================
from .similarity import cal_similarity
from .transforms import track2result, restore_result

__all__ = ['cal_similarity', 'track2result', 'restore_result']


================================================
FILE: knet/video/qdtrack/track/similarity.py
================================================
import torch
import torch.nn.functional as F


def cal_similarity(key_embeds,
                   ref_embeds,
                   method='dot_product',
                   temperature=-1):
    """Compute the pairwise similarity matrix between two embedding sets.

    Args:
        key_embeds: 2-D tensor with one embedding per row.
        ref_embeds: 2-D tensor with one embedding per row and the same
            number of columns as ``key_embeds``.
        method: ``'dot_product'`` or ``'cosine'``.
        temperature: Only used with ``'dot_product'``. When positive, the
            embeddings are L2-normalised first and the resulting cosine
            similarities are divided by this value; otherwise the plain dot
            product is returned.

    Returns:
        Tensor of shape ``(key_embeds.size(0), ref_embeds.size(0))``. If
        either input has no rows, an all-zero tensor of that shape is
        returned with the dtype and device of ``key_embeds``.
    """
    assert method in ['dot_product', 'cosine']

    if key_embeds.size(0) == 0 or ref_embeds.size(0) == 0:
        # ``new_zeros`` keeps dtype *and* device of the inputs, so the empty
        # result is type-consistent with the non-empty one (e.g. under fp16).
        return key_embeds.new_zeros(
            (key_embeds.size(0), ref_embeds.size(0)))

    use_temperature = method == 'dot_product' and temperature > 0
    if method == 'cosine' or use_temperature:
        key_embeds = F.normalize(key_embeds, p=2, dim=1)
        ref_embeds = F.normalize(ref_embeds, p=2, dim=1)

    sims = torch.mm(key_embeds, ref_embeds.t())
    if use_temperature:
        sims = sims / temperature
    return sims


================================================
FILE: knet/video/qdtrack/track/transforms.py
================================================
import numpy as np
import torch


def track2result(bboxes, labels, ids, num_classes):
    """Group tracked boxes into one numpy array per class.

    Entries whose id is not greater than -1 are discarded first.

    Args:
        bboxes: 2-D tensor or array with one box per row.
        labels: Class index of every box.
        ids: Track id of every box.
        num_classes: Number of per-class arrays to produce.

    Returns:
        A list of ``num_classes`` arrays. Array ``c`` stacks, for every kept
        box with label ``c``, its id followed by its box row. When nothing is
        kept, every entry is an empty ``(0, 6)`` float32 array.
    """
    keep = ids > -1
    bboxes, labels, ids = bboxes[keep], labels[keep], ids[keep]

    if bboxes.shape[0] == 0:
        return [np.zeros((0, 6), dtype=np.float32) for _ in range(num_classes)]

    if isinstance(bboxes, torch.Tensor):
        bboxes = bboxes.cpu().numpy()
        labels = labels.cpu().numpy()
        ids = ids.cpu().numpy()

    per_class = []
    for cls_idx in range(num_classes):
        in_class = labels == cls_idx
        # Prepend the id as an extra leading column.
        per_class.append(
            np.concatenate((ids[in_class, None], bboxes[in_class, :]),
                           axis=1))
    return per_class


def restore_result(result, return_ids=False):
    """Flatten a per-class result list back into stacked arrays.

    Args:
        result: List with one 2-D array per class; the list position is the
            class label.
        return_ids: If True, the first column of every row is split off and
            returned separately as int64 ids.

    Returns:
        ``(bboxes, labels)`` or, with ``return_ids``, ``(bboxes, labels,
        ids)``. ``bboxes`` is float32 and ``labels`` is int64.
    """
    rows_per_class = [bbox.shape[0] for bbox in result]
    stacked = np.concatenate(result, axis=0).astype(np.float32)
    # Label k is repeated once for every row contributed by class k.
    labels = np.repeat(
        np.arange(len(result), dtype=np.int64),
        np.asarray(rows_per_class, dtype=np.int64))

    if not return_ids:
        return stacked, labels

    ids = stacked[:, 0].astype(np.int64)
    return stacked[:, 1:], labels, ids


================================================
FILE: knet/video/qdtrack/trackers/__init__.py
================================================
from .quasi_dense_embed_tracker import QuasiDenseEmbedTracker
from .tao_tracker import TaoTracker

__all__ = ['QuasiDenseEmbedTracker', 'TaoTracker']

================================================
FILE: knet/video/qdtrack/trackers/quasi_dense_embed_tracker.py
================================================
import torch
import torch.nn.functional as F
from mmdet.core import bbox_overlaps

from ..builder import TRACKERS


@TRACKERS.register_module()
class QuasiDenseEmbedTracker(object):
    """Embedding-based tracker keeping a tracklet memory and a backdrop memory.

    Detections are associated with remembered tracklets through the
    similarity of their embeddings. Detections that end up with id ``-1``
    (unmatched and not promoted to a new track) are remembered for a few
    frames as "backdrops"; they take part in the similarity computation but
    never hand out an id.

    Throughout this class the last column of ``bboxes`` is treated as the
    detection score and the remaining columns are passed to
    ``bbox_overlaps`` as box coordinates.

    Args:
        init_score_thr: Score an unmatched detection must exceed to start a
            new tracklet.
        obj_score_thr: Score a detection must exceed to take over the id of
            the tracklet it matched; also selects which IoU threshold is
            used for duplicate removal in ``match``.
        match_score_thr: Similarity a match must exceed to be considered.
        memo_tracklet_frames: A tracklet is dropped once it has not been
            updated for this many frames.
        memo_backdrop_frames: Number of frames of backdrops that are kept.
        memo_momentum: Weight of the new embedding when a tracklet's
            embedding is updated; must lie in ``[0, 1]``.
        nms_conf_thr: Similarity above which a low-score detection matching
            an existing tracklet is marked with id ``-2``.
        nms_backdrop_iou_thr: IoU threshold for duplicate removal of
            low-score detections and of backdrops.
        nms_class_iou_thr: IoU threshold for duplicate removal of the other
            detections.
        with_cats: If True, similarities between different labels are
            zeroed.
        match_metric: ``'bisoftmax'``, ``'softmax'`` or ``'cosine'``.
    """

    def __init__(self,
                 init_score_thr=0.8,
                 obj_score_thr=0.5,
                 match_score_thr=0.5,
                 memo_tracklet_frames=10,
                 memo_backdrop_frames=1,
                 memo_momentum=0.8,
                 nms_conf_thr=0.5,
                 nms_backdrop_iou_thr=0.3,
                 nms_class_iou_thr=0.7,
                 with_cats=True,
                 match_metric='bisoftmax'):
        assert 0 <= memo_momentum <= 1.0
        assert memo_tracklet_frames >= 0
        assert memo_backdrop_frames >= 0
        self.init_score_thr = init_score_thr
        self.obj_score_thr = obj_score_thr
        self.match_score_thr = match_score_thr
        self.memo_tracklet_frames = memo_tracklet_frames
        self.memo_backdrop_frames = memo_backdrop_frames
        self.memo_momentum = memo_momentum
        self.nms_conf_thr = nms_conf_thr
        self.nms_backdrop_iou_thr = nms_backdrop_iou_thr
        self.nms_class_iou_thr = nms_class_iou_thr
        self.with_cats = with_cats
        assert match_metric in ['bisoftmax', 'softmax', 'cosine']
        self.match_metric = match_metric

        # Next id to hand out; kept as a plain Python int.
        self.num_tracklets = 0
        # id -> dict(bbox, embed, label, last_frame, velocity, acc_frame)
        self.tracklets = dict()
        # Most recent frame first; each entry is dict(bboxes, embeds, labels).
        self.backdrops = []

    @property
    def empty(self):
        """bool: True when no tracklet is currently remembered."""
        return not self.tracklets

    def update_memo(self, ids, bboxes, embeds, labels, frame_id):
        """Write the current frame's detections into the memory.

        Args:
            ids: Id per detection. Ids greater than -1 update or create
                tracklets, id -1 marks backdrop candidates, any other value
                (``match`` uses -2) is ignored.
            bboxes: Boxes of the detections, score in the last column.
            embeds: Embedding of every detection.
            labels: Label of every detection.
            frame_id: Index of the current frame.
        """
        tracklet_inds = ids > -1

        # update memo
        for track_id, bbox, embed, label in zip(ids[tracklet_inds],
                                                bboxes[tracklet_inds],
                                                embeds[tracklet_inds],
                                                labels[tracklet_inds]):
            track_id = int(track_id)
            if track_id in self.tracklets:
                tracklet = self.tracklets[track_id]
                # Box displacement per frame since the last observation.
                velocity = (bbox - tracklet['bbox']) / (
                    frame_id - tracklet['last_frame'])
                tracklet['bbox'] = bbox
                # Exponential moving average of the embedding.
                tracklet['embed'] = (
                    1 - self.memo_momentum
                ) * tracklet['embed'] + self.memo_momentum * embed
                tracklet['last_frame'] = frame_id
                tracklet['label'] = label
                # Running mean of the velocity over all updates so far.
                tracklet['velocity'] = (
                    tracklet['velocity'] * tracklet['acc_frame'] +
                    velocity) / (tracklet['acc_frame'] + 1)
                tracklet['acc_frame'] += 1
            else:
                self.tracklets[track_id] = dict(
                    bbox=bbox,
                    embed=embed,
                    label=label,
                    last_frame=frame_id,
                    velocity=torch.zeros_like(bbox),
                    acc_frame=0)

        # A backdrop candidate is dropped when it overlaps any detection
        # listed before it by more than ``nms_backdrop_iou_thr``.
        backdrop_inds = torch.nonzero(ids == -1, as_tuple=False).squeeze(1)
        ious = bbox_overlaps(bboxes[backdrop_inds, :-1], bboxes[:, :-1])
        for i, ind in enumerate(backdrop_inds):
            if (ious[i, :ind] > self.nms_backdrop_iou_thr).any():
                backdrop_inds[i] = -1
        backdrop_inds = backdrop_inds[backdrop_inds > -1]

        self.backdrops.insert(
            0,
            dict(
                bboxes=bboxes[backdrop_inds],
                embeds=embeds[backdrop_inds],
                labels=labels[backdrop_inds]))

        # pop memo
        invalid_ids = []
        for k, v in self.tracklets.items():
            if frame_id - v['last_frame'] >= self.memo_tracklet_frames:
                invalid_ids.append(k)
        for invalid_id in invalid_ids:
            self.tracklets.pop(invalid_id)

        if len(self.backdrops) > self.memo_backdrop_frames:
            self.backdrops.pop()

    @property
    def memo(self):
        """Stack the tracklet and backdrop memory into flat tensors.

        Returns:
            tuple: ``(memo_bboxes, memo_labels, memo_embeds, memo_ids,
            memo_vs)``. Tracklets come first, followed by the backdrops,
            which carry id -1 and zero velocity.
        """
        memo_embeds = []
        memo_ids = []
        memo_bboxes = []
        memo_labels = []
        memo_vs = []
        for k, v in self.tracklets.items():
            memo_bboxes.append(v['bbox'][None, :])
            memo_embeds.append(v['embed'][None, :])
            memo_ids.append(k)
            memo_labels.append(v['label'].view(1, 1))
            memo_vs.append(v['velocity'][None, :])
        memo_ids = torch.tensor(memo_ids, dtype=torch.long).view(1, -1)

        for backdrop in self.backdrops:
            backdrop_ids = torch.full((1, backdrop['embeds'].size(0)),
                                      -1,
                                      dtype=torch.long)
            backdrop_vs = torch.zeros_like(backdrop['bboxes'])
            memo_bboxes.append(backdrop['bboxes'])
            memo_embeds.append(backdrop['embeds'])
            memo_ids = torch.cat([memo_ids, backdrop_ids], dim=1)
            memo_labels.append(backdrop['labels'][:, None])
            memo_vs.append(backdrop_vs)

        memo_bboxes = torch.cat(memo_bboxes, dim=0)
        memo_embeds = torch.cat(memo_embeds, dim=0)
        memo_labels = torch.cat(memo_labels, dim=0).squeeze(1)
        memo_vs = torch.cat(memo_vs, dim=0)
        return memo_bboxes, memo_labels, memo_embeds, memo_ids.squeeze(
            0), memo_vs

    def match(self, bboxes, labels, track_feats, frame_id, asso_tau=-1):
        """Assign track ids to the detections of one frame.

        Args:
            bboxes: Detected boxes, score in the last column.
            labels: Label of every detection.
            track_feats: Embedding of every detection.
            frame_id: Index of the current frame.
            asso_tau: Accepted for interface compatibility; not used here.

        Returns:
            tuple: ``(bboxes, labels, ids)`` for the detections that survive
            duplicate removal, sorted by descending score. ``ids`` is a CPU
            long tensor: a value >= 0 is a track id, -1 means "no track",
            -2 marks a low-score detection that matched an existing
            tracklet with similarity above ``nms_conf_thr``.
        """
        _, inds = bboxes[:, -1].sort(descending=True)
        bboxes = bboxes[inds, :]
        labels = labels[inds]
        embeds = track_feats[inds, :]

        # Duplicate removal across classes: a detection is discarded when it
        # overlaps any higher-scored detection by more than a threshold that
        # depends on its own score.
        valids = bboxes.new_ones((bboxes.size(0)))
        ious = bbox_overlaps(bboxes[:, :-1], bboxes[:, :-1])
        for i in range(1, bboxes.size(0)):
            thr = self.nms_backdrop_iou_thr if bboxes[
                i, -1] < self.obj_score_thr else self.nms_class_iou_thr
            if (ious[i, :i] > thr).any():
                valids[i] = 0
        valids = valids == 1
        bboxes = bboxes[valids, :]
        labels = labels[valids]
        embeds = embeds[valids, :]

        # init ids container
        ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long)

        # match if buffer is not empty
        if bboxes.size(0) > 0 and not self.empty:
            (memo_bboxes, memo_labels, memo_embeds, memo_ids,
             memo_vs) = self.memo

            if self.match_metric == 'bisoftmax':
                feats = torch.mm(embeds, memo_embeds.t())
                d2t_scores = feats.softmax(dim=1)
                t2d_scores = feats.softmax(dim=0)
                scores = (d2t_scores + t2d_scores) / 2
            elif self.match_metric == 'softmax':
                feats = torch.mm(embeds, memo_embeds.t())
                scores = feats.softmax(dim=1)
            elif self.match_metric == 'cosine':
                scores = torch.mm(
                    F.normalize(embeds, p=2, dim=1),
                    F.normalize(memo_embeds, p=2, dim=1).t())
            else:
                raise NotImplementedError

            if self.with_cats:
                cat_same = labels.view(-1, 1) == memo_labels.view(1, -1)
                scores *= cat_same.float().to(scores.device)

            # Greedy assignment in descending score order. Once a memory
            # entry is taken its column is zeroed for all other detections.
            for i in range(bboxes.size(0)):
                conf, memo_ind = torch.max(scores[i, :], dim=0)
                matched_id = memo_ids[memo_ind]
                if conf > self.match_score_thr:
                    if matched_id > -1:
                        if bboxes[i, -1] > self.obj_score_thr:
                            ids[i] = matched_id
                            scores[:i, memo_ind] = 0
                            scores[i + 1:, memo_ind] = 0
                        else:
                            if conf > self.nms_conf_thr:
                                ids[i] = -2
        new_inds = (ids == -1) & (bboxes[:, 4] > self.init_score_thr).cpu()
        # Convert to a Python int so ``num_tracklets`` stays an int instead
        # of silently turning into a 0-dim tensor after the first frame.
        num_news = int(new_inds.sum())
        ids[new_inds] = torch.arange(
            self.num_tracklets,
            self.num_tracklets + num_news,
            dtype=torch.long)
        self.num_tracklets += num_news

        self.update_memo(ids, bboxes, embeds, labels, frame_id)

        return bboxes, labels, ids


================================================
FILE: knet/video/qdtrack/trackers/tao_tracker.py
================================================
import os
import random
from collections import defaultdict

import cv2
import mmcv
import numpy as np
import seaborn as sns
import torch
from mmcv.image import imread, imwrite
from mmcv.visualization import color_val, imshow
from mmdet.core import bbox_overlaps

from knet.video.qdtrack.track.similarity import cal_similarity
from ..builder import TRACKERS


@TRACKERS.register_module()
class TaoTracker(object):
    """Embedding-based tracker with an optional per-frame analysis mode.

    Detections are matched against remembered tracklets by embedding
    similarity, restricted to identical labels. Throughout this class the
    last column of ``bboxes`` is treated as the detection score.

    When ``match`` receives a ``metas`` keyword whose ``analyze`` attribute
    is truthy, predictions are additionally compared with the ground truth
    carried by ``metas`` and a colour-coded visualisation is written to
    ``metas.out_file``.

    Args:
        init_score_thr: Score an unmatched detection must exceed to start a
            new tracklet.
        obj_score_thr: Detections scoring below this are not matched.
        match_score_thr: Similarity a match must exceed to be accepted.
        memo_frames: A tracklet is dropped once it has not been updated for
            this many frames.
        momentum_embed: Weight of the new embedding when a tracklet's
            embedding is updated.
        momentum_obj_score: Weight of the detection's own score when it is
            blended with the matched tracklet's last score.
        obj_score_diff_thr: Largest allowed absolute score difference
            between a detection and the tracklet it matches.
        distractor_nms_thr: IoU threshold used to suppress low-score
            same-label duplicates.
        distractor_score_thr: Detections scoring below this are candidates
            for that suppression.
        match_metric: ``'bisoftmax'`` or ``'cosine'``.
        match_with_cosine: With ``'bisoftmax'``, average the bi-softmax
            score with the cosine similarity.
    """

    def __init__(self,
                 init_score_thr=0.0001,
                 obj_score_thr=0.0001,
                 match_score_thr=0.5,
                 memo_frames=10,
                 momentum_embed=0.8,
                 momentum_obj_score=0.5,
                 obj_score_diff_thr=1.0,
                 distractor_nms_thr=0.3,
                 distractor_score_thr=0.5,
                 match_metric='bisoftmax',
                 match_with_cosine=True):
        self.init_score_thr = init_score_thr
        self.obj_score_thr = obj_score_thr
        self.match_score_thr = match_score_thr

        self.memo_frames = memo_frames
        self.momentum_embed = momentum_embed
        self.momentum_obj_score = momentum_obj_score
        self.obj_score_diff_thr = obj_score_diff_thr
        self.distractor_nms_thr = distractor_nms_thr
        self.distractor_score_thr = distractor_score_thr
        assert match_metric in ['bisoftmax', 'cosine']
        self.match_metric = match_metric
        self.match_with_cosine = match_with_cosine

        self.reset()

    def reset(self):
        """Forget all tracklets and all analysis bookkeeping."""
        # Next id to hand out; kept as a plain Python int.
        self.num_tracklets = 0
        # id -> dict(bboxes, labels, embeds, frame_ids)
        self.tracklets = dict()
        # for analysis
        self.pred_tracks = defaultdict(lambda: defaultdict(list))
        self.gt_tracks = defaultdict(lambda: defaultdict(list))

    @property
    def valid_ids(self):
        """list: Distinct predicted ids ever matched to a ground-truth track."""
        valid_ids = []
        for track in self.gt_tracks.values():
            valid_ids.extend(track['ids'])
        return list(set(valid_ids))

    @property
    def empty(self):
        """bool: True when no tracklet is currently remembered."""
        return not self.tracklets

    def update_memo(self, ids, bboxes, labels, embeds, frame_id):
        """Write the detections with an id greater than -1 into the memory.

        Args:
            ids: Track id of every detection.
            bboxes: Boxes of the detections, score in the last column.
            labels: Label of every detection.
            embeds: Embedding of every detection.
            frame_id: Index of the current frame.
        """
        tracklet_inds = ids > -1

        # update memo
        for track_id, bbox, embed, label in zip(ids[tracklet_inds],
                                                bboxes[tracklet_inds],
                                                embeds[tracklet_inds],
                                                labels[tracklet_inds]):
            track_id = int(track_id)
            if track_id in self.tracklets:
                tracklet = self.tracklets[track_id]
                tracklet['bboxes'].append(bbox)
                tracklet['labels'].append(label)
                # Exponential moving average of the embedding.
                tracklet['embeds'] = (
                    1 - self.momentum_embed
                ) * tracklet['embeds'] + self.momentum_embed * embed
                tracklet['frame_ids'].append(frame_id)
            else:
                self.tracklets[track_id] = dict(
                    bboxes=[bbox],
                    labels=[label],
                    embeds=embed,
                    frame_ids=[frame_id])

        # pop memo
        invalid_ids = []
        for k, v in self.tracklets.items():
            if frame_id - v['frame_ids'][-1] >= self.memo_frames:
                invalid_ids.append(k)
        for invalid_id in invalid_ids:
            self.tracklets.pop(invalid_id)

    @property
    def memo(self):
        """Stack the memory into flat tensors.

        Returns:
            tuple: ``(memo_bboxes, memo_labels, memo_embeds, memo_ids)``
            holding, per tracklet, its latest box, latest label, current
            embedding and id.
        """
        memo_ids = []
        memo_bboxes = []
        memo_labels = []
        memo_embeds = []
        for k, v in self.tracklets.items():
            memo_ids.append(k)
            memo_bboxes.append(v['bboxes'][-1][None, :])
            memo_labels.append(v['labels'][-1].view(1, 1))
            memo_embeds.append(v['embeds'][None, :])
        memo_ids = torch.tensor(memo_ids, dtype=torch.long).view(1, -1)

        memo_bboxes = torch.cat(memo_bboxes, dim=0)
        memo_embeds = torch.cat(memo_embeds, dim=0)
        memo_labels = torch.cat(memo_labels, dim=0).squeeze(1)
        return memo_bboxes, memo_labels, memo_embeds, memo_ids.squeeze(0)

    def init_tracklets(self, ids, obj_scores):
        """Give fresh ids to unmatched detections that score high enough.

        Args:
            ids: CPU long tensor of current ids; modified in place.
            obj_scores: Score of every detection.

        Returns:
            The updated ``ids`` tensor.
        """
        new_objs = (ids == -1) & (obj_scores > self.init_score_thr).cpu()
        # Convert to a Python int so ``num_tracklets`` stays an int instead
        # of silently turning into a 0-dim tensor after the first frame.
        num_new_objs = int(new_objs.sum())
        ids[new_objs] = torch.arange(
            self.num_tracklets,
            self.num_tracklets + num_new_objs,
            dtype=torch.long)
        self.num_tracklets += num_new_objs
        return ids

    def match(self,
              bboxes,
              labels,
              track_feats,
              frame_id,
              temperature=-1,
              **kwargs):
        """Assign track ids to the detections of one frame.

        Args:
            bboxes: Detected boxes, score in the last column.
            labels: Label of every detection.
            track_feats: Embedding of every detection, or ``None`` to skip
                tracking (all ids are then -1 and the memory is untouched).
            frame_id: Index of the current frame.
            temperature: Forwarded to ``cal_similarity`` for the
                ``'bisoftmax'`` metric.
            **kwargs: May contain ``metas``; if ``metas.analyze`` is truthy
                the analysis/visualisation described on the class is run.

        Returns:
            tuple: ``(bboxes, labels, ids)`` for the detections that survive
            distractor suppression. The score of a matched detection is
            blended with the matched tracklet's last score.
        """
        if track_feats is None:
            ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long)
            return bboxes, labels, ids

        # all objects is valid here
        valid_inds = labels > -1
        # nms: a low-score detection is dropped when it overlaps an earlier
        # detection of the same label by more than ``distractor_nms_thr``.
        low_inds = torch.nonzero(
            bboxes[:, -1] < self.distractor_score_thr,
            as_tuple=False).squeeze(1)
        cat_same = labels[low_inds].view(-1, 1) == labels.view(1, -1)
        ious = bbox_overlaps(bboxes[low_inds, :-1], bboxes[:, :-1])
        ious *= cat_same.to(ious.device)
        for i, ind in enumerate(low_inds):
            if (ious[i, :ind] > self.distractor_nms_thr).any():
                valid_inds[ind] = False
        bboxes = bboxes[valid_inds]
        labels = labels[valid_inds]
        embeds = track_feats[valid_inds]

        # match if buffer is not empty
        if bboxes.size(0) > 0 and not self.empty:
            memo_bboxes, memo_labels, memo_embeds, memo_ids = self.memo

            if self.match_metric == 'bisoftmax':
                sims = cal_similarity(
                    embeds,
                    memo_embeds,
                    method='dot_product',
                    temperature=temperature)
                cat_same = labels.view(-1, 1) == memo_labels.view(1, -1)
                # Softmax in both directions, restricted to equal labels.
                exps = torch.exp(sims) * cat_same.to(sims.device)
                d2t_scores = exps / (exps.sum(dim=1).view(-1, 1) + 1e-6)
                t2d_scores = exps / (exps.sum(dim=0).view(1, -1) + 1e-6)
                cos_scores = cal_similarity(
                    embeds, memo_embeds, method='cosine')
                cos_scores *= cat_same.to(cos_scores.device)
                scores = (d2t_scores + t2d_scores) / 2
                if self.match_with_cosine:
                    scores = (scores + cos_scores) / 2
            elif self.match_metric == 'cosine':
                cos_scores = cal_similarity(
                    embeds, memo_embeds, method='cosine')
                cat_same = labels.view(-1, 1) == memo_labels.view(1, -1)
                scores = cos_scores * cat_same.float().to(cos_scores.device)
            else:
                raise NotImplementedError()
            if 'metas' in kwargs:
                # The greedy loop below zeroes entries of ``scores``; keep
                # an untouched copy for the analysis.
                raw_scores = scores.clone()

            obj_score_diffs = torch.abs(
                bboxes[:, -1].view(-1, 1).expand_as(scores) -
                memo_bboxes[:, -1].view(1, -1).expand_as(scores))

            num_objs = bboxes.size(0)
            ids = torch.full((num_objs, ), -1, dtype=torch.long)
            for i in range(num_objs):
                if bboxes[i, -1] < self.obj_score_thr:
                    continue
                conf, memo_ind = torch.max(scores[i, :], dim=0)
                obj_score_diff = obj_score_diffs[i, memo_ind]
                if (conf > self.match_score_thr) and (
                        obj_score_diff < self.obj_score_diff_thr):
                    ids[i] = memo_ids[memo_ind]
                    # A memory entry can be claimed only once.
                    scores[:i, memo_ind] = 0
                    scores[i + 1:, memo_ind] = 0
                    m = self.momentum_obj_score
                    bboxes[i, -1] = m * bboxes[i, -1] + (
                        1 - m) * memo_bboxes[memo_ind, -1]
        else:
            ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long)
            # Nothing was matched against. Provide empty stand-ins so the
            # analysis code below does not hit unbound names when the
            # memory is empty on a frame with frame_id > 0.
            memo_ids = ids.new_zeros((0, ))
            memo_bboxes = bboxes.new_zeros((0, bboxes.size(1)))
            memo_labels = labels.new_zeros((0, ))
            raw_scores = None
        # init tracklets
        ids = self.init_tracklets(ids, bboxes[:, -1])
        self.update_memo(ids, bboxes, labels, embeds, frame_id)

        # ----------------
        # Optional analysis against ground truth.
        if 'metas' in kwargs and kwargs['metas'].analyze:
            metas = kwargs['metas']
            gt_bboxes, gt_labels, gt_ids = [
                metas['bboxes'], metas['labels'], metas['instance_ids']
            ]
            # Append a zero score column so gt boxes share the box layout.
            gt_bboxes = torch.cat(
                (gt_bboxes, torch.zeros(gt_bboxes.size(0), 1)), dim=1)

            if bboxes.size(0) == 0 or gt_bboxes.size(0) == 0:
                return bboxes, labels, ids

            fns = torch.ones(gt_bboxes.size(0), dtype=torch.long)
            fps = torch.ones(bboxes.size(0), dtype=torch.long)
            sw_fps = torch.zeros(bboxes.size(0), dtype=torch.long)
            idsw = torch.zeros(bboxes.size(0), dtype=torch.long)

            ious = bbox_overlaps(bboxes[:, :4], gt_bboxes[:, :4])
            same_cat = labels.view(-1, 1) == gt_labels.view(1, -1)
            ious *= same_cat.float().to(ious.device)

            gt_inds = torch.full(ids.size(), -1, dtype=torch.long)
            for i, bbox in enumerate(bboxes):
                max_iou, j = ious[i].max(dim=0)
                if max_iou > 0.5:
                    fps[i], fns[j] = 0, 0
                    gt_inds[i] = j
                    # Each ground-truth box can be claimed only once.
                    ious[:, j] = -1

                    gt_id = int(gt_ids[j])
                    pred_id = int(ids[i])
                    if len(self.gt_tracks[gt_id]['ids']) > 0:
                        if pred_id != self.gt_tracks[gt_id]['ids'][-1]:
                            idsw[i] = 1
                    else:
                        if pred_id in self.pred_tracks:
                            idsw[i] = 1
                    self.gt_tracks[gt_id]['scores'].append(
                        float(f'{bbox[-1]:.3f}'))
                    self.gt_tracks[gt_id]['ids'].append(pred_id)
                    self.gt_tracks[gt_id]['frame_ids'].append(
                        metas.img_info['frame_id'])

            for i, track_id in enumerate(ids):
                track_id = int(track_id)

                self.pred_tracks[track_id]['scores'].append(
                    float(f'{bboxes[i, -1]:.3f}'))
                if metas.img_info['frame_id'] > 0:
                    memo_ind = torch.nonzero(
                        memo_ids == track_id, as_tuple=False).squeeze(1)
                else:
                    memo_ind = []
                if len(memo_ind) > 0:
                    self.pred_tracks[track_id]['match_scores'].append(
                        float(f'{raw_scores[i, memo_ind[0]]:.3f}'))
                else:
                    self.pred_tracks[track_id]['match_scores'].append(-1)
                if gt_inds[i] == -1:
                    self.pred_tracks[track_id]['ids'].append(-1)
                else:
                    self.pred_tracks[track_id]['ids'].append(
                        int(gt_ids[gt_inds[i]]))
                self.pred_tracks[track_id]['frame_ids'].append(
                    metas.img_info['frame_id'])

                if fps[i]:
                    if track_id in self.valid_ids:
                        sw_fps[i] = 1
                    continue

            fp_inds = sw_fps == 1  # red
            fn_inds = fns == 1  # yellow
            idsw_inds = idsw == 1  # cyan
            tp_inds = fps == 0  # green
            tp_inds[idsw_inds] = 0

            os.makedirs(metas.out_file.rsplit('/', 1)[0], exist_ok=True)
            img = metas.img_name
            # Remembered boxes of the ids involved in a switch (magenta).
            if idsw_inds.any():
                sw_ids = ids[idsw_inds]
                memo_inds = (memo_ids.view(-1, 1) == sw_ids.view(
                    1, -1)).sum(dim=1) > 0
                img = imshow_tracklets(
                    img,
                    memo_bboxes[memo_inds].numpy(),
                    memo_labels[memo_inds].numpy(),
                    memo_ids[memo_inds].numpy(),
                    color='magenta',
                    show=False)
            img = imshow_tracklets(
                img,
                bboxes[tp_inds].numpy(),
                labels[tp_inds].numpy(),
                ids[tp_inds].numpy(),
                color='green',
                show=False)
            img = imshow_tracklets(
                img,
                bboxes[fp_inds].numpy(),
                labels[fp_inds].numpy(),
                ids[fp_inds].numpy(),
                color='red',
                show=False)
            img = imshow_tracklets(
                img,
                bboxes=gt_bboxes[fn_inds, :].numpy(),
                labels=gt_labels[fn_inds].numpy(),
                color='yellow',
                show=False)
            img = imshow_tracklets(
                img,
                bboxes[idsw_inds].numpy(),
                labels[idsw_inds].numpy(),
                ids[idsw_inds].numpy(),
                color='cyan',
                show=False,
                out_file=metas.out_file)

        return bboxes, labels, ids


def random_color(seed):
    """Pick a colour from seaborn's current palette, determined by ``seed``.

    A private ``random.Random`` instance is used, so the process-wide
    ``random`` state is left untouched; the colour chosen for a given seed
    is the same one that seeding the global generator would select.

    Args:
        seed: Seed value. numpy integer scalars (such as ids taken from an
            id array) are converted to ``int`` first, because recent Python
            versions reject them as seeds.

    Returns:
        One entry of ``sns.color_palette()``.
    """
    if isinstance(seed, np.integer):
        seed = int(seed)
    rng = random.Random(seed)
    return rng.choice(sns.color_palette())


def imshow_tracklets(img,
                     bboxes,
                     labels=None,
                     ids=None,
                     thickness=2,
                     font_scale=0.4,
                     show=False,
                     win_name='',
                     color=None,
                     out_file=None):
    """Draw boxes, and optionally their track ids, onto an image.

    Args:
        img: Image array, or a path that is loaded with ``imread``. An array
            is drawn on in place.
        bboxes: 2-D tensor or array with one box per row; the first four
            columns are used as ``x1, y1, x2, y2``.
        labels: Optional 1-D labels, one per box. Only used to validate the
            input; nothing is drawn from them.
        ids: Optional ids, one per box. When given, each box gets a small
            filled tag with its id at the top-left corner.
        thickness: Line thickness of the rectangles.
        font_scale: Font scale of the id text.
        show: If True, display the result with ``imshow``.
        win_name: Window name used when ``show`` is True.
        color: Colour name passed to ``color_val``. If None, boxes with ids
            get a per-id colour from ``random_color`` and boxes without ids
            are green.
        out_file: If not None, the result is written there with ``imwrite``.

    Returns:
        The image with the boxes drawn on it.
    """
    assert bboxes.ndim == 2
    if labels is not None:
        assert labels.ndim == 1
        assert bboxes.shape[0] == labels.shape[0]
    if isinstance(img, str):
        img = imread(img)
    if bboxes.shape[0] == 0:
        if out_file is not None:
            imwrite(img, out_file)
        return img
    if isinstance(bboxes, torch.Tensor):
        bboxes = bboxes.numpy()
    if isinstance(ids, torch.Tensor):
        ids = ids.numpy()

    for i, bbox in enumerate(bboxes):
        # Plain Python ints for slicing and for the OpenCV point arguments.
        x1, y1, x2, y2 = bbox[:4].astype(np.int32).tolist()
        if ids is not None:
            if color is None:
                # Palette entries are 0-1 floats; scale and reverse the
                # channel order for drawing.
                bbox_color = tuple(
                    int(255 * _c) for _c in random_color(ids[i]))[::-1]
            else:
                bbox_color = mmcv.color_val(color)
            # Filled tag carrying the id text.
            img[y1:y1 + 12, x1:x1 + 20, :] = bbox_color
            cv2.putText(
                img,
                str(ids[i]), (x1, y1 + 10),
                cv2.FONT_HERSHEY_COMPLEX,
                font_scale,
                color=color_val('black'))
        else:
            if color is None:
                bbox_color = color_val('green')
            else:
                bbox_color = mmcv.color_val(color)

        cv2.rectangle(img, (x1, y1), (x2, y2), bbox_color, thickness=thickness)

    if show:
        imshow(img, win_name)
    if out_file is not None:
        imwrite(img, out_file)

    return img


================================================
FILE: knet/video/track_heads.py
================================================
"""
    This file implements several tracking heads
"""
import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, normal_init
from mmdet.models.builder import HEADS, build_head, build_loss, build_roi_extractor
from mmdet.models.losses import accuracy
from mmdet.core import multi_apply, bbox2roi
from knet.video.qdtrack.track import cal_similarity
from unitrack.utils.mask import mask2box, batch_mask2boxlist, bboxlist2roi


@HEADS.register_module()
class QueryTrackHead(nn.Module):
    """Tracking head that matches current-frame features to reference ones.

    Features of both frames go through the same stack of fully connected
    layers. The matching score of an image is the dot product between each
    current embedding and each reference embedding. An all-zero "no match"
    column is prepended to every score matrix; a sample matched to column 0
    is considered a new object.

    Args:
        num_fcs (int): Number of shared fully connected layers.
        in_channels (int): Input size of every fully connected layer (and
            output size of all but the last one).
        fc_out_channels (int): Output size of the last fully connected layer.
        match_coeff (Sequence[float] | None): Three weights used by
            :meth:`compute_comp_scores`. ``None`` disables the extra terms.
        bbox_dummy_iou (float): IoU value assigned to the "no match" column.
        dynamic (bool): Must be ``True``; only the dynamic variant exists.
        loss_match (dict): Config handed to ``build_loss``.
    """

    def __init__(self,
                 num_fcs=2,
                 in_channels=256,
                 fc_out_channels=1024,
                 match_coeff=None,
                 bbox_dummy_iou=0,
                 dynamic=True,
                 loss_match=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=False,
                     loss_weight=1.0)):

        super(QueryTrackHead, self).__init__()
        self.in_channels = in_channels
        self.match_coeff = match_coeff
        self.bbox_dummy_iou = bbox_dummy_iou
        self.num_fcs = num_fcs
        self.fcs = nn.ModuleList()
        for i in range(num_fcs):
            # Hidden layers keep the input width; only the last one expands.
            out_channels = (in_channels
                            if i < num_fcs - 1 else fc_out_channels)
            self.fcs.append(nn.Linear(in_channels, out_channels))
        self.relu = nn.ReLU(inplace=True)
        self.debug_imgs = None
        self.dynamic = dynamic
        assert self.dynamic, "Naive tracking embedding head must be dynamic"
        self.loss_match = build_loss(loss_match)

    def init_weights(self):
        """Initialise every FC layer with N(0, 0.01) weights and zero bias."""
        for fc in self.fcs:
            nn.init.normal_(fc.weight, 0, 0.01)
            nn.init.constant_(fc.bias, 0)

    def compute_comp_scores(self, match_ll, bbox_scores, bbox_ious, label_delta, add_bbox_dummy=False):
        """Combine matching likelihood with score, IoU and label terms.

        Args:
            match_ll (Tensor): Matching likelihood.
            bbox_scores (Tensor): Detection confidences; their log is used.
            bbox_ious (Tensor): IoU matrix between current and reference
                objects.
            label_delta (Tensor): Label agreement term.
                NOTE(review): presumably 1 where labels agree - confirm with
                the caller.
            add_bbox_dummy (bool): If ``True``, prepend a "no match" column
                to ``bbox_ious`` (filled with ``bbox_dummy_iou``) and to
                ``label_delta`` (filled with ones).

        Returns:
            Tensor: ``match_ll`` itself when ``match_coeff`` is ``None``,
            otherwise the weighted sum of all four terms.
        """
        if add_bbox_dummy:
            # Allocate on the inputs' own device instead of assuming the
            # current CUDA device, so the head also works on CPU.
            rows = bbox_ious.size(0)
            bbox_iou_dummy = bbox_ious.new_ones(rows, 1) * self.bbox_dummy_iou
            label_dummy = bbox_ious.new_ones(rows, 1)
            bbox_ious = torch.cat((bbox_iou_dummy, bbox_ious), dim=1)
            label_delta = torch.cat((label_dummy, label_delta), dim=1)
        if self.match_coeff is None:
            return match_ll
        # match coeff needs to be length of 3
        assert (len(self.match_coeff) == 3)
        return (match_ll +
                self.match_coeff[0] * torch.log(bbox_scores) +
                self.match_coeff[1] * bbox_ious +
                self.match_coeff[2] * label_delta)

    def forward(self, x, ref_x, x_n, ref_x_n):
        """Compute per-image matching scores.

        Args:
            x (Tensor): Current-frame features; the last dimension is the
                feature size and all leading dimensions are flattened.
            ref_x (Tensor): Reference-frame features, same feature size.
            x_n (list[int]): Number of current-frame rows per image.
            ref_x_n (list[int]): Number of reference-frame rows per image.

        Returns:
            list[Tensor]: One ``(x_n[i], ref_x_n[i] + 1)`` matrix per image;
            column 0 is the all-zero "no match" entry.
        """
        assert len(x_n) == len(ref_x_n)  # ==> the batch size should be the same.
        feat_dim = x.size(-1)
        x = x.reshape(-1, feat_dim)
        ref_x = ref_x.reshape(-1, feat_dim)
        last_idx = len(self.fcs) - 1
        for idx, fc in enumerate(self.fcs):
            x = fc(x)
            ref_x = fc(ref_x)
            if idx < last_idx:
                x = self.relu(x)
                ref_x = self.relu(ref_x)
        x_split = torch.split(x, x_n, dim=0)
        ref_x_split = torch.split(ref_x, ref_x_n, dim=0)

        match_score = []
        for cur_feat, ref_feat in zip(x_split, ref_x_split):
            prod = torch.mm(cur_feat, torch.transpose(ref_feat, 0, 1))
            # "No match" column, created on the same device/dtype as prod.
            dummy = prod.new_zeros(prod.size(0), 1)
            match_score.append(torch.cat([dummy, prod], dim=1))
        return match_score

    def loss(self,
             match_score,
             sampling_results):
        """Matching loss averaged over the images of the batch.

        Args:
            match_score (list[Tensor]): Output of :meth:`forward`.
            sampling_results (list): Per-image sampling results providing
                ``pos_inds``, ``neg_inds``, ``pos_masks``, ``neg_masks`` and
                ``pos_gt_pids``.

        Returns:
            dict: ``{'loss_match': Tensor}``.
        """
        losses = dict()
        n = len(match_score)
        x_n = [s.size(0) for s in match_score]
        ids, id_weights = self.get_targets(sampling_results)
        ids = torch.split(ids, x_n, dim=0)
        id_weights = torch.split(id_weights, x_n, dim=0)

        loss_match = None
        for score, cur_ids, cur_weights in zip(match_score, ids, id_weights):
            # reshape(-1) keeps the index 1-D even for a single valid sample;
            # a bare squeeze() would make it 0-dim and silently skip the image.
            valid_idx = torch.nonzero(cur_weights).reshape(-1)
            if valid_idx.numel() == 0:
                continue
            cur_loss = self.loss_match(score, cur_ids, cur_weights)
            loss_match = cur_loss if loss_match is None else loss_match + cur_loss

        if loss_match is None:
            # No valid sample anywhere: emit a floating-point zero that is
            # still connected to the graph of the predictions.
            losses['loss_match'] = match_score[0].sum() * 0
        else:
            losses['loss_match'] = loss_match / n
        return losses

    def get_targets(self,
                    sampling_results,
                    concat=True,
                    ):
        """Build matching targets and weights for a batch.

        Args:
            sampling_results (list): Per-image sampling results.
            concat (bool): Concatenate the per-image targets along dim 0.

        Returns:
            tuple: ``(ids, id_weights)``; tensors if ``concat`` else lists.
        """
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_pid_list = [res.pos_gt_pids for res in sampling_results]
        ids, id_weights = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_pid_list)
        if concat:
            ids = torch.cat(ids, 0)
            id_weights = torch.cat(id_weights, 0)

        return ids, id_weights

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask, pos_gt_pid_list):
        """Targets of one image: positives get their gt id and weight 1."""
        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg

        ids = pos_mask.new_zeros((num_samples,), dtype=torch.long)
        ids_weights = pos_mask.new_zeros((num_samples,))
        if num_pos > 0:
            ids[pos_inds] = pos_gt_pid_list
            ids_weights[pos_inds] = 1.0

        if num_neg > 0:
            ids_weights[neg_inds] = 0.0

        return ids, ids_weights


@HEADS.register_module()
class TrackHeadWithROIAlign(nn.Module):
    """Tracking head that pools features from mask-derived boxes.

    Predicted masks are converted to boxes, features are extracted with an
    RoI extractor, passed through shared fully connected layers and matched
    by dot product. An all-zero "no match" column is prepended to every
    score matrix; a sample matched to column 0 is considered a new object.

    Args:
        num_fcs (int): Number of shared fully connected layers.
        in_channels (int): Input size of every fully connected layer (and
            output size of all but the last one).
        fc_out_channels (int): Output size of the last fully connected layer.
        match_coeff (Sequence[float] | None): Three weights used by
            :meth:`compute_comp_scores`. ``None`` disables the extra terms.
        bbox_dummy_iou (float): IoU value assigned to the "no match" column.
        dynamic (bool): Must be ``True``; only the dynamic variant exists.
        bbox_roi_extractor (dict): Config handed to ``build_roi_extractor``.
        loss_match (dict): Config handed to ``build_loss``.
    """

    def __init__(self,
                 num_fcs=2,
                 in_channels=256,
                 fc_out_channels=1024,
                 match_coeff=None,
                 bbox_dummy_iou=0,
                 dynamic=True,
                 bbox_roi_extractor=dict(
                     type='SingleRoIExtractor',
                     roi_layer=dict(
                         type='RoIAlign', output_size=7, sampling_ratio=2),
                     out_channels=256,
                     featmap_strides=[4, 8, 16, 32]),
                 loss_match=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=False,
                     loss_weight=1.0)):

        super(TrackHeadWithROIAlign, self).__init__()
        assert bbox_roi_extractor is not None
        self.in_channels = in_channels
        self.match_coeff = match_coeff
        self.bbox_dummy_iou = bbox_dummy_iou
        self.num_fcs = num_fcs
        self.fcs = nn.ModuleList()

        for i in range(num_fcs):
            # Hidden layers keep the input width; only the last one expands.
            out_channels = (in_channels
                            if i < num_fcs - 1 else fc_out_channels)
            self.fcs.append(nn.Linear(in_channels, out_channels))
        self.relu = nn.ReLU(inplace=True)
        self.debug_imgs = None
        self.dynamic = dynamic
        assert self.dynamic, "Naive tracking embedding head must be dynamic"

        self.bbox_roi_extractor = build_roi_extractor(
                bbox_roi_extractor)
        self.loss_match = build_loss(loss_match)

    def init_weights(self):
        """Initialise every FC layer with N(0, 0.01) weights and zero bias."""
        for fc in self.fcs:
            nn.init.normal_(fc.weight, 0, 0.01)
            nn.init.constant_(fc.bias, 0)

    def compute_comp_scores(self, match_ll, bbox_scores, bbox_ious, label_delta, add_bbox_dummy=False):
        """Combine matching likelihood with score, IoU and label terms.

        Args:
            match_ll (Tensor): Matching likelihood.
            bbox_scores (Tensor): Detection confidences; their log is used.
            bbox_ious (Tensor): IoU matrix between current and reference
                objects.
            label_delta (Tensor): Label agreement term.
                NOTE(review): presumably 1 where labels agree - confirm with
                the caller.
            add_bbox_dummy (bool): If ``True``, prepend a "no match" column
                to ``bbox_ious`` (filled with ``bbox_dummy_iou``) and to
                ``label_delta`` (filled with ones).

        Returns:
            Tensor: ``match_ll`` itself when ``match_coeff`` is ``None``,
            otherwise the weighted sum of all four terms.
        """
        if add_bbox_dummy:
            # Allocate on the inputs' own device instead of assuming the
            # current CUDA device, so the head also works on CPU.
            rows = bbox_ious.size(0)
            bbox_iou_dummy = bbox_ious.new_ones(rows, 1) * self.bbox_dummy_iou
            label_dummy = bbox_ious.new_ones(rows, 1)
            bbox_ious = torch.cat((bbox_iou_dummy, bbox_ious), dim=1)
            label_delta = torch.cat((label_dummy, label_delta), dim=1)
        if self.match_coeff is None:
            return match_ll
        # match coeff needs to be length of 3
        assert (len(self.match_coeff) == 3)
        return (match_ll +
                self.match_coeff[0] * torch.log(bbox_scores) +
                self.match_coeff[1] * bbox_ious +
                self.match_coeff[2] * label_delta)

    def forward(self, x, ref_x, mask_pred, ref_mask_pred, x_n, ref_x_n):
        """Compute per-image matching scores from RoI features.

        Args:
            x: backbone features of the current frame (a sequence of maps;
                the first ``bbox_roi_extractor.num_inputs`` are used).
            ref_x: backbone features of the reference frame.
            mask_pred: mask prediction of the current frame.
            ref_mask_pred: mask prediction of the reference frame.
            x_n (list[int]): number of proposals per image, current frame.
            ref_x_n (list[int]): number of proposals per image, reference
                frame.

        Returns:
            list[Tensor]: One ``(x_n[i], ref_x_n[i] + 1)`` matrix per image;
            column 0 is the all-zero "no match" entry.
        """
        bbox_pred = batch_mask2boxlist(mask_pred)
        ref_bbox_pred = batch_mask2boxlist(ref_mask_pred)

        # The RoI tensors must exist before the extractor is called; without
        # these two conversions the method stops with a NameError.
        rois = bboxlist2roi(bbox_pred)
        ref_rois = bboxlist2roi(ref_bbox_pred)

        x = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)

        ref_x = self.bbox_roi_extractor(
                ref_x[:self.bbox_roi_extractor.num_inputs], ref_rois)

        # NOTE(review): the code below expects 3-D (batch, num, channels)
        # features. Confirm that the configured RoI extractor yields that
        # layout (or add pooling) before relying on this head.
        b, N, d = x.size()
        x = x.reshape(b*N, d)
        ref_x = ref_x.reshape(b*N, d)
        last_idx = len(self.fcs) - 1
        for idx, fc in enumerate(self.fcs):
            x = fc(x)
            ref_x = fc(ref_x)
            if idx < last_idx:
                x = self.relu(x)
                ref_x = self.relu(ref_x)
        x_split = torch.split(x, x_n, dim=0)
        ref_x_split = torch.split(ref_x, ref_x_n, dim=0)

        match_score = []
        for cur_feat, ref_feat in zip(x_split, ref_x_split):
            prod = torch.mm(cur_feat, torch.transpose(ref_feat, 0, 1))
            # "No match" column, created on the same device/dtype as prod.
            dummy = prod.new_zeros(prod.size(0), 1)
            match_score.append(torch.cat([dummy, prod], dim=1))
        return match_score

    def loss(self,
             match_score,
             sampling_results):
        """Matching loss and accuracy for a batch.

        Args:
            match_score (list[Tensor]): Output of :meth:`forward`.
            sampling_results (list): Per-image sampling results providing
                ``pos_inds``, ``neg_inds``, ``pos_masks``, ``neg_masks`` and
                ``pos_gt_pids``.

        Returns:
            dict: ``loss_match`` (averaged over images) and, when at least
            one valid sample exists, ``match_acc`` (averaged over valid
            samples).
        """
        losses = dict()
        n = len(match_score)
        x_n = [s.size(0) for s in match_score]
        ids, id_weights = self.get_targets(sampling_results)
        ids = torch.split(ids, x_n, dim=0)
        id_weights = torch.split(id_weights, x_n, dim=0)
        # Accumulate from None: adding to an empty ``torch.zeros(0)`` tensor
        # would broadcast every loss away into an empty result.
        loss_match = None
        match_acc = 0.
        n_total = 0

        for score, cur_ids, cur_weights in zip(match_score, ids, id_weights):
            # reshape(-1) keeps the index 1-D even for a single valid sample;
            # a bare squeeze() would make it 0-dim and silently skip the image.
            valid_idx = torch.nonzero(cur_weights).reshape(-1)
            n_valid = valid_idx.numel()
            if n_valid == 0:
                continue
            n_total += n_valid
            cur_loss = self.loss_match(score, cur_ids, cur_weights)
            loss_match = cur_loss if loss_match is None else loss_match + cur_loss
            match_acc += accuracy(
                torch.index_select(score, 0, valid_idx),
                torch.index_select(cur_ids, 0, valid_idx)) * n_valid

        if loss_match is None:
            # No valid sample anywhere: floating-point zero that stays
            # connected to the graph of the predictions.
            loss_match = match_score[0].sum() * 0
        losses['loss_match'] = loss_match / n
        if n_total > 0:
            losses['match_acc'] = match_acc / n_total
        return losses

    def get_targets(self,
                    sampling_results,
                    concat=True,
                    ):
        """Build matching targets and weights for a batch.

        Args:
            sampling_results (list): Per-image sampling results.
            concat (bool): Concatenate the per-image targets along dim 0.

        Returns:
            tuple: ``(ids, id_weights)``; tensors if ``concat`` else lists.
        """
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_pid_list = [res.pos_gt_pids for res in sampling_results]
        ids, id_weights = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_pid_list)
        if concat:
            ids = torch.cat(ids, 0)
            id_weights = torch.cat(id_weights, 0)

        return ids, id_weights

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask, pos_gt_pid_list):
        """Targets of one image: positives get their gt id and weight 1."""
        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg

        ids = pos_mask.new_zeros((num_samples,), dtype=torch.long)
        ids_weights = pos_mask.new_zeros((num_samples,))
        if num_pos > 0:
            ids[pos_inds] = pos_gt_pid_list
            ids_weights[pos_inds] = 1.0

        if num_neg > 0:
            ids_weights[neg_inds] = 0.0

        return ids, ids_weights


@HEADS.register_module()
class QuasiDenseMaskEmbedHead(nn.Module):
    """Embedding head made of a conv stack, an FC stack and a linear layer.

    Besides producing embeddings it builds quasi-dense matching targets,
    computes key/reference similarities and evaluates the tracking losses.
    """

    def __init__(self,
                 num_convs=4,
                 num_fcs=1,
                 roi_feat_size=7,
                 in_channels=256,
                 conv_out_channels=256,
                 fc_out_channels=1024,
                 embed_channels=256,
                 conv_cfg=None,
                 norm_cfg=None,
                 softmax_temp=-1,
                 loss_track=dict(
                     type='MultiPosCrossEntropyLoss', loss_weight=0.25),
                 loss_track_aux=dict(
                     type='L2Loss',
                     sample_ratio=3,
                     margin=0.3,
                     loss_weight=1.0,
                     hard_mining=True)):
        super(QuasiDenseMaskEmbedHead, self).__init__()
        # Plain hyper-parameters; several are read by _add_conv_fc_branch.
        self.num_convs = num_convs
        self.num_fcs = num_fcs
        self.roi_feat_size = roi_feat_size
        self.in_channels = in_channels
        self.conv_out_channels = conv_out_channels
        self.fc_out_channels = fc_out_channels
        self.embed_channels = embed_channels
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg

        # Layers: shared ReLU, conv/fc branch, final embedding projection.
        self.relu = nn.ReLU(inplace=True)
        branch = self._add_conv_fc_branch(
            self.num_convs, self.num_fcs, self.in_channels)
        self.convs, self.fcs, embed_in_dim = branch
        self.fc_embed = nn.Linear(embed_in_dim, embed_channels)

        self.softmax_temp = softmax_temp
        self.loss_track = build_loss(loss_track)
        # The auxiliary loss is optional.
        self.loss_track_aux = (
            None if loss_track_aux is None else build_loss(loss_track_aux))

    def _add_conv_fc_branch(self, num_convs, num_fcs, in_channels):
        """Create the conv and FC stacks and report the final feature size."""
        width = in_channels

        # 3x3 convs: the first maps from the input width, the rest keep
        # ``conv_out_channels``.
        conv_layers = nn.ModuleList()
        for _ in range(num_convs):
            conv_layers.append(
                ConvModule(
                    width,
                    self.conv_out_channels,
                    3,
                    padding=1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg))
            width = self.conv_out_channels

        # FC layers operate on the flattened RoI feature map.
        fc_layers = nn.ModuleList()
        if num_fcs > 0:
            width *= (self.roi_feat_size * self.roi_feat_size)
            for _ in range(num_fcs):
                fc_layers.append(nn.Linear(width, self.fc_out_channels))
                width = self.fc_out_channels
        return conv_layers, fc_layers, width

    def init_weights(self):
        """Xavier-uniform for the FC stack, N(0, 0.01) for the embedding."""
        for layer in self.fcs:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0)
        nn.init.normal_(self.fc_embed.weight, 0, 0.01)
        nn.init.constant_(self.fc_embed.bias, 0)

    def forward(self, x):
        """Map input features to embeddings of size ``embed_channels``."""
        if self.num_convs > 0:
            for conv in self.convs:
                x = conv(x)
        # Flatten everything but the leading dimension.
        x = x.view(x.size(0), -1)

        if self.num_fcs > 0:
            for fc in self.fcs:
                x = self.relu(fc(x))
        return self.fc_embed(x)

    def get_track_targets(self, gt_match_indices, key_sampling_results,
                          ref_sampling_results):
        """Build per-image 0/1 match matrices and per-row weights.

        A key positive matches a reference positive when the match index of
        its assigned gt equals the reference sample's assigned gt index.
        Rows without any match get weight 0.
        """
        all_targets = []
        all_weights = []
        triples = zip(gt_match_indices, key_sampling_results,
                      ref_sampling_results)
        for match_inds, key_res, ref_res in triples:
            num_key = key_res.pos_masks.size(0)
            num_ref = ref_res.pos_masks.size(0)
            target = match_inds.new_zeros((num_key, num_ref), dtype=torch.int)

            key_match = match_inds[key_res.pos_assigned_gt_inds].view(-1, 1)
            ref_gt = ref_res.pos_assigned_gt_inds.view(1, -1)
            same_object = (key_match == ref_gt).int()
            target[:, :same_object.size(1)] = same_object

            all_targets.append(target)
            all_weights.append((target.sum(dim=1) > 0).float())
        return all_targets, all_weights

    def match(self, key_embeds, ref_embeds, key_sampling_results,
              ref_sampling_results):
        """Compute dot-product (and optionally cosine) similarities per image.

        Returns:
            tuple[list, list]: dot-product similarities and cosine
            similarities; the latter holds ``None`` entries when no
            auxiliary loss is configured.
        """
        key_counts = [res.pos_masks.size(0) for res in key_sampling_results]
        ref_counts = [res.pos_masks.size(0) for res in ref_sampling_results]
        key_chunks = torch.split(key_embeds, key_counts)
        ref_chunks = torch.split(ref_embeds, ref_counts)

        dists, cos_dists = [], []
        for key_chunk, ref_chunk in zip(key_chunks, ref_chunks):
            dists.append(
                cal_similarity(
                    key_chunk,
                    ref_chunk,
                    method='dot_product',
                    temperature=self.softmax_temp))
            if self.loss_track_aux is None:
                cos_dists.append(None)
                continue
            cos_dists.append(
                cal_similarity(key_chunk, ref_chunk, method='cosine'))
        return dists, cos_dists

    def loss(self, dists, cos_dists, targets, weights):
        """Average the tracking (and optional auxiliary) loss over images."""
        use_aux = self.loss_track_aux is not None
        track_total = 0.
        aux_total = 0.
        for dist, cos_dist, target, weight in zip(
                dists, cos_dists, targets, weights):
            track_total += self.loss_track(
                dist, target, weight, avg_factor=weight.sum())
            if use_aux:
                aux_total += self.loss_track_aux(cos_dist, target)

        num_imgs = len(dists)
        losses = dict()
        losses['loss_track'] = track_total / num_imgs
        if use_aux:
            losses['loss_track_aux'] = aux_total / num_imgs
        return losses

    @staticmethod
    def random_choice(gallery, num):
        """Randomly pick ``num`` elements from ``gallery``.

        The permutation is drawn with numpy, which appeared faster than the
        PyTorch equivalent.
        """
        assert len(gallery) >= num
        if isinstance(gallery, list):
            gallery = np.array(gallery)
        order = np.arange(len(gallery))
        np.random.shuffle(order)
        picked = order[:num]
        if isinstance(gallery, np.ndarray):
            return gallery[picked]
        # Tensor gallery: index with a long tensor on the same device.
        return gallery[torch.from_numpy(picked).long().to(gallery.device)]


@HEADS.register_module()
class QuasiDenseMaskEmbedHeadGTMask(QuasiDenseMaskEmbedHead):
    """Registry alias of :class:`QuasiDenseMaskEmbedHead`.

    This head used to be a line-for-line copy of
    :class:`QuasiDenseMaskEmbedHead`. It now inherits the constructor,
    layers, target building, matching and loss code from it, so the two
    implementations cannot drift apart. The constructor arguments, the
    registered name and the parameter names (``convs``, ``fcs``,
    ``fc_embed``) are unchanged, so existing configs and checkpoints keep
    working.
    """

================================================
FILE: knet/video/tracker.py
================================================
"""
This is a simple mask based tracker
Copyright (c) https://github.com/xingyizhou/CenterTrack
Modified by Xiangtai Li

"""
# coding: utf-8
import torch
from scipy.optimize import linear_sum_assignment
from .util import generalized_box_iou, masks_to_boxes
import copy


class SimpleMaskTracker(object):
    """Frame-to-frame tracker that associates instance masks by box GIoU.

    Every track/detection is a dict with the keys ``score``, ``mask``,
    ``tracking_id``, ``active`` (1 if seen in the current frame) and ``age``
    (number of consecutive frames since it was last matched, starting at 1).

    Args:
        score_thresh (float): Detections scoring below this are ignored.
        max_age (int): An unmatched track is kept while its age is below
            this value.
    """

    def __init__(self, score_thresh, max_age=32):
        self.score_thresh = score_thresh
        self.max_age = max_age
        self.reset_all()

    def reset_all(self):
        """Forget every track and restart the id counter."""
        self.id_count = 0
        self.tracks_dict = dict()
        self.tracks = list()
        self.unmatched_tracks = list()

    def _collect_detections(self, results):
        """Return detections with ``score >= score_thresh``.

        Returns:
            tuple[list, dict]: the kept detections in input order, and the
            same objects keyed by their index in ``results``.
        """
        scores = results["scores"]  # (n,)
        masks = results["masks"]  # (n,h,w)
        dets = list()
        dets_by_index = dict()
        for idx in range(scores.shape[0]):
            if scores[idx] >= self.score_thresh:
                obj = dict()
                obj["score"] = float(scores[idx])
                obj["mask"] = masks[idx]
                dets.append(obj)
                dets_by_index[idx] = obj
        return dets, dets_by_index

    def _start_track(self, obj):
        """Give ``obj`` a fresh tracking id and mark it active."""
        self.id_count += 1
        obj["tracking_id"] = self.id_count
        obj["active"] = 1
        obj["age"] = 1

    def init_track(self, results):
        """Start a new track for every confident detection of a frame.

        Args:
            results (dict): ``scores`` of shape (n,) and ``masks`` of shape
                (n, h, w).

        Returns:
            list[dict]: Deep copy of the created tracks.
        """
        ret, ret_dict = self._collect_detections(results)
        for obj in ret:
            self._start_track(obj)

        self.tracks = ret
        self.tracks_dict = ret_dict
        return copy.deepcopy(ret)

    def step(self, output_results, track_results=None):
        """Associate the detections of the current frame with known tracks.

        Detections are matched to tracks by Hungarian assignment on
        ``1 - GIoU`` of the boxes enclosing their masks. Unmatched
        detections start new tracks; unmatched tracks age and are dropped
        once their age reaches ``max_age``.

        Args:
            output_results (dict): ``scores`` of shape (n,) and ``masks`` of
                shape (n, h, w) for the current frame.
            track_results: Currently unused.

        Returns:
            list[dict]: Tracks observed in the current frame.
        """
        # Without this, the detection list stays empty and no object is ever
        # reported for the frame.
        results, results_dict = self._collect_detections(output_results)

        tracks = list(self.tracks_dict.values()) + self.unmatched_tracks
        N = len(results)
        M = len(tracks)

        ret = list()
        unmatched_tracks = list(range(M))
        unmatched_dets = list(range(N))
        if N > 0 and M > 0:
            det_box = masks_to_boxes(
                torch.stack([torch.as_tensor(obj['mask']) for obj in results], dim=0))  # N x 4
            track_box = masks_to_boxes(
                torch.stack([torch.as_tensor(obj['mask']) for obj in tracks], dim=0))  # M x 4
            cost_bbox = 1.0 - generalized_box_iou(det_box, track_box)  # N x M
            # scipy works on numpy arrays; convert explicitly.
            cost = cost_bbox.detach().cpu().numpy()

            det_inds, trk_inds = linear_sum_assignment(cost)
            det_inds = det_inds.tolist()
            trk_inds = trk_inds.tolist()
            assigned_dets = set(det_inds)
            assigned_tracks = set(trk_inds)
            unmatched_dets = [d for d in range(N) if d not in assigned_dets]
            unmatched_tracks = [t for t in range(M) if t not in assigned_tracks]

            for det_idx, trk_idx in zip(det_inds, trk_inds):
                # cost = 1 - GIoU, so a cost above 1.2 means GIoU < -0.2:
                # reject such assignments.
                if cost[det_idx, trk_idx] > 1.2:
                    unmatched_dets.append(det_idx)
                    unmatched_tracks.append(trk_idx)
                    continue
                # The detection inherits the identity of the matched track.
                track = results[det_idx]
                track['tracking_id'] = tracks[trk_idx]['tracking_id']
                track['age'] = 1
                track['active'] = 1
                ret.append(track)

        for i in unmatched_dets:
            track = results[i]
            self._start_track(track)
            ret.append(track)

        # ``ret`` keeps growing below; the returned list shares that object,
        # exactly as before.
        curent_track = ret

        # handle the remaining tracks
        ret_unmatched_tracks = []
        for i in unmatched_tracks:
            track = tracks[i]
            if track['age'] < self.max_age:
                track['age'] += 1
                track['active'] = 0
                ret.append(track)
                ret_unmatched_tracks.append(track)

        self.tracks = ret
        self.tracks_dict = results_dict
        self.unmatched_tracks = ret_unmatched_tracks
        return curent_track


================================================
FILE: knet/video/util.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Utilities for bounding box manipulation and GIoU.
"""
import torch
from torchvision.ops.boxes import box_area


def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corners.

    The last dimension of ``x`` must hold the four box values; the result
    has the same shape with the last dimension laid out as (x0, y0, x1, y1).
    """
    center_x, center_y, width, height = x.unbind(-1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = (
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    )
    return torch.stack(corners, dim=-1)


def box_xyxy_to_cxcywh(x):
    """Convert corner boxes (x0, y0, x1, y1) to (cx, cy, width, height).

    The last dimension of ``x`` must hold the four box values; the result
    keeps the input shape.
    """
    left, top, right, bottom = x.unbind(-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return torch.stack((center_x, center_y, width, height), dim=-1)


# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two sets of xyxy boxes, plus the union areas.

    Args:
        boxes1: tensor indexed as [N, 4].
        boxes2: tensor indexed as [M, 4].

    Returns:
        tuple: ``(iou, union)``, both of shape [N, M].
    """
    areas_a = box_area(boxes1)
    areas_b = box_area(boxes2)

    # Corners of the intersection rectangle for every (N, M) pair.
    inter_top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    inter_bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])

    # Non-overlapping pairs yield negative extents; clamp them to zero.
    inter_wh = (inter_bottom_right - inter_top_left).clamp(min=0)
    inter = inter_wh[:, :, 0] * inter_wh[:, :, 1]

    union = areas_a[:, None] + areas_b - inter
    return inter / union, union


def generalized_box_iou(boxes1, boxes2):
    """
    Pairwise Generalized IoU (https://giou.stanford.edu/).

    Both inputs must contain boxes in [x0, y0, x1, y1] format.

    Returns a [N, M] matrix with N = len(boxes1) and M = len(boxes2).
    """
    # Boxes whose max corner lies before their min corner would produce
    # inf / nan values below, so reject them up front.
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    iou, union = box_iou(boxes1, boxes2)

    # Smallest rectangle enclosing each (boxes1[i], boxes2[j]) pair.
    hull_top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    hull_bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    hull_wh = (hull_bottom_right - hull_top_left).clamp(min=0)  # [N,M,2]
    hull_area = hull_wh[:, :, 0] * hull_wh[:, :, 1]

    return iou - (hull_area - union) / hull_area


def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks.

    The masks should be in format [N, H, W] where N is the number of masks
    and (H, W) are the spatial dimensions. Non-zero entries are treated as
    foreground.

    Returns a [N, 4] tensor with the boxes in xyxy format, expressed as the
    min / max foreground pixel indices along each axis.

    Note: a mask without any foreground pixel yields ``x_min = y_min = 1e8``
    and ``x_max = y_max = 0``, i.e. a degenerate box.
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    # Coordinate vectors are created on the same device as ``masks`` so the
    # products below also work for CUDA inputs. Broadcasting against the
    # [N, H, W] masks replaces an explicit (H, W) meshgrid.
    ys = torch.arange(0, h, dtype=torch.float, device=masks.device)[:, None]
    xs = torch.arange(0, w, dtype=torch.float, device=masks.device)[None, :]

    background = ~(masks.bool())

    x_mask = masks * xs
    x_max = x_mask.flatten(1).max(-1)[0]
    # Background pixels are pushed to a huge value so they never win the min.
    x_min = x_mask.masked_fill(background, 1e8).flatten(1).min(-1)[0]

    y_mask = masks * ys
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(background, 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)


================================================
FILE: knet_vis/__init__.py
================================================


================================================
FILE: knet_vis/det/__init__.py
================================================


================================================
FILE: knet_vis/det/kernel_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (ConvModule, bias_init_with_prob, normal_init)
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
from mmdet.models.builder import HEADS, build_loss, build_neck
from mmdet.models.losses import accuracy
from mmdet.utils import get_root_logger


@HEADS.register_module()
class ConvKernelHead(nn.Module):

    def __init__(self,
                 num_proposals=100,
                 in_channels=256,
                 out_channels=256,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_seg_convs=1,
                 num_loc_convs=1,
                 att_dropout=False,
                 localization_fpn=None,
                 conv_kernel_size=1,
                 norm_cfg=dict(type='GN', num_groups=32),
                 semantic_fpn=True,
                 train_cfg=None,
                 num_classes=80,
                 xavier_init_kernel=False,
                 kernel_init_std=0.01,
                 use_binary=False,
                 proposal_feats_with_obj=False,
                 loss_mask=None,
                 loss_seg=None,
                 loss_cls=None,
                 loss_dice=None,
                 loss_rank=None,
                 feat_downsample_stride=1,
                 feat_refine_stride=1,
                 feat_refine=True,
                 with_embed=False,
                 feat_embed_only=False,
                 conv_normal_init=False,
                 mask_out_stride=4,
                 hard_target=False,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cat_stuff_mask=False,
                 **kwargs):
        """Store the configuration and build the head's sub-modules.

        Builds the localization neck from ``localization_fpn``, one loss
        module per non-``None`` loss config, the mask assigner / sampler
        when ``train_cfg`` is given, and finally the convolution layers via
        ``_init_layers``. Extra ``**kwargs`` are accepted and ignored.
        """
        super(ConvKernelHead, self).__init__()

        # Proposal and channel layout.
        self.num_proposals = num_proposals
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_classes = num_classes
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.num_heads = num_heads
        self.num_cls_fcs = num_cls_fcs
        self.num_loc_convs = num_loc_convs
        self.num_seg_convs = num_seg_convs
        self.conv_kernel_size = conv_kernel_size
        self.norm_cfg = norm_cfg

        # Feature branches; the neck is the first sub-module registered.
        self.localization_fpn = build_neck(localization_fpn)
        self.semantic_fpn = semantic_fpn
        self.feat_downsample_stride = feat_downsample_stride
        self.feat_refine_stride = feat_refine_stride
        self.feat_refine = feat_refine
        self.with_embed = with_embed
        self.feat_embed_only = feat_embed_only

        # Initialisation switches.
        self.xavier_init_kernel = xavier_init_kernel
        self.kernel_init_std = kernel_init_std
        self.conv_normal_init = conv_normal_init

        # Behaviour flags.
        self.train_cfg = train_cfg
        self.sampling = False
        self.proposal_feats_with_obj = proposal_feats_with_obj
        self.use_binary = use_binary
        self.att_dropout = att_dropout
        self.hard_target = hard_target
        self.cat_stuff_mask = cat_stuff_mask
        self.mask_out_stride = mask_out_stride
        self.mask_assign_stride = mask_assign_stride
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg

        # A loss config of None leaves the matching attribute as None.
        loss_cfgs = (
            ('loss_mask', loss_mask),
            ('loss_dice', loss_dice),
            ('loss_seg', loss_seg),
            ('loss_cls', loss_cls),
            ('loss_rank', loss_rank),
        )
        for loss_name, loss_cfg in loss_cfgs:
            built = build_loss(loss_cfg) if loss_cfg is not None else loss_cfg
            setattr(self, loss_name, built)

        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # Fall back to the pseudo sampler unless sampling is enabled
            # and the training config provides its own sampler.
            use_cfg_sampler = self.sampling and hasattr(
                self.train_cfg, 'sampler')
            if use_cfg_sampler:
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='MaskPseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)

        self._init_layers()

    def _init_layers(self):
        """Create the convolution layers used by the head.

        Builds, in this order: the kernel conv producing one mask per
        proposal, the optional semantic classifier, the optional pair of
        refinement convs, and the 1x1 conv stacks of the localization and
        semantic branches.
        """
        kernel_size = self.conv_kernel_size
        self.init_kernels = nn.Conv2d(
            self.out_channels,
            self.num_proposals,
            kernel_size,
            padding=int(kernel_size // 2),
            bias=False)

        if self.semantic_fpn:
            # One extra output channel when the seg loss is not sigmoid.
            if self.loss_seg.use_sigmoid:
                seg_channels = self.num_classes
            else:
                seg_channels = self.num_classes + 1
            self.conv_seg = nn.Conv2d(self.out_channels, seg_channels, 1)

        if self.feat_downsample_stride > 1 and self.feat_refine:
            refine_kwargs = dict(
                stride=self.feat_refine_stride,
                padding=1,
                norm_cfg=self.norm_cfg)
            self.ins_downsample = ConvModule(self.in_channels,
                                             self.out_channels, 3,
                                             **refine_kwargs)
            self.seg_downsample = ConvModule(self.in_channels,
                                             self.out_channels, 3,
                                             **refine_kwargs)

        self.loc_convs = nn.ModuleList([
            ConvModule(
                self.in_channels, self.out_channels, 1, norm_cfg=self.norm_cfg)
            for _ in range(self.num_loc_convs)
        ])

        self.seg_convs = nn.ModuleList([
            ConvModule(
                self.in_channels, self.out_channels, 1, norm_cfg=self.norm_cfg)
            for _ in range(self.num_seg_convs)
        ])

    def init_weights(self):
        """Initialise the neck, branch convs, seg classifier and kernels."""
        self.localization_fpn.init_weights()

        if self.feat_downsample_stride > 1 and self.conv_normal_init:
            get_root_logger().info(
                'Initialize convs in KPN head by normal std 0.01')
            for branch in (self.loc_convs, self.seg_convs):
                for layer in branch.modules():
                    if not isinstance(layer, nn.Conv2d):
                        continue
                    normal_init(layer, std=0.01)

        if self.semantic_fpn:
            seg_bias = bias_init_with_prob(0.01)
            if self.loss_seg.use_sigmoid:
                normal_init(self.conv_seg, std=0.01, bias=seg_bias)
            else:
                normal_init(self.conv_seg, mean=0, std=0.01)

        root_logger = get_root_logger()
        if self.xavier_init_kernel:
            root_logger.info('Initialize kernels by xavier uniform')
            nn.init.xavier_uniform_(self.init_kernels.weight)
        else:
            root_logger.info(
                f'Initialize kernels by normal std: {self.kernel_init_std}')
            normal_init(self.init_kernels, mean=0, std=self.kernel_init_std)

    def _decode_init_proposals(self, img, img_metas):
        """Produce the initial kernels, features and mask predictions.

        Args:
            img: input forwarded to the localization neck.
            img_metas: per-image meta information; only its length (the
                number of images) is used here.

        Returns:
            tuple: ``(proposal_feats, x_feats, mask_preds, cls_scores,
            seg_preds)``. ``cls_scores`` is always ``None`` and
            ``seg_preds`` is ``None`` when no semantic branch is used.
        """
        batch_size = len(img_metas)
        refine = self.feat_downsample_stride > 1 and self.feat_refine

        fpn_out = self.localization_fpn(img)
        has_two_outputs = isinstance(fpn_out, list)

        # Localization branch -> initial mask predictions.
        loc_feats = fpn_out[0] if has_two_outputs else fpn_out
        for layer in self.loc_convs:
            loc_feats = layer(loc_feats)
        if refine:
            loc_feats = self.ins_downsample(loc_feats)
        mask_preds = self.init_kernels(loc_feats)

        # Semantic branch (optional).
        semantic_feats = None
        if self.semantic_fpn:
            semantic_feats = fpn_out[1] if has_two_outputs else fpn_out
            for layer in self.seg_convs:
                semantic_feats = layer(semantic_feats)
            if refine:
                semantic_feats = self.seg_downsample(semantic_feats)

        seg_preds = None
        if semantic_feats is not None:
            seg_preds = self.conv_seg(semantic_feats)

        # The kernel conv weights, repeated per image, are the proposals.
        kernels = self.init_kernels.weight.clone()
        proposal_feats = kernels[None].expand(batch_size, *kernels.size())

        x_feats = loc_feats
        if semantic_feats is not None:
            x_feats = semantic_feats + loc_feats

        cls_scores = None

        if self.proposal_feats_with_obj:
            # Pool features under each thresholded mask and add the pooled
            # vector to the corresponding kernel.
            soft_masks = mask_preds.sigmoid()
            confident = (soft_masks > 0.5).float()
            weights = confident if self.use_binary else confident * soft_masks
            obj_feats = torch.einsum('bnhw,bchw->bnc', weights, x_feats)
            proposal_feats = proposal_feats + obj_feats.view(
                batch_size, self.num_proposals, self.out_channels, 1, 1)

        if self.cat_stuff_mask and not self.training:
            # At inference, append the stuff-class seg maps and their
            # classifier weights as extra masks / kernels.
            first_stuff = self.num_thing_classes
            mask_preds = torch.cat([mask_preds, seg_preds[:, first_stuff:]],
                                   dim=1)
            stuff_kernels = self.conv_seg.weight[first_stuff:].clone()
            stuff_kernels = stuff_kernels[None].expand(batch_size,
                                                       *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return proposal_feats, x_feats, mask_preds, cls_scores, seg_preds

    def forward_train(self,
                      img,
                      img_metas,
                      gt_masks,
                      gt_labels,
                      gt_sem_seg=None,
                      gt_sem_cls=None):
        """Forward function in training stage.

        Decodes the initial proposals, assigns them to the ground-truth
        masks image by image, and computes the head losses.

        Args:
            img: input forwarded to the localization neck.
            img_metas: per-image meta information.
            gt_masks: per-image ground-truth masks.
            gt_labels: per-image ground-truth labels.
            gt_sem_seg: optional per-image semantic masks.
            gt_sem_cls: optional per-image semantic class ids.

        Returns:
            tuple: ``(losses, proposal_feats, x_feats, mask_preds,
            cls_scores)``. When ``cat_stuff_mask`` is set, the stuff-class
            seg maps and classifier weights are appended to ``mask_preds``
            and ``proposal_feats``.
        """
        num_imgs = len(img_metas)
        results = self._decode_init_proposals(img, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = results

        # Default to the unscaled predictions so ``scaled_seg_preds`` is
        # always bound, including when there is no semantic branch
        # (``seg_preds is None``) and ``feat_downsample_stride > 1``.
        scaled_mask_preds = mask_preds
        scaled_seg_preds = seg_preds
        if self.feat_downsample_stride > 1:
            scaled_mask_preds = F.interpolate(
                mask_preds,
                scale_factor=self.feat_downsample_stride,
                mode='bilinear',
                align_corners=False)
            if seg_preds is not None:
                scaled_seg_preds = F.interpolate(
                    seg_preds,
                    scale_factor=self.feat_downsample_stride,
                    mode='bilinear',
                    align_corners=False)

        if self.hard_target:
            # Binarise the ground-truth masks.
            gt_masks = [x.bool().float() for x in gt_masks]

        sampling_results = []
        if cls_scores is None:
            detached_cls_scores = [None] * num_imgs
        else:
            detached_cls_scores = cls_scores.detach()

        for i in range(num_imgs):
            # Assignment works on detached predictions; the sampler gets
            # the attached ones.
            assign_result = self.assigner.assign(scaled_mask_preds[i].detach(),
                                                 detached_cls_scores[i],
                                                 gt_masks[i], gt_labels[i],
                                                 img_metas[i])
            sampling_result = self.sampler.sample(assign_result,
                                                  scaled_mask_preds[i],
                                                  gt_masks[i])
            sampling_results.append(sampling_result)

        mask_targets = self.get_targets(
            sampling_results,
            gt_masks,
            self.train_cfg,
            True,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls)

        losses = self.loss(scaled_mask_preds, cls_scores, scaled_seg_preds,
                           proposal_feats, *mask_targets)

        if self.cat_stuff_mask and self.training:
            mask_preds = torch.cat(
                [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
            stuff_kernels = self.conv_seg.weight[self.
                                                 num_thing_classes:].clone()
            stuff_kernels = stuff_kernels[None].expand(num_imgs,
                                                       *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return losses, proposal_feats, x_feats, mask_preds, cls_scores

    def loss(self,
             mask_pred,
             cls_scores,
             seg_preds,
             proposal_feats,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             seg_targets,
             reduction_override=None,
             **kwargs):
        """Compute the losses of the initial kernel predictions.

        Args:
            mask_pred: mask logits; the first two dimensions are flattened
                into one prediction axis and the last two are (H, W).
            cls_scores: classification scores or ``None``.
            seg_preds: semantic segmentation logits or ``None``.
            proposal_feats: unused here.
            labels: flattened per-prediction labels; values in
                ``[0, num_classes)`` mark positives.
            label_weights: per-prediction classification weights.
            mask_targets: per-prediction mask targets.
            mask_weights: unused here.
            seg_targets: semantic segmentation targets.
            reduction_override: forwarded to the classification loss.

        Returns:
            dict: ``loss_rpn_mask`` and ``loss_rpn_dice`` always; plus
            ``loss_rpn_cls`` / ``rpn_pos_acc`` when ``cls_scores`` is given,
            ``loss_rpn_rank`` when a rank loss is configured, and
            ``loss_rpn_seg`` when ``seg_preds`` is given.
        """
        losses = dict()
        bg_class_ind = self.num_classes
        # note in spare rcnn num_gt == num_pos
        pos_inds = (labels >= 0) & (labels < bg_class_ind)
        num_preds = mask_pred.shape[0] * mask_pred.shape[1]

        if cls_scores is not None:
            num_pos = pos_inds.sum().float()
            avg_factor = reduce_mean(num_pos)
            assert mask_pred.shape[0] == cls_scores.shape[0]
            assert mask_pred.shape[1] == cls_scores.shape[1]
            losses['loss_rpn_cls'] = self.loss_cls(
                cls_scores.view(num_preds, -1),
                labels,
                label_weights,
                avg_factor=avg_factor,
                reduction_override=reduction_override)
            losses['rpn_pos_acc'] = accuracy(
                cls_scores.view(num_preds, -1)[pos_inds], labels[pos_inds])

        # 0~self.num_classes-1 are FG, self.num_classes is BG; mask losses
        # are only computed on FG predictions.
        H, W = mask_pred.shape[-2:]
        if pos_inds.any():
            pos_mask_pred = mask_pred.reshape(num_preds, H, W)[pos_inds]
            pos_mask_targets = mask_targets[pos_inds]
            losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred,
                                                     pos_mask_targets)
            losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred,
                                                     pos_mask_targets)

            if self.loss_rank is not None:
                # Per-pixel target: the index of the positive proposal
                # whose target mask covers the pixel, else ignore_label.
                batch_size = mask_pred.size(0)
                rank_target = mask_targets.new_full((batch_size, H, W),
                                                    self.ignore_label,
                                                    dtype=torch.long)
                rank_inds = pos_inds.view(batch_size,
                                          -1).nonzero(as_tuple=False)
                batch_mask_targets = mask_targets.view(batch_size, -1, H,
                                                       W).bool()
                for i in range(batch_size):
                    curr_inds = (rank_inds[:, 0] == i)
                    curr_rank = rank_inds[:, 1][curr_inds]
                    for j in curr_rank:
                        rank_target[i][batch_mask_targets[i][j]] = j
                losses['loss_rpn_rank'] = self.loss_rank(
                    mask_pred, rank_target, ignore_index=self.ignore_label)

        else:
            # No positives: emit zero losses that still depend on the
            # predictions so every loss key keeps a gradient path.
            losses['loss_rpn_mask'] = mask_pred.sum() * 0
            losses['loss_rpn_dice'] = mask_pred.sum() * 0
            if self.loss_rank is not None:
                # Same key as the positive branch so the set of loss keys
                # does not change between iterations.
                losses['loss_rpn_rank'] = mask_pred.sum() * 0

        if seg_preds is not None:
            # Flatten to one row of class logits per pixel.
            cls_channel = seg_preds.shape[1]
            flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute(
                0, 2, 1).reshape(-1, cls_channel)
            flatten_seg_target = seg_targets.view(-1)
            if self.loss_seg.use_sigmoid:
                # Normalise by the number of foreground pixels (at least 1).
                num_dense_pos = (flatten_seg_target >= 0) & (
                    flatten_seg_target < bg_class_ind)
                num_dense_pos = num_dense_pos.sum().float().clamp(min=1.0)
                losses['loss_rpn_seg'] = self.loss_seg(
                    flatten_seg,
                    flatten_seg_target,
                    avg_factor=num_dense_pos)
            else:
                losses['loss_rpn_seg'] = self.loss_seg(flatten_seg,
                                                       flatten_seg_target)

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):
        """Build the training targets for one image.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights,
            seg_targets)``. The first four have one entry per sampled
            prediction (positives plus negatives); ``seg_targets`` is an
            (H, W) label map.
        """
        positives = pos_mask.size(0)
        negatives = neg_mask.size(0)
        total = positives + negatives
        height, width = pos_mask.shape[-2:]
        background = self.num_classes

        # Everything starts as background (label == num_classes); valid
        # foreground labels are [0, num_classes - 1].
        labels = pos_mask.new_full((total, ), background, dtype=torch.long)
        label_weights = pos_mask.new_zeros(total)
        mask_targets = pos_mask.new_zeros(total, height, width)
        mask_weights = pos_mask.new_zeros(total, height, width)
        seg_targets = pos_mask.new_full((height, width),
                                        background,
                                        dtype=torch.long)

        # Paint semantic regions first; instance masks below overwrite them.
        if gt_sem_cls is not None and gt_sem_seg is not None:
            gt_sem_seg = gt_sem_seg.bool()
            for region, region_cls in zip(gt_sem_seg, gt_sem_cls):
                seg_targets[region] = region_cls.long()

        if positives > 0:
            labels[pos_inds] = pos_gt_labels
            # A non-positive configured weight means "use 1.0".
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            mask_targets[pos_inds, ...] = pos_gt_mask
            mask_weights[pos_inds, ...] = 1
            for idx in range(positives):
                instance_region = pos_gt_mask[idx].bool()
                seg_targets[instance_region] = pos_gt_labels[idx]

        if negatives > 0:
            label_weights[neg_inds] = 1.0

        return labels, label_weights, mask_targets, mask_weights, seg_targets

    def get_targets(self,
                    sampling_results,
                    gt_mask,
                    rpn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None):
        """Compute training targets for a batch of sampling results.

        Args:
            sampling_results (list): one sampling result per image.
            gt_mask: unused; kept for interface compatibility.
            rpn_train_cfg: training config passed to every per-image call.
            concat (bool): whether to merge the per-image targets into
                batch tensors.
            gt_sem_seg (list, optional): per-image semantic masks.
            gt_sem_cls (list, optional): per-image semantic class ids.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights,
            seg_targets)``; per-image sequences when ``concat`` is False,
            otherwise tensors concatenated (``seg_targets``: stacked) along
            dimension 0.
        """
        num_imgs = len(sampling_results)
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
        # Semantic targets need both inputs; if either is missing, treat
        # both as absent so no bare ``None`` is handed to ``multi_apply``,
        # which iterates every positional argument.
        if gt_sem_seg is None or gt_sem_cls is None:
            gt_sem_seg = [None] * num_imgs
            gt_sem_cls = [None] * num_imgs
        results = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_mask_list,
            pos_gt_labels_list,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rpn_train_cfg)
        (labels, label_weights, mask_targets, mask_weights,
         seg_targets) = results
        if concat:
            labels = torch.cat(labels, 0)
            label_weights = torch.cat(label_weights, 0)
            mask_targets = torch.cat(mask_targets, 0)
            mask_weights = torch.cat(mask_weights, 0)
            # seg_targets are per-image (H, W) maps, hence stack, not cat.
            seg_targets = torch.stack(seg_targets, 0)
        return labels, label_weights, mask_targets, mask_weights, seg_targets

    def simple_test_rpn(self, img, img_metas):
        """Decode the initial proposals for inference.

        Returns the tuple produced by ``_decode_init_proposals`` unchanged.
        """
        decoded = self._decode_init_proposals(img, img_metas)
        return decoded

    def forward_dummy(self, img, img_metas):
        """Run the proposal decoding only (used for FLOPs calculation).

        Returns the tuple produced by ``_decode_init_proposals`` unchanged.
        """
        decoded = self._decode_init_proposals(img, img_metas)
        return decoded


================================================
FILE: knet_vis/det/kernel_iter_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from mmdet.core import build_assigner, build_sampler
from mmdet.datasets.coco_panoptic import INSTANCE_OFFSET
from mmdet.models.builder import HEADS, build_head
from mmdet.models.roi_heads import BaseRoIHead
from .mask_pseudo_sampler import MaskPseudoSampler


@HEADS.register_module()
class KernelIterHead(BaseRoIHead):

    def __init__(self,
                 num_stages=6,
                 recursive=False,
                 assign_stages=5,
                 stage_loss_weights=(1, 1, 1, 1, 1, 1),
                 proposal_feature_channel=256,
                 merge_cls_scores=False,
                 do_panoptic=False,
                 post_assign=False,
                 hard_target=False,
                 num_proposals=100,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 thing_label_in_seg=0,
                 mask_head=dict(
                     type='KernelUpdateHead',
                     num_classes=80,
                     num_fcs=2,
                     num_heads=8,
                     num_cls_fcs=1,
                     num_reg_fcs=3,
                     feedforward_channels=2048,
                     hidden_channels=256,
                     dropout=0.0,
                     roi_feat_size=7,
                     ffn_act_cfg=dict(type='ReLU', inplace=True)),
                 mask_out_stride=4,
                 train_cfg=None,
                 test_cfg=None,
                 **kwargs):
        """Store the stage configuration and build the per-stage heads.

        ``stage_loss_weights`` must provide one weight per stage. When a
        training config is given, every stage sampler must be a
        ``MaskPseudoSampler``.
        """
        assert mask_head is not None
        assert len(stage_loss_weights) == num_stages

        # Set the configuration before calling the parent constructor:
        # ``init_mask_head`` reads ``num_stages`` and ``recursive`` and is
        # presumably invoked from there.
        # Stage layout.
        self.num_stages = num_stages
        self.recursive = recursive
        self.assign_stages = assign_stages
        self.post_assign = post_assign
        self.stage_loss_weights = stage_loss_weights

        # Class / proposal layout.
        self.num_proposals = num_proposals
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.num_classes = num_thing_classes + num_stuff_classes
        self.proposal_feature_channel = proposal_feature_channel
        self.thing_label_in_seg = thing_label_in_seg

        # Behaviour flags and strides.
        self.merge_cls_scores = merge_cls_scores
        self.do_panoptic = do_panoptic
        self.hard_target = hard_target
        self.mask_out_stride = mask_out_stride
        self.mask_assign_stride = mask_assign_stride

        super(KernelIterHead, self).__init__(
            mask_head=mask_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            **kwargs)

        # train_cfg is None when running test.py, in which case no
        # samplers were built and there is nothing to validate.
        if train_cfg is not None:
            for stage_idx in range(num_stages):
                stage_sampler = self.mask_sampler[stage_idx]
                assert isinstance(stage_sampler, MaskPseudoSampler), \
                    'Sparse Mask only support `MaskPseudoSampler`'

    def init_bbox_head(self, mask_roi_extractor, mask_head):
        """Do nothing: this head builds no box head or box RoI extractor.

        Args:
            mask_roi_extractor (dict): Ignored.
            mask_head (dict): Ignored.
        """
        return None

    def init_assigner_sampler(self):
        """Build one mask assigner and one mask sampler per training stage.

        Both lists stay empty when no training config is available.
        """
        self.mask_assigner = []
        self.mask_sampler = []
        if self.train_cfg is None:
            return
        for stage_idx, stage_cfg in enumerate(self.train_cfg):
            assigner = build_assigner(stage_cfg.assigner)
            self.mask_assigner.append(assigner)
            # Record the stage before building the sampler, which receives
            # this head as its context.
            self.current_stage = stage_idx
            sampler = build_sampler(stage_cfg.sampler, context=self)
            self.mask_sampler.append(sampler)

    def init_weights(self):
        """Initialise the weights of the mask head of every stage."""
        stage_heads = self.mask_head
        for stage_idx in range(self.num_stages):
            stage_heads[stage_idx].init_weights()

    def init_mask_head(self, mask_roi_extractor, mask_head):
        """Build the mask head of every stage.

        Args:
            mask_roi_extractor (dict): Not used by this method.
            mask_head (dict | list[dict]): A single head config shared by
                all stages, or one config per stage.
        """
        self.mask_head = nn.ModuleList()

        if isinstance(mask_head, list):
            stage_cfgs = mask_head
        else:
            stage_cfgs = [mask_head] * self.num_stages
        assert len(stage_cfgs) == self.num_stages

        for stage_cfg in stage_cfgs:
            self.mask_head.append(build_head(stage_cfg))

        if self.recursive:
            # Every stage reuses the module (and weights) of stage 0.
            for stage_idx in range(self.num_stages):
                self.mask_head[stage_idx] = self.mask_head[0]

    def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas):
        """Run the mask head of one stage.

        Returns:
            dict: ``cls_score``, ``mask_preds``, ``scaled_mask_preds`` and
            ``object_feats``. ``scaled_mask_preds`` is the bilinearly
            upsampled ``mask_preds`` when the head's upsample stride is
            above 1 and this is either training or the last stage;
            otherwise it is ``mask_preds`` itself.
        """
        head = self.mask_head[stage]
        cls_score, mask_preds, object_feats = head(
            x, object_feats, mask_preds, img_metas=img_metas)

        is_last_stage = stage == self.num_stages - 1
        upsample = head.mask_upsample_stride > 1 and (is_last_stage
                                                      or self.training)
        if upsample:
            scaled_mask_preds = F.interpolate(
                mask_preds,
                scale_factor=head.mask_upsample_stride,
                align_corners=False,
                mode='bilinear')
        else:
            scaled_mask_preds = mask_preds

        return dict(
            cls_score=cls_score,
            mask_preds=mask_preds,
            scaled_mask_preds=scaled_mask_preds,
            object_feats=object_feats)

    def forward_train(self,
                      x,
                      proposal_feats,
                      mask_preds,
                      cls_score,
                      img_metas,
                      gt_masks,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      imgs_whwh=None,
                      gt_bboxes=None,
                      gt_sem_seg=None,
                      gt_sem_cls=None):
        """Run all stages in training mode and collect their losses.

        For every stage the mask head refines the kernels and masks,
        predictions are matched to the ground truth, and the stage loss is
        computed and scaled by ``stage_loss_weights[stage]``.

        Args:
            x: feature map passed to every stage head.
            proposal_feats: initial kernels (object features).
            mask_preds: initial mask predictions.
            cls_score: initial classification scores or ``None``.
            img_metas: per-image meta information.
            gt_masks: per-image ground-truth masks.
            gt_labels: per-image ground-truth labels.
            gt_bboxes_ignore: not used in this method.
            imgs_whwh: forwarded to each stage's ``loss``.
            gt_bboxes: not used in this method.
            gt_sem_seg: forwarded to each stage's ``get_targets``.
            gt_sem_cls: forwarded to each stage's ``get_targets``.

        Returns:
            dict: weighted losses of all stages, keyed ``s{stage}_{name}``.
        """

        num_imgs = len(img_metas)
        # Predictions used for assignment in the first stage: the detached
        # initial masks, upsampled when the first head upsamples its masks.
        if self.mask_head[0].mask_upsample_stride > 1:
            prev_mask_preds = F.interpolate(
                mask_preds.detach(),
                scale_factor=self.mask_head[0].mask_upsample_stride,
                mode='bilinear',
                align_corners=False)
        else:
            prev_mask_preds = mask_preds.detach()

        if cls_score is not None:
            prev_cls_score = cls_score.detach()
        else:
            # One placeholder per image so prev_cls_score[i] stays valid.
            prev_cls_score = [None] * num_imgs

        if self.hard_target:
            # Binarise the ground-truth masks.
            gt_masks = [x.bool().float() for x in gt_masks]
        else:
            gt_masks = gt_masks

        object_feats = proposal_feats
        all_stage_loss = {}
        all_stage_mask_results = []
        assign_results = []
        for stage in range(self.num_stages):
            mask_results = self._mask_forward(stage, x, object_feats,
                                              mask_preds, img_metas)
            all_stage_mask_results.append(mask_results)
            mask_preds = mask_results['mask_preds']
            scaled_mask_preds = mask_results['scaled_mask_preds']
            cls_score = mask_results['cls_score']
            object_feats = mask_results['object_feats']

            if self.post_assign:
                # Assign with this stage's own (detached) predictions
                # instead of the previous stage's.
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

            sampling_results = []
            # Stages at or beyond assign_stages keep the assignment
            # computed by the last assigning stage.
            if stage < self.assign_stages:
                assign_results = []
            for i in range(num_imgs):
                if stage < self.assign_stages:
                    # Only the first num_proposals masks and the first
                    # num_thing_classes score columns are matched.
                    mask_for_assign = prev_mask_preds[i][:self.num_proposals]
                    if prev_cls_score[i] is not None:
                        cls_for_assign = prev_cls_score[
                            i][:self.num_proposals, :self.num_thing_classes]
                    else:
                        cls_for_assign = None
                    assign_result = self.mask_assigner[stage].assign(
                        mask_for_assign, cls_for_assign, gt_masks[i],
                        gt_labels[i], img_metas[i])
                    assign_results.append(assign_result)
                sampling_result = self.mask_sampler[stage].sample(
                    assign_results[i], scaled_mask_preds[i], gt_masks[i])
                sampling_results.append(sampling_result)
            mask_targets = self.mask_head[stage].get_targets(
                sampling_results,
                self.train_cfg[stage],
                True,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls)

            single_stage_loss = self.mask_head[stage].loss(
                object_feats,
                cls_score,
                scaled_mask_preds,
                *mask_targets,
                imgs_whwh=imgs_whwh)
            # Prefix each loss with its stage and apply the stage weight.
            for key, value in single_stage_loss.items():
                all_stage_loss[f's{stage}_{key}'] = value * \
                                    self.stage_loss_weights[stage]

            if not self.post_assign:
                # These predictions drive the next stage's assignment.
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

        return all_stage_loss

    def simple_test(self,
                    x,
                    proposal_feats,
                    mask_preds,
                    cls_score,
                    img_metas,
                    imgs_whwh=None,
                    rescale=False):
        """Run all refinement stages and decode the last stage's output.

        Args:
            x: Feature input, forwarded unchanged to ``self._mask_forward``
                at every stage.
            proposal_feats: Initial object features fed to the first stage.
            mask_preds: Initial mask predictions fed to the first stage.
            cls_score: Initial classification scores. They are replaced by
                the scores of each stage, so only the last stage's scores
                are decoded.
            img_metas (list): Per-image meta information; its length is
                taken as the number of images.
            imgs_whwh: Unused; kept for interface compatibility.
            rescale (bool): Unused; kept for interface compatibility.

        Returns:
            list: One entry per image. When ``self.do_panoptic`` is set the
            entry is the result of ``self.get_panoptic``; otherwise it is
            whatever ``self.mask_head[-1].get_seg_masks`` returns for the
            top-scoring (kernel, class) pairs.
        """
        num_imgs = len(img_metas)

        # Each stage consumes the object features and masks of the previous
        # one; only the outputs of the final stage are decoded below.
        object_feats = proposal_feats
        for stage in range(self.num_stages):
            mask_results = self._mask_forward(stage, x, object_feats,
                                              mask_preds, img_metas)
            object_feats = mask_results['object_feats']
            cls_score = mask_results['cls_score']
            mask_preds = mask_results['mask_preds']
            scaled_mask_preds = mask_results['scaled_mask_preds']

        last_head = self.mask_head[-1]
        num_classes = last_head.num_classes

        if last_head.loss_cls.use_sigmoid:
            cls_score = cls_score.sigmoid()
        else:
            # the last softmax channel is dropped before decoding
            cls_score = cls_score.softmax(-1)[..., :-1]

        results = []
        if self.do_panoptic:
            for img_id in range(num_imgs):
                single_result = self.get_panoptic(cls_score[img_id],
                                                  scaled_mask_preds[img_id],
                                                  self.test_cfg,
                                                  img_metas[img_id])
                results.append(single_result)
            return results

        for img_id in range(num_imgs):
            # A single kernel may yield several results with different
            # labels, so the top-k is taken over all (kernel, class) pairs.
            flat_scores = cls_score[img_id].flatten(0, 1)
            # ``topk`` raises when k exceeds the number of candidates, so
            # never ask for more pairs than actually exist.
            num_keep = min(self.test_cfg.max_per_img, flat_scores.numel())
            scores_per_img, topk_indices = flat_scores.topk(
                num_keep, sorted=True)
            mask_indices = topk_indices // num_classes
            # Use the following when torch >= 1.9.0
            # mask_indices = torch.div(topk_indices, num_classes, rounding_mode='trunc')
            labels_per_img = topk_indices % num_classes
            masks_per_img = scaled_mask_preds[img_id][mask_indices]
            single_result = last_head.get_seg_masks(
                masks_per_img, labels_per_img, scores_per_img,
                self.test_cfg, img_metas[img_id])
            results.append(single_result)
        return results

    def aug_test(self, features, proposal_list, img_metas, rescale=False):
        """Test-time augmentation entry point; not supported by this head.

        Raises:
            NotImplementedError: Always, regardless of the arguments.
        """
        message = 'SparseMask does not support `aug_test`'
        raise NotImplementedError(message)

    def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
        """Run every stage once on dummy inputs (used for FLOPs counting).

        The initial mask predictions are the batched matrix product of
        ``proposal_feats`` with the spatially flattened ``x``. Every stage
        is fed the same initial ``proposal_feats`` and mask predictions;
        stage outputs are collected but not chained into the next stage.

        Args:
            x: Feature map; its last three dims are read as (C, H, W).
            proposal_boxes: Unused.
            proposal_feats: Object features; must be usable as the left
                operand of ``bmm`` against ``x`` viewed as (num_imgs, C, H*W).
            img_metas (list): Per-image meta information; its length is
                taken as the number of images.

        Returns:
            list: The result of ``self._mask_forward`` for each stage.
        """
        batch = len(img_metas)
        channels, height, width = x.shape[-3:]
        n_kernels = proposal_feats.size(1)

        flat_x = x.view(batch, channels, -1)
        mask_preds = proposal_feats.bmm(flat_x).view(batch, n_kernels, height,
                                                     width)
        return [
            self._mask_forward(stage, x, proposal_feats, mask_preds,
                               img_metas)
            for stage in range(self.num_stages)
        ]

    def get_panoptic(self, cls_scores, mask_preds, test_cfg, img_meta):
        """Build the panoptic result of one image.

        The first ``self.num_proposals`` rows of ``cls_scores`` are treated
        as thing kernels and scored over the first ``self.num_thing_classes``
        columns. The remaining rows are treated as stuff kernels, where the
        i-th row is scored by the i-th stuff column (the diagonal).

        Args:
            cls_scores: 2-D score matrix of one image (rows: kernels,
                columns: classes).
            mask_preds: Mask predictions of one image, passed to
                ``self.mask_head[-1].rescale_masks``.
            test_cfg: Test config; its ``merge_stuff_thing`` attribute is
                forwarded to ``self.merge_stuff_thing``.
            img_meta: Meta information of the image.

        Returns:
            dict: ``{'pan_results': <output of self.merge_stuff_thing>}``.
        """
        n_things = self.num_thing_classes
        n_props = self.num_proposals

        # things: best class per kernel
        thing_scores, thing_labels = cls_scores[:n_props, :n_things].max(
            dim=1)

        # stuff: kernel i is tied to stuff class i
        stuff_scores = cls_scores[n_props:, n_things:].diag()
        stuff_labels = torch.arange(
            n_things, n_things + self.num_stuff_classes).to(
                thing_labels.device)

        # resize mask predictions back
        total_masks = self.mask_head[-1].rescale_masks(mask_preds, img_meta)
        total_scores = torch.cat((thing_scores, stuff_scores), dim=0)
        total_labels = torch.cat((thing_labels, stuff_labels), dim=0)

        pan = self.merge_stuff_thing(total_masks, total_labels, total_scores,
                                     test_cfg.merge_stuff_thing)
        return {'pan_results': pan}

    def merge_stuff_thing(self,
                          total_masks,
                          total_labels,
                          total_scores,
                          merge_cfg=None):
        """Fuse scored masks into a single panoptic id map.

        Every pixel is first attributed to the mask with the highest
        ``score * mask value``. Masks are then visited from highest to lowest
        score; a mask writes ``label + segment_index * INSTANCE_OFFSET`` to
        the pixels it won unless it is filtered out.

        Args:
            total_masks: Stacked masks; the last two dims are (H, W).
            total_labels: Class label of each mask.
            total_scores: Score of each mask.
            merge_cfg: Config providing ``instance_score_thr`` (minimum
                score for masks whose label is below
                ``self.num_thing_classes``) and ``overlap_thr`` (minimum
                ratio between the pixels a mask won and the pixels where
                its value is >= 0.5).

        Returns:
            numpy.ndarray: (H, W) integer map; pixels claimed by no mask
            keep the value ``self.num_classes``.
        """
        height, width = total_masks.shape[-2:]
        pan_map = total_masks.new_full((height, width),
                                       self.num_classes,
                                       dtype=torch.long)

        # per-pixel index of the winning mask
        winner_ids = (total_scores.view(-1, 1, 1) * total_masks).argmax(0)

        next_segment = 0
        # visit masks from the most to the least confident
        for idx in torch.argsort(-total_scores):
            label = total_labels[idx]
            is_thing = label.item() < self.num_thing_classes
            if is_thing and total_scores[idx] < merge_cfg.instance_score_thr:
                continue

            owned = winner_ids == idx
            owned_area = owned.sum().item()
            full_area = (total_masks[idx] >= 0.5).sum().item()

            if owned_area <= 0 or full_area <= 0:
                continue
            if owned_area / full_area < merge_cfg.overlap_thr:
                continue

            pan_map[owned] = label + next_segment * INSTANCE_OFFSET
            next_segment += 1

        return pan_map.cpu().numpy()


================================================
FILE: knet_vis/det/kernel_update_head.py
================================================
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (ConvModule, bias_init_with_prob, build_activation_layer,
                      build_norm_layer)
from mmcv.cnn.bricks.transformer import (FFN, MultiheadAttention,
                                         build_transformer_layer)
from mmcv.runner import force_fp32

from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.dense_heads.atss_head import reduce_mean
from mmdet.models.losses import accuracy
from mmdet.utils import get_root_logger

from mmtrack.transform import outs2results

@HEADS.register_module()
class KernelUpdateHead(nn.Module):
    """One kernel-update stage: refine object kernels, then re-predict masks.

    Given a feature map, the current object kernels and the current mask
    predictions, the head gathers per-object features with the binarised
    masks, updates the kernels (``kernel_update_conv`` -> self-attention ->
    optional FFN) and predicts new class scores and new masks by convolving
    the feature map with the updated kernels.

    Args:
        num_classes (int): Number of foreground classes. Also used as the
            background label when building targets.
        num_ffn_fcs (int): Number of fully connected layers in the FFN.
        num_heads (int): Number of attention heads.
        num_cls_fcs (int): Number of (Linear, LN, activation) groups before
            the classification layer.
        num_mask_fcs (int): Number of (Linear, LN, activation) groups before
            the mask-kernel layer.
        feedforward_channels (int): Hidden width of the FFN.
        in_channels (int): Channel width of kernels and features.
        out_channels (int): Output width of ``fc_mask``. ``forward`` reshapes
            the produced kernels with the feature channel count, so the two
            have to agree.
        dropout (float): Dropout used by the attention and the FFN.
        mask_thr (float): Stored only; not read by this class.
        act_cfg (dict): Activation config for the cls/mask FC groups.
        ffn_act_cfg (dict): Activation config for the FFN.
        conv_kernel_size (int): Spatial size K of each mask kernel.
        feat_transform_cfg (dict | None): If given, extra keyword arguments
            for a ``ConvModule`` applied to the features first. An optional
            ``kernel_size`` key (default 1) is read from it. The dict passed
            in is not modified.
        hard_mask_thr (float): Threshold applied to the sigmoid of the input
            masks when gathering features.
        kernel_init (bool): If True, ``init_weights`` re-initialises
            ``fc_mask.weight`` from a normal distribution with std 0.01.
        with_ffn (bool): Whether to apply the FFN after attention.
        mask_out_stride, relative_coors, relative_coors_off,
        mask_upsample_stride, mask_assign_stride, thing_label_in_seg:
            Stored only; not read by this class.
        feat_gather_stride (int): Stride (and half of it as padding) of the
            optional feature transform.
        mask_transform_stride (int): If 2, new masks are upsampled by 2.
            The combination with ``feat_gather_stride == 1`` is not
            implemented.
        num_thing_classes (int): Number of thing classes; used when building
            classification weights.
        num_stuff_classes (int): Number of stuff classes; one extra target
            slot per stuff class is appended when semantic ground truth is
            supplied.
        ignore_label (int): Ignore value for the rank-loss target.
        kernel_updator_cfg (dict): Config of the kernel update module.
        loss_rank (dict | None): Optional rank-loss config.
        loss_mask (dict): Mask-loss config.
        loss_dice (dict): Dice-loss config.
        loss_cls (dict): Classification-loss config.
    """

    def __init__(self,
                 num_classes=80,
                 num_ffn_fcs=2,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_mask_fcs=3,
                 feedforward_channels=2048,
                 in_channels=256,
                 out_channels=256,
                 dropout=0.0,
                 mask_thr=0.5,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_act_cfg=dict(type='ReLU', inplace=True),
                 conv_kernel_size=3,
                 feat_transform_cfg=None,
                 hard_mask_thr=0.5,
                 kernel_init=False,
                 with_ffn=True,
                 mask_out_stride=4,
                 relative_coors=False,
                 relative_coors_off=False,
                 feat_gather_stride=1,
                 mask_transform_stride=1,
                 mask_upsample_stride=1,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 kernel_updator_cfg=dict(
                     type='DynamicConv',
                     in_channels=256,
                     feat_channels=64,
                     out_channels=256,
                     input_feat_shape=1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN')),
                 loss_rank=None,
                 loss_mask=dict(
                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
                 loss_dice=dict(type='DiceLoss', loss_weight=3.0),
                 loss_cls=dict(
                     type='FocalLoss',
                     use_sigmoid=True,
                     gamma=2.0,
                     alpha=0.25,
                     loss_weight=2.0)):
        super(KernelUpdateHead, self).__init__()
        self.num_classes = num_classes
        self.loss_cls = build_loss(loss_cls)
        self.loss_mask = build_loss(loss_mask)
        self.loss_dice = build_loss(loss_dice)
        if loss_rank is not None:
            self.loss_rank = build_loss(loss_rank)
        else:
            self.loss_rank = loss_rank

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.mask_thr = mask_thr
        self.fp16_enabled = False
        self.dropout = dropout

        self.num_heads = num_heads
        self.hard_mask_thr = hard_mask_thr
        self.kernel_init = kernel_init
        self.with_ffn = with_ffn
        self.mask_out_stride = mask_out_stride
        self.relative_coors = relative_coors
        self.relative_coors_off = relative_coors_off
        self.conv_kernel_size = conv_kernel_size
        self.feat_gather_stride = feat_gather_stride
        self.mask_transform_stride = mask_transform_stride
        self.mask_upsample_stride = mask_upsample_stride

        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg

        # self-attention runs over the flattened K*K*C kernel of each object
        self.attention = MultiheadAttention(in_channels * conv_kernel_size**2,
                                            num_heads, dropout)
        self.attention_norm = build_norm_layer(
            dict(type='LN'), in_channels * conv_kernel_size**2)[1]

        self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)

        if feat_transform_cfg is not None:
            # Work on a copy: popping from the caller's dict would silently
            # change the config seen by any other head built from it.
            feat_transform_cfg = feat_transform_cfg.copy()
            kernel_size = feat_transform_cfg.pop('kernel_size', 1)
            self.feat_transform = ConvModule(
                in_channels,
                in_channels,
                kernel_size,
                stride=feat_gather_stride,
                padding=int(feat_gather_stride // 2),
                **feat_transform_cfg)
        else:
            self.feat_transform = None

        if self.with_ffn:
            self.ffn = FFN(
                in_channels,
                feedforward_channels,
                num_ffn_fcs,
                act_cfg=ffn_act_cfg,
                ffn_drop=dropout)
            self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

        self.cls_fcs = nn.ModuleList()
        for _ in range(num_cls_fcs):
            self.cls_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            self.cls_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.cls_fcs.append(build_activation_layer(act_cfg))

        # softmax-style losses need one extra output for the background
        if self.loss_cls.use_sigmoid:
            self.fc_cls = nn.Linear(in_channels, self.num_classes)
        else:
            self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)

        self.mask_fcs = nn.ModuleList()
        for _ in range(num_mask_fcs):
            self.mask_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            self.mask_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.mask_fcs.append(build_activation_layer(act_cfg))

        self.fc_mask = nn.Linear(in_channels, out_channels)

    def init_weights(self):
        """Use xavier initialization for all weight parameter and set
        classification head bias as a specific value when use focal loss."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
            else:
                # adopt the default initialization for
                # the weight and bias of the layer norm
                pass
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            nn.init.constant_(self.fc_cls.bias, bias_init)
        if self.kernel_init:
            logger = get_root_logger()
            logger.info(
                'mask kernel in mask head is normal initialized by std 0.01')
            nn.init.normal_(self.fc_mask.weight, mean=0, std=0.01)

    def forward(self,
                x,
                proposal_feat,
                mask_preds,
                prev_cls_score=None,
                mask_shape=None,
                img_metas=None):
        """Update the kernels and predict new class scores and masks.

        Args:
            x (Tensor): Feature map of shape [B, C, H, W].
            proposal_feat (Tensor): Object kernels; the first two dims are
                (B, N) and the rest is reshaped to (C, K*K).
            mask_preds (Tensor): Current mask logits of shape [B, N, h, w];
                resized to the feature resolution when it differs.
            prev_cls_score: Unused.
            mask_shape (tuple | None): If given and its height differs from
                the feature height, the new masks are resized to it.
            img_metas: Unused.

        Returns:
            tuple: ``(cls_score, new_mask_preds, obj_feat)`` where
            ``cls_score`` is [B, N, num_outputs], ``new_mask_preds`` is
            [B, N, H', W'] and ``obj_feat`` is [B, N, C, K, K].
        """

        N, num_proposals = proposal_feat.shape[:2]
        if self.feat_transform is not None:
            x = self.feat_transform(x)
        C, H, W = x.shape[-3:]

        mask_h, mask_w = mask_preds.shape[-2:]
        if mask_h != H or mask_w != W:
            gather_mask = F.interpolate(
                mask_preds, (H, W), align_corners=False, mode='bilinear')
        else:
            gather_mask = mask_preds

        # binarise the masks: features are summed over the pixels whose
        # sigmoid exceeds ``hard_mask_thr``
        sigmoid_masks = gather_mask.sigmoid()
        nonzero_inds = sigmoid_masks > self.hard_mask_thr
        sigmoid_masks = nonzero_inds.float()

        # einsum is faster than bmm by 30%
        x_feat = torch.einsum('bnhw,bchw->bnc', sigmoid_masks, x)

        # obj_feat in shape [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C]
        proposal_feat = proposal_feat.reshape(N, num_proposals,
                                              self.in_channels,
                                              -1).permute(0, 1, 3, 2)
        obj_feat = self.kernel_update_conv(x_feat, proposal_feat)

        # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C]
        obj_feat = obj_feat.reshape(N, num_proposals, -1).permute(1, 0, 2)
        obj_feat = self.attention_norm(self.attention(obj_feat))
        # [N, B, K*K*C] -> [B, N, K*K*C]
        obj_feat = obj_feat.permute(1, 0, 2)

        # obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
        obj_feat = obj_feat.reshape(N, num_proposals, -1, self.in_channels)

        # FFN
        if self.with_ffn:
            obj_feat = self.ffn_norm(self.ffn(obj_feat))

        # classification uses the kernel summed over its K*K positions
        cls_feat = obj_feat.sum(-2)
        mask_feat = obj_feat

        for cls_layer in self.cls_fcs:
            cls_feat = cls_layer(cls_feat)
        for reg_layer in self.mask_fcs:
            mask_feat = reg_layer(mask_feat)

        cls_score = self.fc_cls(cls_feat).view(N, num_proposals, -1)
        # [B, N, K*K, C] -> [B, N, C, K*K]
        mask_feat = self.fc_mask(mask_feat).permute(0, 1, 3, 2)

        if (self.mask_transform_stride == 2 and self.feat_gather_stride == 1):
            mask_x = F.interpolate(
                x, scale_factor=0.5, mode='bilinear', align_corners=False)
            H, W = mask_x.shape[-2:]
            raise NotImplementedError
        else:
            mask_x = x
        # group conv is 5x faster than unfold and uses about 1/5 memory
        # Group conv vs. unfold vs. concat batch, 2.9ms :13.5ms :3.8ms
        # Group conv vs. unfold vs. concat batch, 278 : 1420 : 369
        # fold_x = F.unfold(
        #     mask_x,
        #     self.conv_kernel_size,
        #     padding=int(self.conv_kernel_size // 2))
        # mask_feat = mask_feat.reshape(N, num_proposals, -1)
        # new_mask_preds = torch.einsum('bnc,bcl->bnl', mask_feat, fold_x)
        # [B, N, C, K*K] -> [B*N, C, K, K]
        mask_feat = mask_feat.reshape(N, num_proposals, C,
                                      self.conv_kernel_size,
                                      self.conv_kernel_size)
        # [B, C, H, W] -> [1, B*C, H, W]
        # each image is convolved with its own set of kernels
        new_mask_preds = []
        for i in range(N):
            new_mask_preds.append(
                F.conv2d(
                    mask_x[i:i + 1],
                    mask_feat[i],
                    padding=int(self.conv_kernel_size // 2)))

        new_mask_preds = torch.cat(new_mask_preds, dim=0)
        new_mask_preds = new_mask_preds.reshape(N, num_proposals, H, W)
        if self.mask_transform_stride == 2:
            new_mask_preds = F.interpolate(
                new_mask_preds,
                scale_factor=2,
                mode='bilinear',
                align_corners=False)

        # only the height is compared to decide whether to resize
        if mask_shape is not None and mask_shape[0] != H:
            new_mask_preds = F.interpolate(
                new_mask_preds,
                mask_shape,
                align_corners=False,
                mode='bilinear')

        return cls_score, new_mask_preds, obj_feat.permute(0, 1, 3, 2).reshape(
            N, num_proposals, self.in_channels, self.conv_kernel_size,
            self.conv_kernel_size)

    @force_fp32(apply_to=('cls_score', 'mask_pred'))
    def loss(self,
             object_feats,
             cls_score,
             mask_pred,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             imgs_whwh=None,
             reduction_override=None,
             **kwargs):
        """Compute classification, mask, dice and optional rank losses.

        Args:
            object_feats: Unused.
            cls_score (Tensor | None): Class scores of shape [B, N, ...].
            mask_pred (Tensor): Mask logits of shape [B, N, H, W].
            labels (Tensor): Flattened labels with B*N entries; values in
                ``[0, num_classes)`` mark positives.
            label_weights (Tensor): Classification weights.
            mask_targets (Tensor): Flattened mask targets, one per entry.
            mask_weights: Unused.
            imgs_whwh: Unused.
            reduction_override: Forwarded to the classification loss.

        Returns:
            dict: May contain ``loss_cls``, ``pos_acc``, ``loss_mask``,
            ``loss_dice`` and ``loss_rank``.
        """

        losses = dict()
        bg_class_ind = self.num_classes
        # note in spare rcnn num_gt == num_pos
        pos_inds = (labels >= 0) & (labels < bg_class_ind)
        num_pos = pos_inds.sum().float()
        avg_factor = reduce_mean(num_pos).clamp_(min=1.0)

        num_preds = mask_pred.shape[0] * mask_pred.shape[1]

        if cls_score is not None:
            # shape checks live inside the guard so that ``cls_score=None``
            # reaches the mask losses instead of failing on ``.shape``
            assert mask_pred.shape[0] == cls_score.shape[0]
            assert mask_pred.shape[1] == cls_score.shape[1]
            if cls_score.numel() > 0:
                losses['loss_cls'] = self.loss_cls(
                    cls_score.view(num_preds, -1),
                    labels,
                    label_weights,
                    avg_factor=avg_factor,
                    reduction_override=reduction_override)
                losses['pos_acc'] = accuracy(
                    cls_score.view(num_preds, -1)[pos_inds], labels[pos_inds])
        if mask_pred is not None:
            bool_pos_inds = pos_inds.type(torch.bool)
            # 0~self.num_classes-1 are FG, self.num_classes is BG
            # do not perform bounding box regression for BG anymore.
            H, W = mask_pred.shape[-2:]
            if pos_inds.any():
                pos_mask_pred = mask_pred.reshape(num_preds, H,
                                                  W)[bool_pos_inds]
                pos_mask_targets = mask_targets[bool_pos_inds]
                losses['loss_mask'] = self.loss_mask(pos_mask_pred,
                                                     pos_mask_targets)
                losses['loss_dice'] = self.loss_dice(pos_mask_pred,
                                                     pos_mask_targets)

                if self.loss_rank is not None:
                    batch_size = mask_pred.size(0)
                    # per-pixel target: index of the positive kernel whose
                    # ground-truth mask covers the pixel, else ignore_label
                    rank_target = mask_targets.new_full((batch_size, H, W),
                                                        self.ignore_label,
                                                        dtype=torch.long)
                    rank_inds = pos_inds.view(batch_size,
                                              -1).nonzero(as_tuple=False)
                    batch_mask_targets = mask_targets.view(
                        batch_size, -1, H, W).bool()
                    for i in range(batch_size):
                        curr_inds = (rank_inds[:, 0] == i)
                        curr_rank = rank_inds[:, 1][curr_inds]
                        for j in curr_rank:
                            rank_target[i][batch_mask_targets[i][j]] = j
                    losses['loss_rank'] = self.loss_rank(
                        mask_pred, rank_target, ignore_index=self.ignore_label)
            else:
                # zero-valued losses that still depend on the predictions
                losses['loss_mask'] = mask_pred.sum() * 0
                losses['loss_dice'] = mask_pred.sum() * 0
                if self.loss_rank is not None:
                    losses['loss_rank'] = mask_pred.sum() * 0

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):
        """Build the training targets of a single image.

        Args:
            pos_inds (Tensor): Indices of positive samples.
            neg_inds (Tensor): Indices of negative samples.
            pos_mask (Tensor): Masks of positive samples; also provides
                device, dtype and spatial size of the created targets.
            neg_mask (Tensor): Masks of negative samples (only counted).
            pos_gt_mask (Tensor): Ground-truth mask of each positive.
            pos_gt_labels (Tensor): Ground-truth label of each positive.
            gt_sem_seg (Tensor | None): Stuff masks of the image.
            gt_sem_cls (Tensor | None): Class of each stuff mask.
            cfg: Train config providing ``pos_weight``.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights)``.
            When semantic ground truth is given, ``num_stuff_classes``
            extra entries are appended to each of them.
        """

        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg
        H, W = pos_mask.shape[-2:]
        # original implementation uses new_zeros since BG are set to be 0
        # now use empty & fill because BG cat_id = num_classes,
        # FG cat_id = [0, num_classes-1]
        labels = pos_mask.new_full((num_samples, ),
                                   self.num_classes,
                                   dtype=torch.long)
        label_weights = pos_mask.new_zeros((num_samples, self.num_classes))
        mask_targets = pos_mask.new_zeros(num_samples, H, W)
        mask_weights = pos_mask.new_zeros(num_samples, H, W)
        if num_pos > 0:
            labels[pos_inds] = pos_gt_labels
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            pos_mask_targets = pos_gt_mask
            mask_targets[pos_inds, ...] = pos_mask_targets
            mask_weights[pos_inds, ...] = 1

        if num_neg > 0:
            label_weights[neg_inds] = 1.0

        if gt_sem_cls is not None and gt_sem_seg is not None:
            # one slot per stuff class; slots of absent classes keep the
            # background label and all-zero targets
            sem_labels = pos_mask.new_full((self.num_stuff_classes, ),
                                           self.num_classes,
                                           dtype=torch.long)
            sem_targets = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            sem_weights = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            # stuff slot i is only supervised on its own stuff class column
            sem_stuff_weights = torch.eye(
                self.num_stuff_classes, device=pos_mask.device)
            sem_thing_weights = pos_mask.new_zeros(
                (self.num_stuff_classes, self.num_thing_classes))
            sem_label_weights = torch.cat(
                [sem_thing_weights, sem_stuff_weights], dim=-1)
            if len(gt_sem_cls) > 0:
                sem_inds = gt_sem_cls - self.num_thing_classes
                sem_inds = sem_inds.long()
                sem_labels[sem_inds] = gt_sem_cls.long()
                sem_targets[sem_inds] = gt_sem_seg
                sem_weights[sem_inds] = 1

            # instance samples are not supervised on the stuff columns
            label_weights[:, self.num_thing_classes:] = 0
            labels = torch.cat([labels, sem_labels])
            label_weights = torch.cat([label_weights, sem_label_weights])
            mask_targets = torch.cat([mask_targets, sem_targets])
            mask_weights = torch.cat([mask_weights, sem_weights])

        return labels, label_weights, mask_targets, mask_weights

    def get_targets(self,
                    sampling_results,
                    rcnn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None):
        """Build the training targets of a batch.

        Args:
            sampling_results (list): One sampling result per image,
                providing ``pos_inds``, ``neg_inds``, ``pos_masks``,
                ``neg_masks``, ``pos_gt_masks`` and ``pos_gt_labels``.
            rcnn_train_cfg: Train config passed to ``_get_target_single``.
            concat (bool): Whether to concatenate the per-image targets
                along dim 0.
            gt_sem_seg (list | None): Per-image stuff masks. When None,
                ``gt_sem_cls`` is ignored as well.
            gt_sem_cls (list | None): Per-image stuff classes.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights)``,
            each a tensor when ``concat`` is True and a per-image sequence
            otherwise.
        """
        num_imgs = len(sampling_results)
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
        if gt_sem_seg is None:
            gt_sem_seg = [None] * num_imgs
            gt_sem_cls = [None] * num_imgs

        labels, label_weights, mask_targets, mask_weights = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_mask_list,
            pos_gt_labels_list,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rcnn_train_cfg)
        if concat:
            labels = torch.cat(labels, 0)
            label_weights = torch.cat(label_weights, 0)
            mask_targets = torch.cat(mask_targets, 0)
            mask_weights = torch.cat(mask_weights, 0)
        return labels, label_weights, mask_targets, mask_weights

    def rescale_masks(self, masks_per_img, img_meta):
        """Map mask logits of one image to probabilities at original size.

        The sigmoid of the masks is resized to ``batch_input_shape``,
        cropped to ``img_shape`` and finally resized to ``ori_shape``.

        Args:
            masks_per_img (Tensor): Mask logits of shape [N, h, w].
            img_meta (dict): Must provide ``img_shape``,
                ``batch_input_shape`` and ``ori_shape``.

        Returns:
            Tensor: Mask probabilities of shape [N, ori_h, ori_w].
        """
        h, w, _ = img_meta['img_shape']
        masks_per_img = F.interpolate(
            masks_per_img.unsqueeze(0).sigmoid(),
            size=img_meta['batch_input_shape'],
            mode='bilinear',
            align_corners=False)

        # drop the batch padding before going back to the original size
        masks_per_img = masks_per_img[:, :, :h, :w]
        ori_shape = img_meta['ori_shape']
        seg_masks = F.interpolate(
            masks_per_img,
            size=ori_shape[:2],
            mode='bilinear',
            align_corners=False).squeeze(0)
        return seg_masks

    def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
                      test_cfg, img_meta):
        """Rescale, binarise with ``test_cfg.mask_thr`` and format masks.

        Returns:
            tuple: ``(bbox_result, segm_result)`` as built by
            ``segm2result``.
        """
        # resize mask predictions back
        seg_masks = self.rescale_masks(masks_per_img, img_meta)
        seg_masks = seg_masks > test_cfg.mask_thr
        bbox_result, segm_result = self.segm2result(seg_masks, labels_per_img,
                                                    scores_per_img)
        return bbox_result, segm_result

    def segm2result(self, mask_preds, det_labels, cls_scores):
        """Group masks and scores by class.

        Args:
            mask_preds (Tensor): Binary masks, one per instance.
            det_labels (Tensor): Class index of each instance.
            cls_scores (Tensor): Score of each instance.

        Returns:
            tuple: ``(bbox_result, segm_result)``. ``bbox_result`` is a list
            with one ``(n_i, 5)`` float32 array per class whose box
            coordinates are zeros and whose last column is the score.
            ``segm_result`` is a list with one list of mask arrays per
            class.
        """
        num_classes = self.num_classes
        segm_result = [[] for _ in range(num_classes)]
        mask_preds = mask_preds.cpu().numpy()
        det_labels = det_labels.cpu().numpy()
        cls_scores = cls_scores.cpu().numpy()
        num_ins = mask_preds.shape[0]
        # fake bboxes
        bboxes = np.zeros((num_ins, 5), dtype=np.float32)
        bboxes[:, -1] = cls_scores
        bbox_result = [bboxes[det_labels == i, :] for i in range(num_classes)]
        for idx in range(num_ins):
            segm_result[det_labels[idx]].append(mask_preds[idx])
        return bbox_result, segm_result

    def get_seg_masks_tracking(self, masks_per_img, labels_per_img, scores_per_img, ids_per_img,
                      test_cfg, img_meta):
        """Rescale and binarise masks, then format them together with ids.

        Args:
            masks_per_img (Tensor): Mask logits of shape [N, h, w].
            labels_per_img (Tensor): Class index of each instance.
            scores_per_img (Tensor): Score of each instance.
            ids_per_img (Tensor): Track id of each instance.
            test_cfg: Test config providing ``mask_thr``.
            img_meta (dict): Meta information used by ``rescale_masks``.

        Returns:
            tuple: ``(bbox_results, mask_results)`` taken from the output of
            ``outs2results``.
        """
        num_ins = masks_per_img.shape[0]
        # resize mask predictions back
        seg_masks = self.rescale_masks(masks_per_img, img_meta)
        seg_masks = seg_masks > test_cfg.mask_thr
        # fake bboxes
        # NOTE(review): created on the default device while the other
        # inputs may live elsewhere -- confirm ``outs2results`` accepts that.
        bboxes = torch.zeros((num_ins, 5), dtype=torch.float32)
        bboxes[:, -1] = scores_per_img
        tracks = outs2results(
            bboxes=bboxes,
            labels=labels_per_img,
            masks=seg_masks,
            ids=ids_per_img,
            num_classes=self.num_classes,
        )
        return tracks['bbox_results'], tracks['mask_results']


================================================
FILE: knet_vis/det/knet.py
================================================
import torch
import torch.nn.functional as F

from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import TwoStageDetector
from mmdet.utils import get_root_logger
from .utils import sem2ins_masks


@DETECTORS.register_module()
class KNet(TwoStageDetector):
    """K-Net detector: a kernel-producing RPN head plus an iterative ROI head.

    Args:
        num_thing_classes (int): Number of thing classes; forwarded to
            ``sem2ins_masks`` when semantic ground truth is supplied.
        num_stuff_classes (int): Number of stuff classes (stored only).
        mask_assign_stride (int): Ground-truth masks are resized to the
            padded batch size divided by this stride.
        thing_label_in_seg (int): Stored only; not read by this class.
        *args, **kwargs: Forwarded to ``TwoStageDetector``.
    """

    def __init__(self,
                 *args,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 thing_label_in_seg=0,
                 **kwargs):
        super(KNet, self).__init__(*args, **kwargs)
        assert self.with_rpn, 'KNet does not support external proposals'
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        logger = get_root_logger()
        logger.info(f'Model: \n{self}')

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None,
                      gt_semantic_seg=None,
                      **kwargs):
        """Compute the training losses of the RPN head and the ROI head.

        Args:
            img (Tensor): Batched input images.
            img_metas (list[dict]): Per-image meta information; the first
                entry must provide ``batch_input_shape``.
            gt_bboxes: Forwarded to the ROI head only.
            gt_labels (list[Tensor]): Per-image instance labels.
            gt_bboxes_ignore: Forwarded to the ROI head only.
            gt_masks (list): Per-image instance masks exposing ``width``,
                ``height`` and ``to_tensor``. Required.
            proposals: Must be None; external proposals are not supported.
            gt_semantic_seg (list | None): Per-image semantic maps; when
                None, no semantic targets are passed on.

        Returns:
            dict: Losses of the ROI head updated with those of the RPN head.
        """

        # skip TwoStageDetector.forward_train and run the one of its parent
        super(TwoStageDetector, self).forward_train(img, img_metas)
        assert proposals is None, 'KNet does not support' \
                                  ' external proposals'
        assert gt_masks is not None

        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks_tensor = []
        gt_sem_seg = []
        gt_sem_cls = []
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_H = pad_H // self.mask_assign_stride
        assign_W = pad_W // self.mask_assign_stride

        for i, gt_mask in enumerate(gt_masks):
            mask_tensor = gt_mask.to_tensor(torch.float, gt_labels[0].device)
            if gt_mask.width != pad_W or gt_mask.height != pad_H:
                # pad on the right and bottom up to the batch input shape
                pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height)
                mask_tensor = F.pad(mask_tensor, pad_wh, value=0)

            if gt_semantic_seg is not None:
                # gt_semantic seg is padded by 255 and
                # zero indicating the first class
                sem_labels, sem_seg = sem2ins_masks(
                    gt_semantic_seg[i],
                    num_thing_classes=self.num_thing_classes)
                if sem_seg.shape[0] == 0:
                    # NOTE(review): the placeholder is sized by the number
                    # of instance masks, not by the (zero) number of
                    # semantic masks -- confirm this is intended.
                    gt_sem_seg.append(
                        mask_tensor.new_zeros(
                            (mask_tensor.size(0), assign_H, assign_W)))
                else:
                    gt_sem_seg.append(
                        F.interpolate(
                            sem_seg[None], (assign_H, assign_W),
                            mode='bilinear',
                            align_corners=False)[0])
                gt_sem_cls.append(sem_labels)

            else:
                gt_sem_seg = None
                gt_sem_cls = None

            if mask_tensor.shape[0] == 0:
                gt_masks_tensor.append(
                    mask_tensor.new_zeros(
                        (mask_tensor.size(0), assign_H, assign_W)))
            else:
                gt_masks_tensor.append(
                    F.interpolate(
                        mask_tensor[None], (assign_H, assign_W),
                        mode='bilinear',
                        align_corners=False)[0])

        gt_masks = gt_masks_tensor
        x = self.extract_feat(img)
        rpn_results = self.rpn_head.forward_train(x, img_metas, gt_masks,
                                                  gt_labels, gt_sem_seg,
                                                  gt_sem_cls)
        (rpn_losses, proposal_feats, x_feats, mask_preds,
         cls_scores) = rpn_results

        losses = self.roi_head.forward_train(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            gt_masks,
            gt_labels,
            gt_bboxes_ignore=gt_bboxes_ignore,
            gt_bboxes=gt_bboxes,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls,
            imgs_whwh=None)

        losses.update(rpn_losses)
        return losses

    def simple_test(self, img, img_metas, rescale=False):
        """Run inference without test-time augmentation.

        Args:
            img (Tensor): Batched input images.
            img_metas (list[dict]): Per-image meta information.
            rescale (bool): Forwarded to the ROI head.

        Returns:
            The result of ``self.roi_head.simple_test``.
        """
        x = self.extract_feat(img)
        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results
        segm_results = self.roi_head.simple_test(
            x_feats,
            proposal_feats,
            mask_preds,
            cls_scores,
            img_metas,
            imgs_whwh=None,
            rescale=rescale)
        return segm_results

    def forward_dummy(self, img):
        """Used for computing network flops.

        See `mmdetection/tools/get_flops.py`
        """
        # backbone
        x = self.extract_feat(img)
        # rpn
        num_imgs = len(img)
        dummy_img_metas = [
            dict(img_shape=(800, 1333, 3)) for _ in range(num_imgs)
        ]
        rpn_results = self.rpn_head.simple_test_rpn(x, dummy_img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results
        # roi_head
        # The head's ``forward_dummy`` takes
        # (x, proposal_boxes, proposal_feats, img_metas); there are no boxes
        # in K-Net, so None fills that slot to keep the arguments aligned.
        roi_outs = self.roi_head.forward_dummy(x_feats, None, proposal_feats,
                                               dummy_img_metas)
        return roi_outs


================================================
FILE: knet_vis/det/mask_hungarian_assigner.py
================================================
import numpy as np
import torch

from mmdet.core import AssignResult, BaseAssigner
from mmdet.core.bbox.builder import BBOX_ASSIGNERS
from mmdet.core.bbox.match_costs.builder import MATCH_COST, build_match_cost

try:
    from scipy.optimize import linear_sum_assignment
except ImportError:
    linear_sum_assignment = None


@MATCH_COST.register_module()
class DiceCost(object):
    """Pairwise Dice matching cost between predicted and target masks.

    Args:
        weight (int | float, optional): Factor the cost is multiplied by.
            Defaults to 1.
        pred_act (bool): Whether to activate the predictions before the
            cost is computed. Defaults to False.
        act_mode (str): ``'sigmoid'`` applies an element-wise sigmoid, any
            other value applies a softmax over dim 0 of the predictions.
            Only consulted when ``pred_act`` is True. Defaults to
            ``'sigmoid'``.
        eps (float): Constant added to each squared-sum term of the Dice
            denominator. Defaults to 1e-3.

    Examples:
        >>> import torch
        >>> self = DiceCost()
        >>> mask_preds = torch.rand(3, 8, 8)
        >>> gt_masks = torch.rand(2, 8, 8) > 0.5
        >>> self(mask_preds, gt_masks).shape
        torch.Size([3, 2])
    """

    def __init__(self,
                 weight=1.,
                 pred_act=False,
                 act_mode='sigmoid',
                 eps=1e-3):
        self.eps = eps
        self.act_mode = act_mode
        self.pred_act = pred_act
        self.weight = weight

    def dice_loss(cls, input, target, eps=1e-3):
        """Negated Dice coefficient for every (input, target) pair.

        The first parameter is named ``cls`` but, since the method is not
        decorated, it receives the instance when called as
        ``self.dice_loss(...)``.

        Args:
            input (Tensor): Predictions; everything after dim 0 is
                flattened.
            target (Tensor): Targets; flattened the same way and cast to
                float.
            eps (float): Added to both squared sums. Defaults to 1e-3.

        Returns:
            Tensor: ``-2 * <input, target> / (|input|^2 + |target|^2)`` for
            every pair, shape (input.shape[0], target.shape[0]).
        """
        flat_pred = input.reshape(input.shape[0], -1)
        flat_gt = target.reshape(target.shape[0], -1).float()
        # einsum yields all pairwise inner products without materialising
        # the broadcast product flat_pred[:, None] * flat_gt[None].
        overlap = torch.einsum('nh,mh->nm', flat_pred, flat_gt)
        pred_sq = (flat_pred * flat_pred).sum(1) + eps
        gt_sq = (flat_gt * flat_gt).sum(1) + eps
        dice = (2 * overlap) / (pred_sq[:, None] + gt_sq[None, ...])
        # The constant 1 of "1 - dice" does not change the matching, so
        # only the negated coefficient is returned.
        return -dice

    def __call__(self, mask_preds, gt_masks):
        """Compute the weighted Dice cost matrix.

        Args:
            mask_preds (Tensor): Predictions, indexed by query along dim 0.
            gt_masks (Tensor): Targets, indexed by ground truth along
                dim 0.

        Returns:
            torch.Tensor: Dice cost multiplied by ``self.weight``, shape
            (mask_preds.shape[0], gt_masks.shape[0]).
        """
        if self.pred_act:
            if self.act_mode == 'sigmoid':
                mask_preds = mask_preds.sigmoid()
            else:
                mask_preds = mask_preds.softmax(dim=0)
        return self.dice_loss(mask_preds, gt_masks, self.eps) * self.weight


@MATCH_COST.register_module()
class MaskCost(object):
    """Pairwise per-pixel agreement cost between predicted and target masks.

    Args:
        weight (int | float, optional): Factor the cost is multiplied by.
            Defaults to 1.
        pred_act (bool): Whether to activate the predictions before the
            cost is computed. Defaults to False.
        act_mode (str): ``'sigmoid'`` applies an element-wise sigmoid, any
            other value applies a softmax over dim 0. Only consulted when
            ``pred_act`` is True. Defaults to ``'sigmoid'``.
    """

    def __init__(self, weight=1., pred_act=False, act_mode='sigmoid'):
        self.act_mode = act_mode
        self.pred_act = pred_act
        self.weight = weight

    def __call__(self, cls_pred, target):
        """Compute the weighted mask cost matrix.

        Args:
            cls_pred (Tensor): Mask predictions of shape (num_query, H, W).
            target (Tensor): Target masks of shape (num_gt, H, W); must
                support ``1 - target`` and share a dtype with ``cls_pred``
                usable by ``torch.einsum``.

        Returns:
            torch.Tensor: Negated mean agreement (foreground overlap plus
            background overlap, divided by ``H * W``) multiplied by
            ``self.weight``, shape (num_query, num_gt).
        """
        if self.pred_act:
            if self.act_mode == 'sigmoid':
                cls_pred = cls_pred.sigmoid()
            else:
                cls_pred = cls_pred.softmax(dim=0)

        _, height, width = target.shape
        # Agreement on foreground pixels and on background pixels, computed
        # for all (query, gt) pairs at once.
        agree_fg = torch.einsum('nhw,mhw->nm', cls_pred, target)
        agree_bg = torch.einsum('nhw,mhw->nm', 1 - cls_pred, 1 - target)
        mean_agreement = -(agree_fg + agree_bg) / (height * width)
        return mean_agreement * self.weight


@BBOX_ASSIGNERS.register_module()
class MaskHungarianAssigner(BaseAssigner):
    """Computes one-to-one matching between predictions and ground truth.

    The total cost is the sum of up to four terms: a classification cost,
    a mask cost, a dice cost and an optional boundary cost. Each term whose
    ``weight`` is 0 is skipped. Predictions that end up unmatched are
    treated as background, so every prediction is assigned either

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        cls_cost (dict): Config passed to ``build_match_cost`` for the
            classification cost.
        mask_cost (dict): Config passed to ``build_match_cost`` for the
            mask cost.
        dice_cost (dict): Config passed to ``build_match_cost`` for the
            dice cost. NOTE(review): the default is an empty dict without
            a ``type`` key, so callers presumably always override it -
            confirm.
        boundary_cost (dict, optional): Config for an additional cost; not
            built when None. Defaults to None.
        topk (int): Number of matching rounds. With a value other than 1,
            the matching is repeated and the prediction rows matched in a
            round get a cost of 1e10 before the next round. Defaults to 1.
    """

    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 mask_cost=dict(type='SigmoidCost', weight=1.0),
                 dice_cost=dict(),
                 boundary_cost=None,
                 topk=1):
        self.cls_cost = build_match_cost(cls_cost)
        self.mask_cost = build_match_cost(mask_cost)
        self.dice_cost = build_match_cost(dice_cost)
        self.boundary_cost = (None if boundary_cost is None else
                              build_match_cost(boundary_cost))
        self.topk = topk

    def assign(self,
               bbox_pred,
               cls_pred,
               gt_bboxes,
               gt_labels,
               img_meta=None,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Computes one-to-one matching based on the weighted costs.

        Steps, in order:

        1. initialise every prediction with -1
        2. compute the weighted costs
        3. run Hungarian matching on CPU
        4. set every prediction to 0 (background), then give each matched
           prediction the 1-based index and the label of its gt.

        Args:
            bbox_pred (Tensor): Predictions, one per query along dim 0.
                Despite the name it is what the mask, dice and boundary
                costs receive - presumably mask predictions; confirm
                against the caller.
            cls_pred (Tensor | None): Classification predictions handed to
                the classification cost; the cost is skipped when None.
            gt_bboxes (Tensor): Targets, one per ground truth along dim 0,
                handed to the mask, dice and boundary costs.
            gt_labels (Tensor): Label of each ground truth.
            img_meta (dict, optional): Unused.
            gt_bboxes_ignore (Tensor, optional): Must be None.
            eps (int | float, optional): Unused.

        Returns:
            :obj:`AssignResult`: The assigned result.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts = gt_bboxes.size(0)
        num_preds = bbox_pred.size(0)

        # 1. everything starts as "don't care" (-1)
        gt_inds = bbox_pred.new_full((num_preds, ), -1, dtype=torch.long)
        labels = bbox_pred.new_full((num_preds, ), -1, dtype=torch.long)
        if num_gts == 0 or num_preds == 0:
            # Nothing to match. Without any gt every prediction becomes
            # background; without predictions the tensors are empty anyway.
            if num_gts == 0:
                gt_inds[:] = 0
            return AssignResult(num_gts, gt_inds, None, labels=labels)

        # 2. weighted cost terms; a disabled term contributes the scalar 0
        cls_term = 0
        if self.cls_cost.weight != 0 and cls_pred is not None:
            cls_term = self.cls_cost(cls_pred, gt_labels)
        mask_term = 0
        if self.mask_cost.weight != 0:
            mask_term = self.mask_cost(bbox_pred, gt_bboxes)
        dice_term = 0
        if self.dice_cost.weight != 0:
            dice_term = self.dice_cost(bbox_pred, gt_bboxes)
        boundary_term = 0
        if self.boundary_cost is not None and self.boundary_cost.weight != 0:
            boundary_term = self.boundary_cost(bbox_pred, gt_bboxes)
        cost = cls_term + mask_term + dice_term + boundary_term

        # 3. Hungarian matching runs on CPU through scipy
        cost = cost.detach().cpu()
        if linear_sum_assignment is None:
            raise ImportError(
                'Please run "pip install scipy" to install scipy first.')
        if self.topk == 1:
            row_inds, col_inds = linear_sum_assignment(cost)
        else:
            rounds = []
            for _ in range(self.topk):
                rows, cols = linear_sum_assignment(cost)
                rounds.append((rows, cols))
                # Make the rows just matched very expensive for the next
                # round.
                cost[rows] = 1e10
            row_inds = np.concatenate([rows for rows, _ in rounds])
            col_inds = np.concatenate([cols for _, cols in rounds])

        device = bbox_pred.device
        row_inds = torch.from_numpy(row_inds).to(device)
        col_inds = torch.from_numpy(col_inds).to(device)

        # 4. background everywhere, then overwrite the matched predictions
        gt_inds[:] = 0
        gt_inds[row_inds] = col_inds + 1
        labels[row_inds] = gt_labels[col_inds]
        return AssignResult(num_gts, gt_inds, None, labels=labels)


================================================
FILE: knet_vis/det/mask_pseudo_sampler.py
================================================
import torch

from mmdet.core.bbox import BaseSampler, SamplingResult
from mmdet.core.bbox.builder import BBOX_SAMPLERS


class MaskSamplingResult(SamplingResult):
    """Mask sampling result.

    Stores the positive / negative prediction indices together with the
    corresponding masks and, for the positives, the assigned ground-truth
    indices (0-based), masks and labels.

    Args:
        pos_inds (Tensor): Indices of positive predictions.
        neg_inds (Tensor): Indices of negative predictions.
        masks (Tensor): All predicted masks, indexed along dim 0.
        gt_masks (Tensor): Ground-truth masks, indexed along dim 0.
        assign_result: Object providing ``gt_inds`` (1-based gt index per
            prediction) and ``labels`` (may be None).
        gt_flags (Tensor): Per-prediction flag, indexed by ``pos_inds`` to
            build ``pos_is_gt``.
    """

    def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result,
                 gt_flags):
        self.pos_inds = pos_inds
        self.neg_inds = neg_inds
        self.pos_masks = masks[pos_inds]
        self.neg_masks = masks[neg_inds]
        self.pos_is_gt = gt_flags[pos_inds]

        self.num_gts = gt_masks.shape[0]
        # gt_inds are 1-based for positives; shift them to 0-based.
        self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1

        if gt_masks.numel() != 0:
            self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :]
        else:
            # Indexing an empty gt tensor would fail, so keep an empty
            # placeholder; there can be no positives in this case.
            assert self.pos_assigned_gt_inds.numel() == 0
            self.pos_gt_masks = torch.empty_like(gt_masks)

        assigned_labels = assign_result.labels
        self.pos_gt_labels = (None if assigned_labels is None else
                              assigned_labels[pos_inds])

    @property
    def masks(self):
        """torch.Tensor: positive masks followed by negative masks."""
        return torch.cat([self.pos_masks, self.neg_masks])

    def __nice__(self):
        """Render ``info`` as text, showing mask tensors by shape only."""
        summary = dict(self.info)
        for key in ('pos_masks', 'neg_masks'):
            summary[key] = summary.pop(key).shape
        entries = [f"'{k}': {v!r}" for k, v in sorted(summary.items())]
        return '{\n' + '    ' + ',\n    '.join(entries) + '\n}'

    @property
    def info(self):
        """Returns a dictionary of info about the object."""
        return dict(
            pos_inds=self.pos_inds,
            neg_inds=self.neg_inds,
            pos_masks=self.pos_masks,
            neg_masks=self.neg_masks,
            pos_is_gt=self.pos_is_gt,
            num_gts=self.num_gts,
            pos_assigned_gt_inds=self.pos_assigned_gt_inds,
        )


@BBOX_SAMPLERS.register_module()
class MaskPseudoSampler(BaseSampler):
    """A pseudo sampler that keeps every assigned prediction."""

    def __init__(self, **kwargs):
        # Accepts and ignores any config; no state is needed.
        pass

    def _sample_pos(self, **kwargs):
        """Sample positive samples (not used by this sampler)."""
        raise NotImplementedError

    def _sample_neg(self, **kwargs):
        """Sample negative samples (not used by this sampler)."""
        raise NotImplementedError

    def sample(self, assign_result, masks, gt_masks, **kwargs):
        """Directly returns the positive and negative indices of samples.

        Args:
            assign_result (:obj:`AssignResult`): Assigned results; entries
                of ``gt_inds`` greater than 0 are positives, entries equal
                to 0 are negatives.
            masks (torch.Tensor): Predicted masks.
            gt_masks (torch.Tensor): Ground truth masks.

        Returns:
            :obj:`MaskSamplingResult`: sampler results
        """
        assigned = assign_result.gt_inds
        pos_inds = torch.nonzero(
            assigned > 0, as_tuple=False).squeeze(-1).unique()
        neg_inds = torch.nonzero(
            assigned == 0, as_tuple=False).squeeze(-1).unique()
        # No prediction is itself a ground truth, so all flags are zero.
        gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8)
        return MaskSamplingResult(pos_inds, neg_inds, masks, gt_masks,
                                  assign_result, gt_flags)


================================================
FILE: knet_vis/det/semantic_fpn_wrapper.py
================================================
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, normal_init
from mmdet.models.builder import NECKS
from mmcv.cnn.bricks.transformer import build_positional_encoding
from mmdet.utils import get_root_logger


@NECKS.register_module()
class SemanticFPNWrapper(nn.Module):
    """Implementation of Semantic FPN used in Panoptic FPN.

    Every input level in ``[start_level, end_level]`` is processed by its
    own stack of 3x3 convs (with 2x bilinear upsampling or stride-2
    downsampling), and the per-level results are fused by summation or
    channel concatenation.

    Args:
        in_channels (int): Channels of every input feature map.
        feat_channels (int): Channels produced by the per-level convs.
        out_channels (int): Channels produced by the 1x1 prediction conv
            and by every auxiliary conv.
        start_level (int): First (inclusive) index of ``inputs`` to use.
        end_level (int): Last (inclusive) index of ``inputs`` to use.
        cat_coors (bool, optional): Whether to concatenate a 2-channel
            normalized coordinate map to the input of level
            ``cat_coors_level``. Defaults to False.
        positional_encoding (dict, optional): Config for
            ``build_positional_encoding``; its output is added to the
            input of level ``cat_coors_level``. Defaults to None.
        cat_coors_level (int, optional): Level receiving the positional
            encoding / coordinates. Defaults to 3.
        fuse_by_cat (bool, optional): Concatenate the levels along the
            channel dim instead of summing them. Defaults to False.
        return_list (bool, optional): Wrap a single output in a list.
            Defaults to False.
        upsample_times (int, optional): Controls the resampling of each
            level: in level ``i > 0`` conv ``j`` is followed by a 2x
            upsample while ``j < upsample_times - (end_level - i)``;
            level 0 gets one plain conv when
            ``upsample_times == end_level`` and otherwise
            ``end_level - upsample_times`` stride-2 convs. Defaults to 3.
        with_pred (bool, optional): Apply the 1x1 prediction conv to the
            fused feature. Defaults to True.
        num_aux_convs (int, optional): Number of extra 1x1 convs applied
            to the fused feature. Defaults to 0.
        act_cfg (dict, optional): Activation config of the per-level
            convs. Defaults to ``dict(type='ReLU', inplace=True)``.
        out_act_cfg (dict, optional): Activation config of the prediction
            and auxiliary convs. Defaults to ``dict(type='ReLU')``.
        conv_cfg (dict, optional): Conv config for all ``ConvModule``s.
            Defaults to None.
        norm_cfg (dict, optional): Norm config for all ``ConvModule``s.
            Defaults to None.
    """

    def __init__(self,
                 in_channels,
                 feat_channels,
                 out_channels,
                 start_level,
                 end_level,
                 cat_coors=False,
                 positional_encoding=None,
                 cat_coors_level=3,
                 fuse_by_cat=False,
                 return_list=False,
                 upsample_times=3,
                 with_pred=True,
                 num_aux_convs=0,
                 act_cfg=dict(type='ReLU', inplace=True),
                 out_act_cfg=dict(type='ReLU'),
                 conv_cfg=None,
                 norm_cfg=None):
        super(SemanticFPNWrapper, self).__init__()

        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.start_level = start_level
        self.end_level = end_level
        assert start_level >= 0 and end_level >= start_level
        self.out_channels = out_channels
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.cat_coors = cat_coors
        self.cat_coors_level = cat_coors_level
        self.fuse_by_cat = fuse_by_cat
        self.return_list = return_list
        self.upsample_times = upsample_times
        self.with_pred = with_pred
        if positional_encoding is not None:
            self.positional_encoding = build_positional_encoding(
                positional_encoding)
        else:
            self.positional_encoding = None

        self.convs_all_levels = nn.ModuleList()
        for i in range(self.start_level, self.end_level + 1):
            convs_per_level = nn.Sequential()
            # The first conv of the level that receives the coordinate map
            # sees two extra input channels.
            first_chn = self.in_channels
            if self.cat_coors and i == self.cat_coors_level:
                first_chn += 2

            if i == 0:
                if upsample_times == self.end_level - i:
                    convs_per_level.add_module('conv' + str(i),
                                               self._build_conv(first_chn))
                else:
                    # Downsample with stride-2 convs. Only the first conv
                    # reads the raw input; the following ones consume the
                    # feat_channels output of their predecessor.
                    for k in range(self.end_level - upsample_times):
                        chn = first_chn if k == 0 else self.feat_channels
                        convs_per_level.add_module(
                            'conv' + str(k), self._build_conv(chn, stride=2))
                self.convs_all_levels.append(convs_per_level)
                continue

            for j in range(i):
                chn = first_chn if j == 0 else self.feat_channels
                convs_per_level.add_module('conv' + str(j),
                                           self._build_conv(chn))
                if j < upsample_times - (self.end_level - i):
                    one_upsample = nn.Upsample(
                        scale_factor=2, mode='bilinear', align_corners=False)
                    convs_per_level.add_module('upsample' + str(j),
                                               one_upsample)

            self.convs_all_levels.append(convs_per_level)

        if fuse_by_cat:
            in_channels = self.feat_channels * len(self.convs_all_levels)
        else:
            in_channels = self.feat_channels

        if self.with_pred:
            self.conv_pred = ConvModule(
                in_channels,
                self.out_channels,
                1,
                padding=0,
                conv_cfg=self.conv_cfg,
                act_cfg=out_act_cfg,
                norm_cfg=self.norm_cfg)

        self.num_aux_convs = num_aux_convs
        self.aux_convs = nn.ModuleList()
        for _ in range(num_aux_convs):
            self.aux_convs.append(
                ConvModule(
                    in_channels,
                    self.out_channels,
                    1,
                    padding=0,
                    conv_cfg=self.conv_cfg,
                    act_cfg=out_act_cfg,
                    norm_cfg=self.norm_cfg))

    def _build_conv(self, in_chn, stride=1):
        """Build one 3x3 per-level conv producing ``feat_channels``."""
        return ConvModule(
            in_chn,
            self.feat_channels,
            3,
            stride=stride,
            padding=1,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg,
            inplace=False)

    def init_weights(self):
        """Initialise every ``nn.Conv2d`` with a normal(std=0.01) init."""
        logger = get_root_logger()
        logger.info('Use normal intialization for semantic FPN')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                normal_init(m, std=0.01)

    def generate_coord(self, input_feat):
        """Build a normalized coordinate map matching ``input_feat``.

        Args:
            input_feat (Tensor): Feature map whose dim 0 is the batch and
                whose last two dims are height and width.

        Returns:
            Tensor: Shape (batch, 2, H, W); channel 0 holds x in [-1, 1]
            varying along the width, channel 1 holds y in [-1, 1] varying
            along the height.
        """
        batch = input_feat.shape[0]
        height, width = input_feat.shape[-2], input_feat.shape[-1]
        x_range = torch.linspace(-1, 1, width, device=input_feat.device)
        y_range = torch.linspace(-1, 1, height, device=input_feat.device)
        # Broadcasting views replace torch.meshgrid (row-major, "ij"
        # layout) so no version-dependent ``indexing`` argument is needed.
        y = y_range.view(-1, 1).expand(batch, 1, height, width)
        x = x_range.view(1, -1).expand(batch, 1, height, width)
        return torch.cat([x, y], 1)

    def forward(self, inputs):
        """Fuse the selected pyramid levels.

        Args:
            inputs (Sequence[Tensor]): Multi-level features; indices
                ``start_level`` .. ``end_level`` are used.

        Returns:
            Tensor | list[Tensor]: ``[out, aux_0, ...]`` when
            ``num_aux_convs > 0``; otherwise ``[out]`` when
            ``return_list`` is set, else ``out``. ``out`` is the
            prediction conv output, or the fused feature itself when
            ``with_pred`` is False.
        """
        mlvl_feats = []
        for i in range(self.start_level, self.end_level + 1):
            input_p = inputs[i]
            if i == self.cat_coors_level:
                if self.positional_encoding is not None:
                    ignore_mask = input_p.new_zeros(
                        (input_p.shape[0], input_p.shape[-2],
                         input_p.shape[-1]),
                        dtype=torch.bool)
                    positional_encoding = self.positional_encoding(ignore_mask)
                    input_p = input_p + positional_encoding
                if self.cat_coors:
                    coord_feat = self.generate_coord(input_p)
                    input_p = torch.cat([input_p, coord_feat], 1)

            # convs_all_levels is built starting at start_level, so the
            # module for level i lives at offset i - start_level.
            level_convs = self.convs_all_levels[i - self.start_level]
            mlvl_feats.append(level_convs(input_p))

        if self.fuse_by_cat:
            feature_add_all_level = torch.cat(mlvl_feats, dim=1)
        else:
            feature_add_all_level = sum(mlvl_feats)

        if self.with_pred:
            out = self.conv_pred(feature_add_all_level)
        else:
            out = feature_add_all_level

        if self.num_aux_convs > 0:
            outs = [out]
            for conv in self.aux_convs:
                outs.append(conv(feature_add_all_level))
            return outs

        if self.return_list:
            return [out]
        else:
            return out


================================================
FILE: knet_vis/det/utils.py
================================================
import torch


def sem2ins_masks(gt_sem_seg,
                  num_thing_classes=80,
                  ignore_label=255):
    """Convert semantic segmentation mask to binary masks

    Args:
        gt_sem_seg (torch.Tensor): Semantic masks to be converted.
            [0, num_thing_classes-1] is the classes of things,
            [num_thing_classes:] is the classes of stuff.
        num_thing_classes (int, optional): Number of thing classes.
            Defaults to 80.
        ignore_label (int, optional): Class id that is skipped even though
            it is not a thing class. Defaults to 255.

    Returns:
        tuple[torch.Tensor]: (mask_labels, bin_masks).
            Mask labels (long) and binary masks (float) of stuff classes.
            The masks are the per-class comparisons ``gt_sem_seg == label``
            concatenated along dim 0 in ascending label order; when no
            stuff class is present they have shape (0, H, W).
    """
    # gt_sem_seg is zero-started, where zero indicates the first class
    # since mmdet>=2.17.0, see more discussion in
    # https://mmdetection.readthedocs.io/en/latest/conventions.html#coco-panoptic-dataset  # noqa
    classes = torch.unique(gt_sem_seg)
    # Keep only stuff classes: drop the ignore label and every id in
    # [0, num_thing_classes - 1]. Filtering on the tensor avoids one
    # Python-level tensor comparison per class.
    is_stuff = (classes != ignore_label) & (classes >= num_thing_classes)
    labels = classes[is_stuff]

    if labels.numel() > 0:
        masks = torch.cat([gt_sem_seg == label for label in labels])
    else:
        masks = gt_sem_seg.new_zeros(
            size=[0, gt_sem_seg.shape[-2], gt_sem_seg.shape[-1]])
    return labels.long(), masks.float()


================================================
FILE: knet_vis/kernel_updator.py
================================================
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_activation_layer, build_norm_layer
from mmcv.cnn.bricks.transformer import TRANSFORMER_LAYER


@TRANSFORMER_LAYER.register_module()
class KernelUpdator(nn.Module):
    """Gated fusion of an update feature with a set of input features.

    Both inputs are projected into an "in" half and an "out" half. The
    product of the two "in" halves drives two gates, which blend the
    normalized "out" halves; the blend goes through a linear layer, a norm
    and the activation.

    NOTE(review): ``forward`` reshapes ``input_feature`` with
    ``feat_channels`` as last dim and then applies layers whose input size
    is ``in_channels``, so the module appears to require
    ``in_channels == feat_channels`` - confirm against the configs.

    Args:
        in_channels (int): Input size of the linear projections.
            Defaults to 256.
        feat_channels (int): Size of each projected half. Defaults to 64.
        out_channels (int, optional): Output size; falls back to
            ``in_channels`` when falsy. Defaults to None.
        input_feat_shape (int | Sequence[int]): Stored on the module (an
            int is expanded to a pair); not used in ``forward``.
            Defaults to 3.
        gate_sigmoid (bool): Apply a sigmoid to both gates.
            Defaults to True.
        gate_norm_act (bool): Normalize and activate the gate features
            before the gate projections. Defaults to False.
        activate_out (bool): Activate both normalized "out" halves.
            Defaults to False.
        act_cfg (dict): Config for ``build_activation_layer``.
        norm_cfg (dict): Config for ``build_norm_layer``.
    """

    def __init__(self,
                 in_channels=256,
                 feat_channels=64,
                 out_channels=None,
                 input_feat_shape=3,
                 gate_sigmoid=True,
                 gate_norm_act=False,
                 activate_out=False,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN')):
        super(KernelUpdator, self).__init__()

        def make_norm(num_features):
            # build_norm_layer returns (name, layer); only the layer is kept
            return build_norm_layer(norm_cfg, num_features)[1]

        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.out_channels_raw = out_channels
        self.gate_sigmoid = gate_sigmoid
        self.gate_norm_act = gate_norm_act
        self.activate_out = activate_out
        if isinstance(input_feat_shape, int):
            input_feat_shape = [input_feat_shape] * 2
        self.input_feat_shape = input_feat_shape
        self.act_cfg = act_cfg
        self.norm_cfg = norm_cfg
        self.out_channels = out_channels if out_channels else in_channels

        self.num_params_in = self.feat_channels
        self.num_params_out = self.feat_channels
        proj_channels = self.num_params_in + self.num_params_out
        self.dynamic_layer = nn.Linear(self.in_channels, proj_channels)
        self.input_layer = nn.Linear(
            self.in_channels, proj_channels, bias=True)
        self.input_gate = nn.Linear(
            self.in_channels, self.feat_channels, bias=True)
        self.update_gate = nn.Linear(
            self.in_channels, self.feat_channels, bias=True)
        if self.gate_norm_act:
            self.gate_norm = make_norm(self.feat_channels)

        self.norm_in = make_norm(self.feat_channels)
        self.norm_out = make_norm(self.feat_channels)
        self.input_norm_in = make_norm(self.feat_channels)
        self.input_norm_out = make_norm(self.feat_channels)

        self.activation = build_activation_layer(act_cfg)

        self.fc_layer = nn.Linear(
            self.feat_channels, self.out_channels, bias=True)
        self.fc_norm = make_norm(self.out_channels)

    def forward(self, update_feature, input_feature):
        """Fuse ``update_feature`` into ``input_feature``.

        Args:
            update_feature (Tensor): Reshaped to (-1, in_channels); the
                resulting number of rows is the number of proposals.
            input_feature (Tensor): Reshaped to
                (num_proposals, -1, feat_channels).

        Returns:
            Tensor: Activated features of shape
            (num_proposals, K, out_channels), K being the middle dim of
            the reshaped ``input_feature``.
        """
        flat_update = update_feature.reshape(-1, self.in_channels)
        num_proposals = flat_update.size(0)

        # Split the projection of the update feature into its two halves.
        dyn = self.dynamic_layer(flat_update)
        dyn_in = dyn[:, :self.num_params_in].view(-1, self.feat_channels)
        dyn_out = dyn[:, -self.num_params_out:].view(-1, self.feat_channels)

        # Same split for the projected input features.
        proj = self.input_layer(
            input_feature.reshape(num_proposals, -1, self.feat_channels))
        proj_in = proj[..., :self.num_params_in]
        proj_out = proj[..., -self.num_params_out:]

        # dyn_in gets a singleton dim at -2 so it broadcasts over the
        # middle dim of proj_in.
        gate_feats = proj_in * dyn_in.unsqueeze(-2)
        if self.gate_norm_act:
            gate_feats = self.activation(self.gate_norm(gate_feats))

        gate_for_input = self.input_norm_in(self.input_gate(gate_feats))
        gate_for_update = self.norm_in(self.update_gate(gate_feats))
        if self.gate_sigmoid:
            gate_for_input = gate_for_input.sigmoid()
            gate_for_update = gate_for_update.sigmoid()

        dyn_out = self.norm_out(dyn_out)
        proj_out = self.input_norm_out(proj_out)
        if self.activate_out:
            dyn_out = self.activation(dyn_out)
            proj_out = self.activation(proj_out)

        fused = gate_for_update * dyn_out.unsqueeze(
            -2) + gate_for_input * proj_out

        return self.activation(self.fc_norm(self.fc_layer(fused)))


================================================
FILE: knet_vis/tracker/__init__.py
================================================


================================================
FILE: knet_vis/tracker/kernel_frame_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (ConvModule, bias_init_with_prob, normal_init)
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
from mmdet.models.builder import HEADS, build_loss, build_neck
from mmdet.models.losses import accuracy
from mmdet.utils import get_root_logger


@HEADS.register_module()
class ConvKernelHeadVolume(nn.Module):
    def __init__(self,
                 num_proposals=100,
                 in_channels=256,
                 out_channels=256,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_seg_convs=1,
                 num_loc_convs=1,
                 att_dropout=False,
                 localization_fpn=None,
                 conv_kernel_size=1,
                 norm_cfg=dict(type='GN', num_groups=32),
                 semantic_fpn=True,
                 train_cfg=None,
                 num_classes=80,
                 xavier_init_kernel=False,
                 kernel_init_std=0.01,
                 use_binary=False,
                 proposal_feats_with_obj=False,
                 loss_mask=None,
                 loss_seg=None,
                 loss_cls=None,
                 loss_dice=None,
                 loss_rank=None,
                 feat_downsample_stride=1,
                 feat_refine_stride=1,
                 feat_refine=True,
                 with_embed=False,
                 feat_embed_only=False,
                 conv_normal_init=False,
                 mask_out_stride=4,
                 hard_target=False,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cat_stuff_mask=False,
                 **kwargs):
        """Store the configuration, build sub-modules and create the layers.

        Every keyword is stored on the instance under the same name, except
        that ``localization_fpn`` is passed through ``build_neck`` and each
        ``loss_*`` config is passed through ``build_loss`` when it is not
        ``None``. When ``train_cfg`` is truthy an assigner and a sampler are
        built as well. Extra ``**kwargs`` are accepted and ignored.
        """
        super().__init__()

        # Sizes and counts.
        self.num_proposals = num_proposals
        self.num_cls_fcs = num_cls_fcs
        self.train_cfg = train_cfg
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_classes = num_classes
        self.proposal_feats_with_obj = proposal_feats_with_obj
        # Sampling is hard-wired off, so the pseudo sampler below is used.
        self.sampling = False

        # Feature extractor (registered before the losses, as a sub-module).
        self.localization_fpn = build_neck(localization_fpn)

        # Remaining plain configuration values.
        self.semantic_fpn = semantic_fpn
        self.norm_cfg = norm_cfg
        self.num_heads = num_heads
        self.att_dropout = att_dropout
        self.mask_out_stride = mask_out_stride
        self.hard_target = hard_target
        self.conv_kernel_size = conv_kernel_size
        self.xavier_init_kernel = xavier_init_kernel
        self.kernel_init_std = kernel_init_std
        self.feat_downsample_stride = feat_downsample_stride
        self.feat_refine_stride = feat_refine_stride
        self.conv_normal_init = conv_normal_init
        self.feat_refine = feat_refine
        self.with_embed = with_embed
        self.feat_embed_only = feat_embed_only
        self.num_loc_convs = num_loc_convs
        self.num_seg_convs = num_seg_convs
        self.use_binary = use_binary
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg
        self.cat_stuff_mask = cat_stuff_mask

        # Build each configured loss; an unset loss stays ``None``.
        loss_cfgs = (
            ('loss_mask', loss_mask),
            ('loss_dice', loss_dice),
            ('loss_seg', loss_seg),
            ('loss_cls', loss_cls),
            ('loss_rank', loss_rank),
        )
        for attr_name, loss_cfg in loss_cfgs:
            built = build_loss(loss_cfg) if loss_cfg is not None else loss_cfg
            setattr(self, attr_name, built)

        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # use PseudoSampler when sampling is False
            use_cfg_sampler = self.sampling and hasattr(
                self.train_cfg, 'sampler')
            if use_cfg_sampler:
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='MaskPseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)

        self._init_layers()

    def _init_layers(self):
        """Create the kernel conv, the semantic conv and the feature convs.

        * ``init_kernels``: bias-free conv mapping ``out_channels`` to
          ``num_proposals`` mask channels.
        * ``conv_seg`` (only with ``semantic_fpn``): 1x1 conv with
          ``num_classes`` outputs for a sigmoid seg loss, else one extra.
        * ``ins_downsample`` / ``seg_downsample``: 3x3 refinement convs,
          created only when ``feat_downsample_stride > 1`` and
          ``feat_refine`` is set.
        * ``loc_convs`` / ``seg_convs``: stacks of 1x1 ``ConvModule``s.
        """
        kernel_padding = int(self.conv_kernel_size // 2)
        self.init_kernels = nn.Conv2d(
            self.out_channels,
            self.num_proposals,
            self.conv_kernel_size,
            padding=kernel_padding,
            bias=False)

        if self.semantic_fpn:
            # Non-sigmoid losses need one additional (background) channel.
            if self.loss_seg.use_sigmoid:
                seg_channels = self.num_classes
            else:
                seg_channels = self.num_classes + 1
            self.conv_seg = nn.Conv2d(self.out_channels, seg_channels, 1)

        if self.feat_downsample_stride > 1 and self.feat_refine:
            refine_kwargs = dict(
                stride=self.feat_refine_stride,
                padding=1,
                norm_cfg=self.norm_cfg)
            self.ins_downsample = ConvModule(
                self.in_channels, self.out_channels, 3, **refine_kwargs)
            self.seg_downsample = ConvModule(
                self.in_channels, self.out_channels, 3, **refine_kwargs)

        self.loc_convs = nn.ModuleList([
            ConvModule(
                self.in_channels,
                self.out_channels,
                1,
                norm_cfg=self.norm_cfg) for _ in range(self.num_loc_convs)
        ])

        self.seg_convs = nn.ModuleList([
            ConvModule(
                self.in_channels,
                self.out_channels,
                1,
                norm_cfg=self.norm_cfg) for _ in range(self.num_seg_convs)
        ])

    def init_weights(self):
        """Initialise the neck, the feature convs, ``conv_seg`` and kernels.

        The loc/seg convs are re-initialised (normal, std 0.01) only when
        ``feat_downsample_stride > 1`` and ``conv_normal_init`` is set. The
        kernels use Xavier-uniform when ``xavier_init_kernel`` is set,
        otherwise a normal distribution with ``kernel_init_std``.
        """
        self.localization_fpn.init_weights()
        logger = get_root_logger()

        if self.feat_downsample_stride > 1 and self.conv_normal_init:
            logger.info('Initialize convs in KPN head by normal std 0.01')
            conv_layers = [
                module
                for stack in (self.loc_convs, self.seg_convs)
                for module in stack.modules()
                if isinstance(module, nn.Conv2d)
            ]
            for layer in conv_layers:
                normal_init(layer, std=0.01)

        if self.semantic_fpn:
            bias_seg = bias_init_with_prob(0.01)
            if not self.loss_seg.use_sigmoid:
                normal_init(self.conv_seg, mean=0, std=0.01)
            else:
                # Sigmoid outputs start from a low prior probability.
                normal_init(self.conv_seg, std=0.01, bias=bias_seg)

        if not self.xavier_init_kernel:
            logger.info(
                f'Initialize kernels by normal std: {self.kernel_init_std}')
            normal_init(self.init_kernels, mean=0, std=self.kernel_init_std)
        else:
            logger.info('Initialize kernels by xavier uniform')
            nn.init.xavier_uniform_(self.init_kernels.weight)
    def _decode_init_proposals(self, img, img_metas, ref_img_metas):
        """Compute initial kernels, features and mask/seg predictions.

        Args:
            img: Input handed to ``localization_fpn``.
            img_metas: One entry per clip; only its length is used.
            ref_img_metas: Per-clip frame metas; ``len(ref_img_metas[0])``
                gives the number of frames per clip.

        Returns:
            tuple: ``(proposal_feats, x_feats, mask_preds, cls_scores,
            seg_preds)``. ``cls_scores`` is always ``None`` here and
            ``seg_preds`` is ``None`` when no semantic features exist.
        """
        num_imgs = len(img_metas)
        num_frames = len(ref_img_metas[0])
        total_frames = num_imgs * num_frames

        # Necks whose class name ends in '3D' also take the clip layout.
        neck = self.localization_fpn
        if neck.__class__.__name__.endswith('3D'):
            localization_feats = neck(img, num_imgs, num_frames)
        else:
            localization_feats = neck(img)
        feats_are_list = isinstance(localization_feats, list)
        refine = self.feat_downsample_stride > 1 and self.feat_refine

        # Localization branch -> per-proposal mask logits.
        loc_feats = localization_feats[0] if feats_are_list \
            else localization_feats
        for layer in self.loc_convs:
            loc_feats = layer(loc_feats)
        if refine:
            loc_feats = self.ins_downsample(loc_feats)
        mask_preds = self.init_kernels(loc_feats)

        # Semantic branch (optional).
        semantic_feats = None
        if self.semantic_fpn:
            semantic_feats = localization_feats[1] if feats_are_list \
                else localization_feats
            for layer in self.seg_convs:
                semantic_feats = layer(semantic_feats)
            if refine:
                semantic_feats = self.seg_downsample(semantic_feats)

        has_semantic = semantic_feats is not None
        seg_preds = self.conv_seg(semantic_feats) if has_semantic else None

        # One copy of the kernel weights per frame of every clip.
        kernel_weight = self.init_kernels.weight.clone()
        proposal_feats = kernel_weight[None].expand(total_frames,
                                                    *kernel_weight.size())

        x_feats = semantic_feats + loc_feats if has_semantic else loc_feats

        if self.proposal_feats_with_obj:
            sigmoid_masks = mask_preds.sigmoid()
            foreground = sigmoid_masks > 0.5
            if self.use_binary:
                sigmoid_masks = foreground.float()
            else:
                sigmoid_masks = foreground.float() * sigmoid_masks
            obj_feats = torch.einsum('bnhw,bchw->bnc', sigmoid_masks, x_feats)
            proposal_feats = proposal_feats + obj_feats.view(
                total_frames, self.num_proposals, self.out_channels, 1, 1)

        cls_scores = None

        if self.cat_stuff_mask and not self.training:
            # Append stuff predictions and the matching conv_seg kernels.
            stuff_start = self.num_thing_classes
            mask_preds = torch.cat([mask_preds, seg_preds[:, stuff_start:]],
                                   dim=1)
            stuff_kernels = self.conv_seg.weight[stuff_start:].clone()
            stuff_kernels = stuff_kernels[None].expand(total_frames,
                                                       *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return proposal_feats, x_feats, mask_preds, cls_scores, seg_preds

    def forward_train(self,
                      img,
                      img_metas,
                      ref_img_metas,
                      gt_masks,
                      gt_labels,
                      gt_instance_ids=None,
                      gt_sem_seg=None,
                      gt_sem_cls=None):
        """Forward function in training stage.

        Args:
            img: Input forwarded to ``_decode_init_proposals``.
            img_metas: One entry per clip; its length is the clip count.
            ref_img_metas: Per-clip frame metas; ``len(ref_img_metas[0])``
                is the number of frames per clip.
            gt_masks: Per-clip ground-truth masks passed to the assigner.
            gt_labels: Per-clip ground-truth labels passed to the assigner.
            gt_instance_ids: Per-clip instance ids; required.
            gt_sem_seg: Optional semantic masks forwarded to ``get_targets``.
            gt_sem_cls: Optional semantic classes forwarded to
                ``get_targets``.

        Returns:
            tuple: ``(losses, proposal_feats, x_feats, mask_preds,
            cls_scores)``.
        """
        assert gt_instance_ids is not None
        num_imgs = len(img_metas)
        num_frames = len(ref_img_metas[0])
        results = self._decode_init_proposals(img, img_metas, ref_img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores, seg_preds) = results

        # seg_preds is None when the semantic branch is disabled; keep the
        # scaled version None as well so the seg loss is simply skipped.
        scaled_seg_preds = None
        if self.feat_downsample_stride > 1:
            scaled_mask_preds = F.interpolate(
                mask_preds,
                scale_factor=self.feat_downsample_stride,
                mode='bilinear',
                align_corners=False)
            if seg_preds is not None:
                scaled_seg_preds = F.interpolate(
                    seg_preds,
                    scale_factor=self.feat_downsample_stride,
                    mode='bilinear',
                    align_corners=False)
        else:
            scaled_mask_preds = mask_preds
            scaled_seg_preds = seg_preds

        if self.hard_target:
            gt_masks = [x.bool().float() for x in gt_masks]

        if cls_scores is None:
            detached_cls_scores = [None] * num_imgs
        else:
            detached_cls_scores = cls_scores.detach()

        # (clips * frames, Q, H, W) -> (clips, frames, Q, H, W)
        scaled_mask_preds = scaled_mask_preds.reshape(
            (num_imgs, num_frames, *scaled_mask_preds.size()[1:]))
        num_bboxes = scaled_mask_preds.size(2)
        h, w = scaled_mask_preds.shape[-2:]

        if scaled_seg_preds is not None:
            # Stack the frames of a clip along the height axis:
            # (clips * frames, S, H, W) -> (clips, S, frames * H, W)
            num_cls = scaled_seg_preds.size(1)
            scaled_seg_preds = scaled_seg_preds.reshape(
                (num_imgs, num_frames, *scaled_seg_preds.size()[1:]))
            scaled_seg_preds = torch.einsum(
                'nfshw->nsfhw', scaled_seg_preds).reshape(
                    (num_imgs, num_cls, num_frames * h, w))

        sampling_results = []
        pred_masks_concat = []
        for i in range(num_imgs):
            assign_result, gt_masks_match = self.assigner.assign(
                scaled_mask_preds[i].detach(), detached_cls_scores[i],
                gt_masks[i], gt_labels[i], gt_instance_ids[i])
            # (frames, Q, H, W) -> (Q, frames * H, W), one tube per proposal.
            pred_masks_match = torch.einsum(
                'fqhw->qfhw', scaled_mask_preds[i]).reshape(
                    (num_bboxes, -1, w))
            sampling_result = self.sampler.sample(assign_result,
                                                  pred_masks_match,
                                                  gt_masks_match)
            sampling_results.append(sampling_result)
            pred_masks_concat.append(pred_masks_match)
        pred_masks_concat = torch.stack(pred_masks_concat)

        mask_targets = self.get_targets(
            sampling_results,
            self.train_cfg,
            True,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls)

        losses = self.loss(pred_masks_concat, cls_scores, scaled_seg_preds,
                           None, *mask_targets)

        if self.cat_stuff_mask and self.training:
            # Append stuff predictions and the matching conv_seg kernels.
            mask_preds = torch.cat(
                [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
            stuff_kernels = self.conv_seg.weight[
                self.num_thing_classes:].clone()
            stuff_kernels = stuff_kernels[None].expand(
                num_imgs * num_frames, *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return losses, proposal_feats, x_feats, mask_preds, cls_scores

    def loss(self,
             mask_pred,
             cls_scores,
             seg_preds,
             proposal_feats,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             seg_targets,
             reduction_override=None,
             **kwargs):
        """Compute the losses of the initial kernel predictions.

        Args:
            mask_pred: Mask logits; the first two dims are flattened into
                the prediction axis and the last two are the spatial dims.
            cls_scores: Optional classification scores sharing the first
                two dims of ``mask_pred``; skipped when ``None``.
            seg_preds: Optional semantic logits with the class axis at
                dim 1; skipped when ``None``.
            proposal_feats: Unused.
            labels: Flat label per prediction; ``num_classes`` marks
                background.
            label_weights: Flat weights for the classification loss.
            mask_targets: Flat mask targets aligned with ``labels``.
            mask_weights: Unused.
            seg_targets: Semantic targets matching ``seg_preds`` spatially.
            reduction_override: Forwarded to the classification loss.

        Returns:
            dict: ``loss_rpn_mask`` and ``loss_rpn_dice`` always; plus
            ``loss_rpn_cls``/``rpn_pos_acc``, ``loss_rpn_rank`` and
            ``loss_rpn_seg`` when the corresponding inputs/losses exist.
        """
        losses = dict()
        bg_class_ind = self.num_classes
        # note in spare rcnn num_gt == num_pos
        pos_inds = (labels >= 0) & (labels < bg_class_ind)
        num_preds = mask_pred.shape[0] * mask_pred.shape[1]

        if cls_scores is not None:
            num_pos = pos_inds.sum().float()
            avg_factor = reduce_mean(num_pos)
            assert mask_pred.shape[0] == cls_scores.shape[0]
            assert mask_pred.shape[1] == cls_scores.shape[1]
            losses['loss_rpn_cls'] = self.loss_cls(
                cls_scores.view(num_preds, -1),
                labels,
                label_weights,
                avg_factor=avg_factor,
                reduction_override=reduction_override)
            losses['rpn_pos_acc'] = accuracy(
                cls_scores.view(num_preds, -1)[pos_inds], labels[pos_inds])

        bool_pos_inds = pos_inds.type(torch.bool)
        # 0~self.num_classes-1 are FG, self.num_classes is BG
        H, W = mask_pred.shape[-2:]
        if pos_inds.any():
            pos_mask_pred = mask_pred.reshape(num_preds, H, W)[bool_pos_inds]
            pos_mask_targets = mask_targets[bool_pos_inds]
            losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred,
                                                     pos_mask_targets)
            losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred,
                                                     pos_mask_targets)

            if self.loss_rank is not None:
                batch_size = mask_pred.size(0)
                # Per-pixel index of the positive proposal covering it;
                # uncovered pixels keep the ignore label.
                rank_target = mask_targets.new_full((batch_size, H, W),
                                                    self.ignore_label,
                                                    dtype=torch.long)
                rank_inds = pos_inds.view(batch_size,
                                          -1).nonzero(as_tuple=False)
                batch_mask_targets = mask_targets.view(batch_size, -1, H,
                                                       W).bool()
                for i in range(batch_size):
                    curr_inds = (rank_inds[:, 0] == i)
                    curr_rank = rank_inds[:, 1][curr_inds]
                    for j in curr_rank:
                        rank_target[i][batch_mask_targets[i][j]] = j
                losses['loss_rpn_rank'] = self.loss_rank(
                    mask_pred, rank_target, ignore_index=self.ignore_label)

        else:
            # Zero-valued losses keep the graph connected and the set of
            # keys identical to the branch above.
            losses['loss_rpn_mask'] = mask_pred.sum() * 0
            losses['loss_rpn_dice'] = mask_pred.sum() * 0
            if self.loss_rank is not None:
                # Same key as the positive branch (was 'loss_rank'), so the
                # loss dict does not change shape between iterations/ranks.
                losses['loss_rpn_rank'] = mask_pred.sum() * 0

        if seg_preds is not None:
            # (B, S, H, W) -> (B * H * W, S) with a flat target per pixel.
            cls_channel = seg_preds.shape[1]
            flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute(
                0, 2, 1).reshape(-1, cls_channel)
            flatten_seg_target = seg_targets.view(-1)
            if self.loss_seg.use_sigmoid:
                num_dense_pos = (flatten_seg_target >= 0) & (
                    flatten_seg_target < bg_class_ind)
                num_dense_pos = num_dense_pos.sum().float().clamp(min=1.0)
                losses['loss_rpn_seg'] = self.loss_seg(
                    flatten_seg,
                    flatten_seg_target,
                    avg_factor=num_dense_pos)
            else:
                losses['loss_rpn_seg'] = self.loss_seg(flatten_seg,
                                                       flatten_seg_target)

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):
        """Build the training targets for one sample.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights,
            seg_targets)``. Labels default to ``num_classes`` (background)
            and ``seg_targets`` is an ``(H, W)`` long map that also defaults
            to ``num_classes``.
        """
        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        total = num_pos + num_neg
        height, width = pos_mask.shape[-2:]

        # BG cat_id = num_classes, FG cat_id = [0, num_classes - 1], so the
        # label tensors are filled with num_classes rather than zeros.
        labels = pos_mask.new_full((total, ), self.num_classes,
                                   dtype=torch.long)
        seg_targets = pos_mask.new_full((height, width), self.num_classes,
                                        dtype=torch.long)
        label_weights = pos_mask.new_zeros(total)
        mask_targets = pos_mask.new_zeros(total, height, width)
        mask_weights = pos_mask.new_zeros(total, height, width)

        # Semantic classes are painted first; instances overwrite them below.
        if gt_sem_cls is not None and gt_sem_seg is not None:
            gt_sem_seg = gt_sem_seg.bool()
            for region, region_cls in zip(gt_sem_seg, gt_sem_cls):
                seg_targets[region] = region_cls.long()

        if num_pos > 0:
            labels[pos_inds] = pos_gt_labels
            if cfg.pos_weight <= 0:
                pos_weight = 1.0
            else:
                pos_weight = cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            mask_targets[pos_inds, ...] = pos_gt_mask
            mask_weights[pos_inds, ...] = 1
            for idx in range(num_pos):
                inst_region = pos_gt_mask[idx].bool()
                seg_targets[inst_region] = pos_gt_labels[idx]

        if num_neg > 0:
            label_weights[neg_inds] = 1.0

        return labels, label_weights, mask_targets, mask_weights, seg_targets

    def get_targets(self,
                    sampling_results,
                    rpn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None):
        """Compute targets for a batch of sampling results.

        Args:
            sampling_results (list): One sampling result per sample; its
                ``pos_inds``, ``neg_inds``, ``pos_masks``, ``neg_masks``,
                ``pos_gt_masks`` and ``pos_gt_labels`` are read.
            rpn_train_cfg: Training config passed to every per-sample call.
            concat (bool): When true, concatenate the per-sample results
                (``seg_targets`` is stacked instead).
            gt_sem_seg (list | None): Optional per-sample semantic masks.
            gt_sem_cls (list | None): Optional per-sample semantic classes.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights,
            seg_targets)``; tensors when ``concat`` is true, else lists.
        """
        num_imgs = len(sampling_results)
        pos_inds_list = [res.pos_inds for res in sampling_results]
        neg_inds_list = [res.neg_inds for res in sampling_results]
        pos_mask_list = [res.pos_masks for res in sampling_results]
        neg_mask_list = [res.neg_masks for res in sampling_results]
        pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
        if gt_sem_seg is None:
            gt_sem_seg = [None] * num_imgs
            gt_sem_cls = [None] * num_imgs
        elif gt_sem_cls is None:
            # multi_apply iterates every argument, so a bare None would
            # raise; the per-sample routine already accepts a None entry.
            gt_sem_cls = [None] * num_imgs
        results = multi_apply(
            self._get_target_single,
            pos_inds_list,
            neg_inds_list,
            pos_mask_list,
            neg_mask_list,
            pos_gt_mask_list,
            pos_gt_labels_list,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rpn_train_cfg)
        (labels, label_weights, mask_targets, mask_weights,
         seg_targets) = results
        if concat:
            labels = torch.cat(labels, 0)
            label_weights = torch.cat(label_weights, 0)
            mask_targets = torch.cat(mask_targets, 0)
            mask_weights = torch.cat(mask_weights, 0)
            seg_targets = torch.stack(seg_targets, 0)
        return labels, label_weights, mask_targets, mask_weights, seg_targets

    def simple_test_rpn(self, img, img_metas, ref_img_metas):
        """Run the initial proposal decoding at test time.

        Returns the tuple produced by ``_decode_init_proposals`` unchanged.
        """
        proposals = self._decode_init_proposals(img, img_metas, ref_img_metas)
        return proposals

    def forward_dummy(self, img, img_metas, ref_img_metas):
        """Plain forward pass without losses.

        Used in flops calculation; returns the tuple produced by
        ``_decode_init_proposals`` unchanged.
        """
        outputs = self._decode_init_proposals(img, img_metas, ref_img_metas)
        return outputs


================================================
FILE: knet_vis/tracker/kernel_frame_iter_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_norm_layer
from mmcv.cnn.bricks.transformer import MultiheadAttention, FFN

from mmdet.core import build_assigner, build_sampler
from mmdet.models.builder import HEADS, build_head
from mmdet.models.roi_heads import BaseRoIHead

from mmdet.utils import get_root_logger

@HEADS.register_module()
class KernelFrameIterHeadVideo(BaseRoIHead):
    def __init__(self,
                 mask_head=None,
                 with_mask_init=False,
                 num_stages=3,
                 stage_loss_weights=(1, 1, 1),
                 proposal_feature_channel=256,
                 assign_stages=5,
                 num_proposals=100,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 query_merge_method='mean',
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 init_cfg=None,
                 **kwargs):
        """Store the stage configuration and build the query-merge modules.

        The plain attributes are assigned before the parent constructor
        runs. For ``query_merge_method`` equal to ``'attention'`` or
        ``'attention_pos'`` an attention + FFN block with learned queries is
        created (``'attention_pos'`` additionally creates ``query_pos``).
        ``with_mask_init`` adds a linear layer ``fc_mask``.
        """
        assert len(stage_loss_weights) == num_stages
        self.num_stages = num_stages
        self.stage_loss_weights = stage_loss_weights
        self.assign_stages = assign_stages
        self.num_proposals = num_proposals
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.query_merge_method = query_merge_method
        self.proposal_feature_channel = proposal_feature_channel
        super().__init__(
            mask_head=mask_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            init_cfg=init_cfg,
            **kwargs
        )

        if self.query_merge_method in ('attention', 'attention_pos'):
            channels = self.proposal_feature_channel
            num_attn_heads = 8
            attn_drop = 0.
            self.init_query = nn.Embedding(self.num_proposals, channels)
            if self.query_merge_method == 'attention_pos':
                # Created right after init_query to keep registration order.
                self.query_pos = nn.Embedding(self.num_proposals, channels)
            self.query_merge_attn = MultiheadAttention(
                channels, num_attn_heads, attn_drop, batch_first=True)
            self.query_merge_norm = build_norm_layer(
                dict(type='LN'), channels)[1]
            self.query_merge_ffn = FFN(
                channels,
                channels * 8,
                num_ffn_fcs=2,
                act_cfg=dict(type='ReLU', inplace=True),
                ffn_drop=0.)
            self.query_merge_ffn_norm = build_norm_layer(
                dict(type='LN'), channels)[1]

        self.with_mask_init = with_mask_init
        if self.with_mask_init:
            self.fc_mask = nn.Linear(proposal_feature_channel,
                                     proposal_feature_channel)

        self.logger = get_root_logger()

    def init_mask_head(self, bbox_roi_extractor=None, mask_head=None):
        """Build one mask head per stage into ``self.mask_head``.

        A single config is reused for every stage. Each stage config gets
        ``with_cls`` set to whether its index is below ``assign_stages``
        before it is built. ``bbox_roi_extractor`` must be ``None``.
        """
        assert bbox_roi_extractor is None
        self.mask_head = nn.ModuleList()
        if isinstance(mask_head, list):
            stage_cfgs = mask_head
        else:
            # The same config object is shared by all stages.
            stage_cfgs = [mask_head] * self.num_stages
        assert len(stage_cfgs) == self.num_stages
        for stage_idx, stage_cfg in enumerate(stage_cfgs):
            has_cls = stage_idx < self.assign_stages
            stage_cfg.update(with_cls=has_cls)
            self.mask_head.append(build_head(stage_cfg))

    def init_assigner_sampler(self):
        """Create one assigner and one sampler per stage.

        Both lists stay empty when ``train_cfg`` is ``None``.
        """
        self.mask_assigner = []
        self.mask_sampler = []
        if self.train_cfg is None:
            return
        for stage_idx in range(self.num_stages):
            assigner = build_assigner(self.train_cfg.assigner)
            self.mask_assigner.append(assigner)
            # Set before the sampler is built, which receives this head as
            # its context.
            self.current_stage = stage_idx
            sampler = build_sampler(self.train_cfg.sampler, context=self)
            self.mask_sampler.append(sampler)

    def init_bbox_head(self, mask_roi_extractor, mask_head):
        """Box heads are not supported by this head.

        Args:
            mask_roi_extractor: Unused.
            mask_head: Unused.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def _mask_forward(self, stage, x, object_feats, mask_preds):
        """Run the mask head of one stage and optionally upsample its masks.

        The masks are upsampled by the head's ``mask_upsample_stride`` when
        that stride exceeds 1 and the stage is the last one or the module is
        in training mode.

        Returns:
            dict: ``cls_score``, ``mask_preds``, ``scaled_mask_preds`` and
            ``object_feats``.
        """
        head = self.mask_head[stage]

        # Positional queries are only defined for the 'attention_pos' merge.
        pos = None
        if self.query_merge_method == 'attention_pos':
            pos = self.query_pos.weight

        cls_score, mask_preds, object_feats = head(
            x, object_feats, mask_preds, img_metas=None, pos=pos)

        is_last_stage = stage == self.num_stages - 1
        if head.mask_upsample_stride > 1 and (is_last_stage or self.training):
            # Interpolate one leading-dim slice at a time, then re-stack.
            upsampled = []
            for idx in range(mask_preds.size(0)):
                upsampled.append(
                    F.interpolate(
                        mask_preds[idx],
                        scale_factor=head.mask_upsample_stride,
                        align_corners=False,
                        mode='bilinear'))
            scaled_mask_preds = torch.stack(upsampled)
        else:
            scaled_mask_preds = mask_preds

        return {
            'cls_score': cls_score,
            'mask_preds': mask_preds,
            'scaled_mask_preds': scaled_mask_preds,
            'object_feats': object_feats,
        }

    def _query_fusion(self, obj_feats, num_imgs, num_frames):
        """Merge per-frame object features into one set per clip.

        Args:
            obj_feats: Per-frame kernels; dim 1 is averaged for ``'mean'``.
                The attention variants require the last two dims to be
                ``(1, 1)`` and reshape the input to
                ``(num_imgs, num_frames * num_proposals, channels)``.
            num_imgs (int): Number of clips.
            num_frames (int): Number of frames per clip.

        Returns:
            The merged object features. The attention variants return them
            with two trailing singleton dims.

        Raises:
            ValueError: If ``query_merge_method`` is not one of ``'mean'``,
                ``'attention'`` or ``'attention_pos'``.
        """
        method = self.query_merge_method
        if method == 'mean':
            return obj_feats.mean(1)
        if method not in ('attention', 'attention_pos'):
            # Previously this fell through to an UnboundLocalError.
            raise ValueError(
                f'Unsupported query_merge_method: {method!r}')

        assert obj_feats.size()[-2:] == (1, 1), "Only supporting kernel size = 1"
        obj_feats = obj_feats.reshape(
            (num_imgs, num_frames * self.num_proposals,
             self.proposal_feature_channel))
        init_query = self.init_query.weight.expand(
            num_imgs, *self.init_query.weight.size())

        pos_kwargs = {}
        if method == 'attention_pos':
            # The same positional embedding is tiled over the frames so that
            # it lines up with the flattened keys.
            query_pos = self.query_pos.weight.repeat(num_imgs, 1, 1)
            key_pos = query_pos.repeat(1, num_frames, 1)
            pos_kwargs = dict(query_pos=query_pos, key_pos=key_pos)

        obj_feats = self.query_merge_attn(
            query=init_query, key=obj_feats, value=obj_feats, **pos_kwargs)
        obj_feats = self.query_merge_norm(obj_feats)
        object_feats = self.query_merge_ffn_norm(
            self.query_merge_ffn(obj_feats))
        return object_feats[..., None, None]

    def _mask_init(self, object_feats, x_feats, num_imgs):
        """Predict initial masks by convolving features with the kernels.

        ``object_feats`` must have trailing dims ``(1, 1)``; it is projected
        by ``fc_mask`` and used as a 1x1 conv weight on ``x_feats[i]`` for
        each of the ``num_imgs`` entries. The results are stacked on dim 0.
        """
        assert object_feats.size()[-2:] == (1, 1), "Only supporting kernel size = 1"
        flat_feats = object_feats.flatten(-3, -1)  # BNCKK -> BNC
        kernels = self.fc_mask(flat_feats)[..., None, None]
        per_clip_masks = [
            F.conv2d(x_feats[idx], kernels[idx], padding=0)
            for idx in range(num_imgs)
        ]
        return torch.stack(per_clip_masks, dim=0)

    def forward_train(self,
                      x,
                      ref_img_metas,
                      cls_scores,
                      masks,
                      obj_feats,
                      ref_gt_masks,
                      ref_gt_labels,
                      ref_gt_instance_ids,
                      **kwargs):
        num_imgs = len(ref_img_metas)
        num_frames = len(ref_img_metas[0])
        if len(obj_feats.size()) == 6:
            object_feats = self._query_fusion(obj_feats, num_imgs, num_frames)
        else:
            object_feats = obj_feats

        all_stage_loss = {}
        if self.with_mask_init:
            mask_preds = self._mask_init(object_feats, x, num_imgs)
            assert self.training
            if self.mask_head[0].mask_upsample_stride > 1:
                scaled_mask_preds = [
                    F.interpolate(
                        mask_preds[i],
                        scale_factor=self.mask_head[0].mask_upsample_stride,
                        align_corners=False,
                        mode='bilinear'
                    ) for i in range(mask_preds.size(0))
                ]
                scaled_mask_preds = torch.stack(scaled_mask_preds)
            else:
                scaled_mask_preds = mask_preds
            _gt_masks_matches = []
            _assign_results = []
            _sampling_results = []
            _pred_masks_concat = []
            for i in range(num_imgs):
                mask_for_assign = scaled_mask_preds[i][:self.num_proposals].detach()
                cls_for_assign = None
                assign_result, gt_masks_match = self.mask_assigner[0].assign(
                    mask_for_assign, cls_for_assign, ref_gt_masks[i], ref_gt_labels[i], ref_gt_instance_ids[i])
                _gt_masks_matches.append(gt_masks_match)
                _assign_results.append(assign_result)
                num_bboxes = scaled_mask_preds.size(2)
                h, w = scaled_mask_preds.shape[-2:]
                pred_masks_match = torch.einsum('fqhw->qfhw', scaled_mask_preds[i]).reshape((num_bboxes, -1, w))
                sampling_result = self.mask_sampler[0].sample(
                    assign_result, pred_masks_match, gt_masks_match)
                _sampling_results.append(sampling_result)
                _pred_masks_concat.append(pred_masks_match)
            pred_masks_concat = torch.stack(_pred_masks_concat)
            mask_targets = self.mask_head[0].get_targets(
                _sampling_results,
                self.train_cfg,
                True,
                gt_sem_seg=None,
                gt_sem_cls=None
            )

            single_stage_loss = self.mask_head[0].loss(
                object_feats,
                None,
                pred_masks_concat,
                *mask_targets)
            for key, value in single_stage_loss.items():
                all_stage_loss[f'tracker_init_{key}'] = value * self.stage_loss_weights[0]
        else:
            mask_preds = masks


        assign_results = []
        for stage in range(self.num_stages):
            if stage == self.assign_stages:
                object_feats = object_feats[:, None].repeat(1, num_frames, 1, 1, 1, 1)
            mask_results = self._mask_forward(stage, x, object_feats, mask_preds)
            mask_preds = mask_results['mask_preds']
            scaled_mask_preds = mask_results['scaled_mask_preds']
            cls_score = mask_results['cls_score']
            object_feats = mask_results['object_feats']

            prev_mask_preds = scaled_mask_preds.detach()
            prev_cls_score = cls_score.detach() if cls_score is not None else None

            sampling_results = []
            pred_masks_concat = []
            if stage < self.assign_stages:
                assign_results = []
                gt_masks_matches = []
            for i in range(num_imgs):
                if stage < self.assign_stages:
                    mask_for_assign = prev_mask_preds[i][:, :self.num_proposals]
                    if prev_cls_score is not None:
                        cls_for_assign = prev_cls_score[i][:self.num_proposals, :self.num_thing_classes]
                    else:
                        cls_for_assign = None
                    assign_result, gt_masks_match = self.mask_assigner[stage].assign(
                        mask_for_assign, cls_for_assign, ref_gt_masks[i], ref_gt_labels[i], ref_gt_instance_ids[i])
                    gt_masks_matches.append(gt_masks_match)
                    assign_results.append(assign_result)
                num_bboxes = scaled_mask_preds.size(2)
                h, w = scaled_mask_preds.shape[-2:]
                pred_masks_match = torch.einsum('fqhw->qfhw', scaled_mask_preds[i]).reshape((num_bboxes, -1, w))
                sampling_result = self.mask_sampler[stage].sample(
                    assign_results[i], pred_masks_match, gt_masks_matches[i])
                sampling_results.append(sampling_result)
                pred_masks_concat.append(pred_masks_match)
            pred_masks_concat = torch.stack(pred_masks_concat)
            mask_targets = self.mask_head[stage].get_targets(
                sampling_results,
                self.train_cfg,
                True,
                gt_sem_seg=None,
                gt_sem_cls=None
            )

            single_stage_loss = self.mask_head[stage].loss(
                object_feats,
                cls_score,
                pred_masks_concat,
                *mask_targets)
            for key, value in single_stage_loss.items():
                all_stage_loss[f'tracker_s{stage}_{key}'] = value * self.stage_loss_weights[stage]

        features = {
            "obj_feats": object_feats,
            "x_feats": x,
            "cls_scores": cls_score,
            "masks": mask_preds,
        }
        return all_stage_loss, features

    def simple_test(self,
                    x,
                    img_metas,
                    ref_img_metas,
                    cls_scores,
                    masks,
                    obj_feats,
                    **kwargs):
        """Run all refinement stages and decode per-frame results.

        Args:
            x: Features forwarded unchanged to ``_mask_init`` and
                ``_mask_forward``.
            img_metas: Indexable by image; entry ``img_id`` is handed to
                ``get_seg_masks_tracking``.
            ref_img_metas: Nested sequence; ``len(ref_img_metas)`` is taken as
                the number of images and ``len(ref_img_metas[0])`` as the
                number of frames.
            cls_scores: Not used by this method.
            masks: Starting mask predictions when ``with_mask_init`` is falsy.
            obj_feats: Query features; a 6-D tensor is first passed through
                ``_query_fusion``.
            **kwargs: Ignored.

        Returns:
            tuple: ``(results, features)``. ``results`` has one list per image
            holding one ``get_seg_masks_tracking`` output per frame.
            ``features`` is a dict with the final ``obj_feats``, the input
            ``x`` (``x_feats``), the activated class scores (``cls_scores``)
            and the last stage's ``masks``.
        """
        num_imgs = len(ref_img_metas)
        num_frames = len(ref_img_metas[0])

        # Only 6-D query tensors go through the fusion step.
        object_feats = obj_feats
        if obj_feats.dim() == 6:
            object_feats = self._query_fusion(obj_feats, num_imgs, num_frames)

        mask_preds = (self._mask_init(object_feats, x, num_imgs)
                      if self.with_mask_init else masks)

        cls_score = None
        for stage in range(self.num_stages):
            if stage == self.assign_stages:
                # From this stage on the queries get an extra axis of length
                # ``num_frames`` inserted at position 1.
                object_feats = object_feats[:, None].repeat(
                    1, num_frames, 1, 1, 1, 1)
            stage_out = self._mask_forward(stage, x, object_feats, mask_preds)
            mask_preds = stage_out['mask_preds']
            scaled_mask_preds = stage_out['scaled_mask_preds']
            # Keep the most recent score that is not None.
            if stage_out['cls_score'] is not None:
                cls_score = stage_out['cls_score']
            object_feats = stage_out['object_feats']

        last_head = self.mask_head[-1]
        num_classes = last_head.num_classes
        if last_head.loss_cls.use_sigmoid:
            cls_score = cls_score.sigmoid()
        else:
            # Drop the trailing channel after the softmax.
            cls_score = cls_score.softmax(-1)[..., :-1]

        max_per_img = self.test_cfg.max_per_img
        results = []
        for img_id in range(num_imgs):
            # The score table is flattened before the top-k, so one query can
            # be picked several times with different labels.
            scores_per_img, topk_indices = cls_score[img_id].flatten(
                0, 1).topk(max_per_img, sorted=True)
            mask_indices = topk_indices // num_classes
            # Use the following when torch >= 1.9.0
            # mask_indices = torch.div(topk_indices, num_classes, rounding_mode='floor')
            labels_per_img = topk_indices % num_classes
            masks_of_img = scaled_mask_preds[img_id]
            results.append([
                last_head.get_seg_masks_tracking(
                    masks_of_img[frame_id][mask_indices], labels_per_img,
                    scores_per_img, torch.arange(max_per_img),
                    self.test_cfg, img_metas[img_id])
                for frame_id in range(num_frames)
            ])

        features = dict(
            obj_feats=object_feats,
            x_feats=x,
            cls_scores=cls_score,
            masks=mask_preds,
        )
        return results, features

    def init_weights(self):
        """Initialize weights, loading from a prefixed checkpoint if asked.

        When ``self.init_cfg`` is a dict whose ``type`` is ``'Pretrained'``
        and which carries a non-None ``prefix``, the weights are loaded
        through ``mmcv.cnn.initialize``. In every other case (no init_cfg, a
        non-dict init_cfg, another type, or a missing/None ``prefix``) the
        parent class initialisation is used.
        """
        cfg = self.init_cfg
        # ``.get`` keeps a ``Pretrained`` config without a ``prefix`` key from
        # raising KeyError; it simply takes the default path below.
        use_prefixed_load = (
            isinstance(cfg, dict)
            and cfg.get('type') == 'Pretrained'
            and cfg.get('prefix') is not None)
        if use_prefixed_load:
            # Imported here rather than at module level, as before.
            from mmcv.cnn import initialize
            self.logger.info("Customized loading the tracker.")
            initialize(self, cfg)
        else:
            super().init_weights()


================================================
FILE: knet_vis/tracker/kernel_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (ConvModule, bias_init_with_prob, normal_init)
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
from mmdet.models.builder import HEADS, build_loss, build_neck
from mmdet.models.losses import accuracy
from mmdet.utils import get_root_logger


@HEADS.register_module()
class ConvKernelHeadVideo(nn.Module):
    def __init__(self,
                 num_proposals=100,
                 in_channels=256,
                 out_channels=256,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_seg_convs=1,
                 num_loc_convs=1,
                 att_dropout=False,
                 localization_fpn=None,
                 conv_kernel_size=1,
                 norm_cfg=dict(type='GN', num_groups=32),
                 semantic_fpn=True,
                 train_cfg=None,
                 num_classes=80,
                 xavier_init_kernel=False,
                 kernel_init_std=0.01,
                 use_binary=False,
                 proposal_feats_with_obj=False,
                 loss_mask=None,
                 loss_seg=None,
                 loss_cls=None,
                 loss_dice=None,
                 loss_rank=None,
                 feat_downsample_stride=1,
                 feat_refine_stride=1,
                 feat_refine=True,
                 with_embed=False,
                 feat_embed_only=False,
                 conv_normal_init=False,
                 mask_out_stride=4,
                 hard_target=False,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 cat_stuff_mask=False,
                 **kwargs):
        """Store the configuration and build the sub-modules of the head.

        Every named argument except ``localization_fpn`` and the ``loss_*``
        configs is stored unchanged on the instance under the same name.
        ``localization_fpn`` is built with ``build_neck``; each ``loss_*``
        config that is not None is built with ``build_loss`` (otherwise the
        attribute is None). When ``train_cfg`` is truthy an assigner and a
        sampler are built as well. ``**kwargs`` is accepted and ignored.
        """
        super().__init__()

        # Counts and channel sizes.
        self.num_proposals = num_proposals
        self.num_cls_fcs = num_cls_fcs
        self.num_loc_convs = num_loc_convs
        self.num_seg_convs = num_seg_convs
        self.num_heads = num_heads
        self.num_classes = num_classes
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.conv_kernel_size = conv_kernel_size

        # Configs that are kept as given.
        self.norm_cfg = norm_cfg
        self.train_cfg = train_cfg

        # Behaviour switches.
        self.att_dropout = att_dropout
        self.semantic_fpn = semantic_fpn
        self.proposal_feats_with_obj = proposal_feats_with_obj
        self.use_binary = use_binary
        self.hard_target = hard_target
        self.cat_stuff_mask = cat_stuff_mask
        self.with_embed = with_embed
        self.feat_embed_only = feat_embed_only
        self.feat_refine = feat_refine

        # Strides and label bookkeeping.
        self.feat_refine_stride = feat_refine_stride
        self.feat_downsample_stride = feat_downsample_stride
        self.mask_out_stride = mask_out_stride
        self.mask_assign_stride = mask_assign_stride
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg

        # Weight-initialisation options read by ``init_weights``.
        self.xavier_init_kernel = xavier_init_kernel
        self.kernel_init_std = kernel_init_std
        self.conv_normal_init = conv_normal_init

        # Fixed to False: the sampler chosen below is therefore always the
        # pseudo sampler.
        self.sampling = False

        self.localization_fpn = build_neck(localization_fpn)

        # Losses are built in a fixed order; a missing config leaves None.
        loss_cfgs = (
            ('loss_mask', loss_mask),
            ('loss_dice', loss_dice),
            ('loss_seg', loss_seg),
            ('loss_cls', loss_cls),
            ('loss_rank', loss_rank),
        )
        for attr_name, loss_cfg in loss_cfgs:
            built = build_loss(loss_cfg) if loss_cfg is not None else loss_cfg
            setattr(self, attr_name, built)

        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # use PseudoSampler when sampling is False
            take_cfg_sampler = self.sampling and hasattr(
                self.train_cfg, 'sampler')
            sampler_cfg = (self.train_cfg.sampler if take_cfg_sampler else
                           dict(type='MaskPseudoSampler'))
            self.sampler = build_sampler(sampler_cfg, context=self)

        self._init_layers()

    def _init_layers(self):
        """Initialize a sparse set of proposal boxes and proposal features."""
        self.init_kernels = nn.Conv2d(
            self.out_channels,
            self.num_proposals,
            self.conv_kernel_size,
            padding=int(self.conv_kernel_size // 2),
            bias=False)

        if self.semantic_fpn:
            if self.loss_seg.use_sigmoid:
                self.conv_seg = nn.Conv2d(self.out_channels, self.num_classes,
                                          1)
            else:
                self.conv_seg = nn.Conv2d(self.out_channels,
                                          self.num_classes + 1, 1)

        if self.feat_downsample_stride > 1 and self.feat_refine:
            self.ins_downsample = ConvModule(
                self.in_channels,
                self.out_channels,
                3,
                stride=self.feat_refine_stride,
                padding=1,
                norm_cfg=self.norm_cfg)
            self.seg_downsample = ConvModule(
                self.in_channels,
                self.out_channels,
                3,
                stride=self.feat_refine_stride,
                padding=1,
                norm_cfg=self.norm_cfg)

        self.loc_convs = nn.ModuleList()
        for i in range(self.num_loc_convs):
            self.loc_convs.append(
                ConvModule(
                    self.in_channels,
                    self.out_channels,
                    1,
                    norm_cfg=self.norm_cfg))

        self.seg_convs = nn.ModuleList()
        for i in range(self.num_seg_convs):
            self.seg_convs.append(
                ConvModule(
                    self.in_channels,
                    self.out_channels,
                    1,
                    norm_cfg=self.norm_cfg))

    def init_weights(self):
        """Initialize the neck, the conv stacks and the proposal kernels.

        The localization neck initialises itself. The ``loc_convs`` and
        ``seg_convs`` stacks are re-initialised with a normal distribution
        only when ``feat_downsample_stride > 1`` and ``conv_normal_init`` is
        set. ``conv_seg`` (present when ``semantic_fpn`` is set) and
        ``init_kernels`` are always initialised here.
        """
        self.localization_fpn.init_weights()
        logger = get_root_logger()

        if self.feat_downsample_stride > 1 and self.conv_normal_init:
            logger.info('Initialize convs in KPN head by normal std 0.01')
            for conv_stack in (self.loc_convs, self.seg_convs):
                conv_layers = [
                    m for m in conv_stack.modules()
                    if isinstance(m, nn.Conv2d)
                ]
                for layer in conv_layers:
                    normal_init(layer, std=0.01)

        if self.semantic_fpn:
            if self.loss_seg.use_sigmoid:
                # Sigmoid-based loss: also set the bias from a prior of 0.01.
                seg_init = dict(std=0.01, bias=bias_init_with_prob(0.01))
            else:
                seg_init = dict(mean=0, std=0.01)
            normal_init(self.conv_seg, **seg_init)

        if self.xavier_init_kernel:
            logger.info('Initialize kernels by xavier uniform')
            nn.init.xavier_uniform_(self.init_kernels.weight)
            return

        logger.info(
            f'Initialize kernels by normal std: {self.kernel_init_std}')
        normal_init(self.init_kernels, mean=0, std=self.kernel_init_std)

    def _decode_init_proposals(self, img, img_metas, ref_img_metas):
        num_imgs = len(img_metas)
        num_frames = len(ref_img_metas[0])

        if self.localization_fpn.__class__.__name__.endswith('3D'):
            localization_feats = self.localization_fpn(img, num_imgs, num_frames)
        else:
            localization_feats = self.localization_fpn(img)
        if isinstance(localization_feats, list):
            loc_feats = localization_feats[0]
        else:
            loc_feats = localization_feats
        for conv in self.loc_convs:
            loc_feats = conv(loc_feats)
        if self.feat_downsample_stride > 1 and self.feat_refine:
            loc_feats = self.ins_downsample(loc_feats)
        mask_preds = self.init_kernels(loc_feats)

        if self.semantic_fpn:
            if isinstance(localization_feats, list):
                semantic_feats = localization_feats[1]
            else:
                semantic_feats = localization_feats
            for conv in self.seg_convs:
                semantic_feats = conv(semantic_feats)
            if self.feat_downsample_stride > 1 and self.feat_refine:
                semantic_feats = self.seg_downsample(semantic_feats)
        else:
            semantic_feats = None

        if semantic_feats is not None:
            seg_preds = self.conv_seg(semantic_feats)
        else:
            seg_preds = None

        proposal_feats = self.init_kernels.weight.clone()
        proposal_feats = proposal_feats[None].expand(num_imgs * num_frames, *proposal_feats.size())

        if semantic_feats is not None:
            x_feats = semantic_feats + loc_feats
        else:
            x_feats = loc_feats

        if self.proposal_feats_with_obj:
            sigmoid_masks = mask_preds.sigmoid()
            nonzero_inds = sigmoid_masks > 0.5
            if self.use_binary:
                sigmoid_masks = nonzero_inds.float()
            else:
                sigmoid_masks = nonzero_inds.float() * sigmoid_masks
            obj_feats = torch.einsum('bnhw,bchw->bnc', sigmoid_masks, x_feats)

        cls_scores = None

        if self.proposal_feats_with_obj:
            proposal_feats = proposal_feats + obj_feats.view(
                num_imgs * num_frames, self.num_proposals, self.out_channels, 1, 1)

        if self.cat_stuff_mask and not self.training:
            mask_preds = torch.cat(
                [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
            stuff_kernels = self.conv_seg.weight[self.
                                                 num_thing_classes:].clone()
            stuff_kernels = stuff_kernels[None].expand(num_imgs * num_frames, *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return proposal_feats, x_feats, mask_preds, cls_scores, seg_preds

    def forward_train(self,
                      img,
                      img_metas,
                      ref_img_metas,
                      gt_masks,
                      gt_labels,
                      gt_instance_ids=None,
                      gt_sem_seg=None,
                      gt_sem_cls=None):
        """Forward function in training stage.

        Args:
            img: Input handed to ``_decode_init_proposals``.
            img_metas: Sequence whose length is the number of images.
            ref_img_metas: Nested sequence indexed ``[image][frame]``;
                ``len(ref_img_metas[0])`` is the number of frames.
            gt_masks: Ground-truth masks indexed ``[image][frame]``.
            gt_labels: Per-image 2-D tensors; column 0 is compared against the
                frame index and column 1 is passed to the assigner as labels.
            gt_instance_ids: Not used by this method.
            gt_sem_seg: Forwarded to ``get_targets``.
            gt_sem_cls: Forwarded to ``get_targets``.

        Returns:
            tuple: ``(losses, proposal_feats, x_feats, mask_preds,
            cls_scores)``.
        """
        num_imgs = len(img_metas)
        num_frames = len(ref_img_metas[0])
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = self._decode_init_proposals(img, img_metas,
                                                  ref_img_metas)

        # Default to the raw predictions so that both names are bound on every
        # path, including upsampling enabled with ``seg_preds`` being None.
        scaled_mask_preds = mask_preds
        scaled_seg_preds = seg_preds
        if self.feat_downsample_stride > 1:
            scaled_mask_preds = F.interpolate(
                mask_preds,
                scale_factor=self.feat_downsample_stride,
                mode='bilinear',
                align_corners=False)
            if seg_preds is not None:
                scaled_seg_preds = F.interpolate(
                    seg_preds,
                    scale_factor=self.feat_downsample_stride,
                    mode='bilinear',
                    align_corners=False)

        if self.hard_target:
            gt_masks = [x.bool().float() for x in gt_masks]

        if cls_scores is None:
            detached_cls_scores = [[None] * num_frames] * num_imgs
        else:
            detached_cls_scores = cls_scores.detach()

        sampling_results = []
        for i in range(num_imgs):
            for j in range(num_frames):
                # Predictions are laid out image-major, frame-minor along the
                # first axis.
                flat_idx = i * num_frames + j
                # Keep only the labels whose frame column equals ``j``.
                frame_labels = gt_labels[i][:, 1][gt_labels[i][:, 0] == j]
                assign_result = self.assigner.assign(
                    scaled_mask_preds[flat_idx].detach(),
                    detached_cls_scores[i][j],
                    gt_masks[i][j],
                    frame_labels,
                    ref_img_metas[i][j])
                sampling_result = self.sampler.sample(
                    assign_result,
                    scaled_mask_preds[flat_idx],
                    gt_masks[i][j])
                sampling_results.append(sampling_result)

        mask_targets = self.get_targets(
            sampling_results,
            self.train_cfg,
            True,
            gt_sem_seg=gt_sem_seg,
            gt_sem_cls=gt_sem_cls)

        losses = self.loss(scaled_mask_preds, cls_scores, scaled_seg_preds,
                           proposal_feats, *mask_targets)

        if self.cat_stuff_mask and self.training:
            # Append the channels of ``seg_preds`` and the rows of
            # ``conv_seg.weight`` from ``num_thing_classes`` onwards.
            mask_preds = torch.cat(
                [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
            stuff_kernels = self.conv_seg.weight[
                self.num_thing_classes:].clone()
            stuff_kernels = stuff_kernels[None].expand(
                num_imgs * num_frames, *stuff_kernels.size())
            proposal_feats = torch.cat([proposal_feats, stuff_kernels], dim=1)

        return losses, proposal_feats, x_feats, mask_preds, cls_scores

    def loss(self,
             mask_pred,
             cls_scores,
             seg_preds,
             proposal_feats,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             seg_targets,
             reduction_override=None,
             **kwargs):
        """Compute the losses of the initial (RPN-like) kernel predictions.

        Args:
            mask_pred: Mask predictions; the first two axes are flattened
                together to line up with ``labels`` and the last two are the
                spatial size.
            cls_scores: Classification scores or None. When None, no
                classification loss or accuracy is produced.
            seg_preds: Segmentation predictions or None. When None, no
                segmentation loss is produced.
            proposal_feats: Not used by this method.
            labels: Flat label tensor; values in ``[0, num_classes)`` mark
                positives.
            label_weights: Passed to the classification loss.
            mask_targets: Flat mask targets aligned with ``labels``.
            mask_weights: Not used by this method.
            seg_targets: Segmentation targets, flattened before use.
            reduction_override: Forwarded to the classification loss.
            **kwargs: Ignored.

        Returns:
            dict: Loss name to value. ``loss_rpn_mask`` and ``loss_rpn_dice``
            are always present; ``loss_rpn_rank`` is present whenever a rank
            loss is configured, whether or not positives exist, so the key set
            does not depend on the batch content.
        """
        losses = dict()
        bg_class_ind = self.num_classes
        # note in Sparse R-CNN num_gt == num_pos
        pos_inds = (labels >= 0) & (labels < bg_class_ind)
        num_preds = mask_pred.shape[0] * mask_pred.shape[1]

        if cls_scores is not None:
            num_pos = pos_inds.sum().float()
            avg_factor = reduce_mean(num_pos)
            assert mask_pred.shape[0] == cls_scores.shape[0]
            assert mask_pred.shape[1] == cls_scores.shape[1]
            flat_cls_scores = cls_scores.view(num_preds, -1)
            losses['loss_rpn_cls'] = self.loss_cls(
                flat_cls_scores,
                labels,
                label_weights,
                avg_factor=avg_factor,
                reduction_override=reduction_override)
            losses['rpn_pos_acc'] = accuracy(flat_cls_scores[pos_inds],
                                             labels[pos_inds])

        bool_pos_inds = pos_inds.type(torch.bool)
        # 0~self.num_classes-1 are FG, self.num_classes is BG
        # Mask losses are only computed on the FG (positive) predictions.
        H, W = mask_pred.shape[-2:]
        if pos_inds.any():
            pos_mask_pred = mask_pred.reshape(num_preds, H, W)[bool_pos_inds]
            pos_mask_targets = mask_targets[bool_pos_inds]
            losses['loss_rpn_mask'] = self.loss_mask(pos_mask_pred,
                                                     pos_mask_targets)
            losses['loss_rpn_dice'] = self.loss_dice(pos_mask_pred,
                                                     pos_mask_targets)

            if self.loss_rank is not None:
                batch_size = mask_pred.size(0)
                # Every pixel starts as "ignore" and is then overwritten with
                # the index of a positive prediction whose target covers it.
                rank_target = mask_targets.new_full((batch_size, H, W),
                                                    self.ignore_label,
                                                    dtype=torch.long)
                rank_inds = pos_inds.view(batch_size,
                                          -1).nonzero(as_tuple=False)
                batch_mask_targets = mask_targets.view(batch_size, -1, H,
                                                       W).bool()
                for i in range(batch_size):
                    curr_inds = (rank_inds[:, 0] == i)
                    curr_rank = rank_inds[:, 1][curr_inds]
                    for j in curr_rank:
                        rank_target[i][batch_mask_targets[i][j]] = j
                losses['loss_rpn_rank'] = self.loss_rank(
                    mask_pred, rank_target, ignore_index=self.ignore_label)

        else:
            # No positives: emit zero losses that stay attached to the graph.
            losses['loss_rpn_mask'] = mask_pred.sum() * 0
            losses['loss_rpn_dice'] = mask_pred.sum() * 0
            if self.loss_rank is not None:
                # Same key as in the positive branch above.
                losses['loss_rpn_rank'] = mask_pred.sum() * 0

        if seg_preds is not None:
            cls_channel = seg_preds.shape[1]
            # Move the channel axis last and flatten all pixels.
            flatten_seg = seg_preds.view(-1, cls_channel, H * W).permute(
                0, 2, 1).reshape(-1, cls_channel)
            flatten_seg_target = seg_targets.view(-1)
            if self.loss_seg.use_sigmoid:
                dense_pos = (flatten_seg_target >= 0) & (
                    flatten_seg_target < bg_class_ind)
                # Clamp so the average factor is never zero.
                num_dense_pos = dense_pos.sum().float().clamp(min=1.0)
                losses['loss_rpn_seg'] = self.loss_seg(
                    flatten_seg,
                    flatten_seg_target,
                    avg_factor=num_dense_pos)
            else:
                losses['loss_rpn_seg'] = self.loss_seg(flatten_seg,
                                                       flatten_seg_target)

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):
        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg
        H, W = pos_mask.shape[-2:]
        # original implementation uses new_zeros since BG are set to be 0
        # now use empty & fill because BG cat_id = num_classes,
        # FG cat_id = [0, num_classes-1]
        labels = pos_mask.new_full((num_samples, ),
                                   self.num_classes,
                                   dtype=torch.long)
        label_weights = pos_mask.new_zeros(num_samples)
        mask_targets = pos_mask.new_zeros(num_samples, H, W)
        mask_weights = pos_mask.new_zeros(num_samples, H, W)
        seg_targets = pos_mask.new_full((H, W),
                                        self.num_classes,
                                        dtype=torch.long)

        if gt_sem_cls is not None and gt_sem_seg is not None:
            gt_sem_seg = gt_sem_seg.bool()
            for sem_mask, sem_cls in zip(gt_sem_seg, gt_sem_cls):
                seg_targets[sem_mask] = sem_cls.long()

        if num_pos > 0:
            labels[pos_inds] = pos_gt_labels
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            mask_targets[pos_inds, ...] = pos_gt_mask
            mask_weights[pos_inds, ...] = 1
            for i in range(num_pos):
                seg_targets[pos_gt_mask[i].bool()] = pos_gt_labels[i]

        if num_neg > 0:
            label_weights[neg_inds] = 1.0

        return labels, label_weights, mask_targets, mask_weights, seg_targets

    def get_targets(self,
                    sampling_results,
                    rpn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None):
        """Compute training targets for a list of sampling results.

        Args:
            sampling_results: Sampling results, each exposing ``pos_inds``,
                ``neg_inds``, ``pos_masks``, ``neg_masks``, ``pos_gt_masks``
                and ``pos_gt_labels``.
            rpn_train_cfg: Passed to ``_get_target_single`` as ``cfg``.
            concat: When truthy, per-result targets are merged into single
                tensors (``seg_targets`` is stacked, the others concatenated
                along axis 0); otherwise the per-result collections are
                returned as produced by ``multi_apply``.
            gt_sem_seg: Optional per-result semantic masks. When None, both
                semantic inputs are replaced by lists of None.
            gt_sem_cls: Optional per-result semantic classes.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights,
            seg_targets)``.
        """
        num_results = len(sampling_results)
        field_names = ('pos_inds', 'neg_inds', 'pos_masks', 'neg_masks',
                       'pos_gt_masks', 'pos_gt_labels')
        # One list per field, each holding that field for every result.
        per_field = [[getattr(res, name) for res in sampling_results]
                     for name in field_names]

        if gt_sem_seg is None:
            gt_sem_seg = [None] * num_results
            gt_sem_cls = [None] * num_results

        (labels, label_weights, mask_targets, mask_weights,
         seg_targets) = multi_apply(
             self._get_target_single,
             *per_field,
             gt_sem_seg,
             gt_sem_cls,
             cfg=rpn_train_cfg)

        if not concat:
            return (labels, label_weights, mask_targets, mask_weights,
                    seg_targets)

        return (torch.cat(labels, 0), torch.cat(label_weights, 0),
                torch.cat(mask_targets, 0), torch.cat(mask_weights, 0),
                torch.stack(seg_targets, 0))

    def simple_test_rpn(self, img, img_metas, ref_img_metas):
        """Decode the initial proposals at test time.

        Returns:
            The output of ``_decode_init_proposals`` for the same arguments.
        """
        decoded = self._decode_init_proposals(img, img_metas, ref_img_metas)
        return decoded

    def forward_dummy(self, img, img_metas, ref_img_metas):
        """Run the proposal decoding only, e.g. for FLOPs counting.

        Returns:
            The output of ``_decode_init_proposals`` for the same arguments.
        """
        outputs = self._decode_init_proposals(img, img_metas, ref_img_metas)
        return outputs


================================================
FILE: knet_vis/tracker/kernel_iter_head.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from mmdet.core import build_assigner, build_sampler
from mmdet.datasets.coco_panoptic import INSTANCE_OFFSET
from mmdet.models.builder import HEADS, build_head
from mmdet.models.roi_heads import BaseRoIHead

from knet_vis.det.mask_pseudo_sampler import MaskPseudoSampler


@HEADS.register_module()
class KernelIterHeadVideo(BaseRoIHead):
    def __init__(self,
                 num_stages=6,
                 recursive=False,
                 assign_stages=5,
                 stage_loss_weights=(1, 1, 1, 1, 1, 1),
                 proposal_feature_channel=256,
                 merge_cls_scores=False,
                 do_panoptic=False,
                 post_assign=False,
                 hard_target=False,
                 num_proposals=100,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 thing_label_in_seg=0,
                 mask_head=dict(
                     type='KernelUpdateHead',
                     num_classes=80,
                     num_fcs=2,
                     num_heads=8,
                     num_cls_fcs=1,
                     num_reg_fcs=3,
                     feedforward_channels=2048,
                     hidden_channels=256,
                     dropout=0.0,
                     roi_feat_size=7,
                     ffn_act_cfg=dict(type='ReLU', inplace=True)),
                 mask_out_stride=4,
                 train_cfg=None,
                 test_cfg=None,
                 **kwargs):
        """Store the head configuration and build the per-stage mask heads.

        Args:
            num_stages (int): Number of iterative stages; must equal
                ``len(stage_loss_weights)``.
            recursive (bool): Read by ``init_mask_head``; when true every
                stage reuses the first head.
            assign_stages (int): Stages with an index below this value
                compute fresh assignments in ``forward_train``.
            stage_loss_weights (Sequence[float]): Per-stage loss multipliers.
            num_proposals (int): Number of leading predictions used for
                assignment and for the thing part in ``get_panoptic``.
            num_thing_classes (int), num_stuff_classes (int): Their sum is
                stored as ``self.num_classes``.
            mask_head (dict | list[dict]): Config(s) passed to the parent
                constructor.
            train_cfg, test_cfg: Passed to the parent constructor.
            **kwargs: Forwarded to the parent constructor.

        The remaining arguments are only stored as attributes of the same
        name by this constructor.
        """
        assert mask_head is not None
        assert len(stage_loss_weights) == num_stages

        # NOTE(review): the attributes are assigned before the parent
        # constructor runs; presumably the parent calls ``init_mask_head`` /
        # ``init_assigner_sampler``, which read some of them - confirm
        # against BaseRoIHead before reordering.

        # stage layout
        self.num_stages = num_stages
        self.recursive = recursive
        self.assign_stages = assign_stages
        self.post_assign = post_assign
        self.stage_loss_weights = stage_loss_weights

        # class bookkeeping
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.num_classes = num_thing_classes + num_stuff_classes
        self.thing_label_in_seg = thing_label_in_seg
        self.merge_cls_scores = merge_cls_scores
        self.do_panoptic = do_panoptic

        # proposal / mask geometry
        self.num_proposals = num_proposals
        self.proposal_feature_channel = proposal_feature_channel
        self.mask_out_stride = mask_out_stride
        self.mask_assign_stride = mask_assign_stride
        self.hard_target = hard_target

        super().__init__(
            mask_head=mask_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            **kwargs)

        # train_cfg would be None when run the test.py
        if train_cfg is None:
            return
        for stage in range(num_stages):
            sampler = self.mask_sampler[stage]
            assert isinstance(sampler, MaskPseudoSampler), \
                'Sparse Mask only support `MaskPseudoSampler`'

    def init_bbox_head(self, mask_roi_extractor, mask_head):
        """Intentionally do nothing: no box head or box RoI extractor is built.

        Args:
            mask_roi_extractor (dict): Ignored.
            mask_head (dict): Ignored.
        """
        return None

    def init_assigner_sampler(self):
        """Build one mask assigner and one mask sampler per ``train_cfg`` entry.

        ``self.mask_assigner`` and ``self.mask_sampler`` are always reset to
        empty lists; they stay empty when ``self.train_cfg`` is ``None``.
        """
        self.mask_assigner = []
        self.mask_sampler = []
        if self.train_cfg is None:
            return

        for stage_idx, stage_cfg in enumerate(self.train_cfg):
            assigner = build_assigner(stage_cfg.assigner)
            self.mask_assigner.append(assigner)
            # set before the sampler is built, which receives ``self`` as
            # its ``context``
            self.current_stage = stage_idx
            sampler = build_sampler(stage_cfg.sampler, context=self)
            self.mask_sampler.append(sampler)

    def init_weights(self):
        """Call ``init_weights()`` on the mask head of each of the
        ``num_stages`` stages, in stage order."""
        for stage_idx in range(self.num_stages):
            stage_head = self.mask_head[stage_idx]
            stage_head.init_weights()

    def init_mask_head(self, mask_roi_extractor, mask_head):
        """Build ``self.mask_head``, an ``nn.ModuleList`` with one head per stage.

        Args:
            mask_roi_extractor (dict): Not used by this method.
            mask_head (dict | list[dict]): A single config, reused for every
                stage, or a list holding exactly ``self.num_stages`` configs.
        """
        self.mask_head = nn.ModuleList()

        if isinstance(mask_head, list):
            stage_cfgs = mask_head
        else:
            # the same config object is referenced once per stage
            stage_cfgs = [mask_head] * self.num_stages
        assert len(stage_cfgs) == self.num_stages

        for stage_cfg in stage_cfgs:
            built_head = build_head(stage_cfg)
            self.mask_head.append(built_head)

        if not self.recursive:
            return
        # recursive mode: every slot points at the head built for stage 0
        for stage_idx in range(self.num_stages):
            self.mask_head[stage_idx] = self.mask_head[0]

    def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas=None):
        mask_head = self.mask_head[stage]
        cls_score, mask_preds, object_feats = mask_head(x, object_feats, mask_preds, img_metas=img_metas)
        if mask_head.mask_upsample_stride > 1 and (stage == self.num_stages - 1 or self.training):
            scaled_mask_preds = F.interpolate(
                mask_preds,
                scale_factor=mask_head.mask_upsample_stride,
                align_corners=False,
                mode='bilinear'
            )
        else:
            scaled_mask_preds = mask_preds

        mask_results = dict(
            cls_score=cls_score,
            mask_preds=mask_preds,
            scaled_mask_preds=scaled_mask_preds,
            object_feats=object_feats
        )
        return mask_results

    def forward_train(self,
                      x,
                      proposal_feats,
                      mask_preds,
                      cls_score,
                      ref_img_metas,
                      gt_masks,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      imgs_whwh=None,
                      gt_bboxes=None,
                      gt_sem_seg=None,
                      gt_sem_cls=None):
        """Run every stage, compute its loss and collect last-stage features.

        Predictions are stored frame-major: the entry of image ``i`` and
        frame ``j`` sits at flat index ``i * num_frames + j``.

        Args:
            x (Tensor): Feature map, unpacked below as
                ``(num_imgs * num_frames, C, H, W)``.
            proposal_feats (Tensor): Initial kernels fed to stage 0.
            mask_preds (Tensor): Initial mask predictions fed to stage 0.
            cls_score (Tensor | None): Initial class scores used for the
                first assignment; may be ``None``.
            ref_img_metas (list[list]): One list of frame metas per image;
                only its lengths are read here.
            gt_masks (list): ``gt_masks[i][j]`` is passed as the ground-truth
                masks of frame ``j`` of image ``i``.
            gt_labels (list[Tensor]): ``gt_labels[i]`` is indexed as a 2-D
                tensor whose column 0 is compared with the frame index and
                whose column 1 is passed on as the label.
            imgs_whwh: Forwarded to the stage loss.
            gt_sem_seg, gt_sem_cls: Forwarded to ``get_targets``.
            gt_bboxes_ignore, gt_bboxes: Not used by this method.

        Returns:
            tuple[dict, dict]: The weighted losses keyed
            ``'s{stage}_{name}'``, and a dict with the last stage's
            ``obj_feats``, ``x_feats``, ``cls_scores`` and ``masks``
            reshaped to lead with ``(num_imgs, num_frames, ...)``.
        """
        num_imgs = len(ref_img_metas)
        num_frames = len(ref_img_metas[0])

        # Predictions used for assignment are detached so that matching does
        # not back-propagate.
        init_stride = self.mask_head[0].mask_upsample_stride
        if init_stride > 1:
            prev_mask_preds = F.interpolate(
                mask_preds.detach(),
                scale_factor=init_stride,
                mode='bilinear',
                align_corners=False)
        else:
            prev_mask_preds = mask_preds.detach()
        prev_cls_score = cls_score.detach() if cls_score is not None else None

        if self.hard_target:
            # binarise the targets (any non-zero value becomes 1.0)
            gt_masks = [masks.bool().float() for masks in gt_masks]

        object_feats = proposal_feats
        all_stage_loss = {}
        assign_results = []
        for stage in range(self.num_stages):
            mask_results = self._mask_forward(
                stage, x, object_feats, mask_preds, img_metas=None)
            mask_preds = mask_results['mask_preds']
            scaled_mask_preds = mask_results['scaled_mask_preds']
            cls_score = mask_results['cls_score']
            object_feats = mask_results['object_feats']

            if self.post_assign:
                # assign against this stage's own output
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

            # Stages at or beyond ``assign_stages`` reuse the assignments
            # computed by the last assigning stage.
            reassign = stage < self.assign_stages
            if reassign:
                assign_results = []

            sampling_results = []
            for i in range(num_imgs):
                for j in range(num_frames):
                    flat_idx = i * num_frames + j
                    if reassign:
                        mask_for_assign = \
                            prev_mask_preds[flat_idx][:self.num_proposals]
                        if prev_cls_score is not None:
                            cls_for_assign = prev_cls_score[flat_idx][
                                :self.num_proposals, :self.num_thing_classes]
                        else:
                            cls_for_assign = None
                        labels_i = gt_labels[i]
                        frame_labels = labels_i[:, 1][labels_i[:, 0] == j]
                        assign_result = self.mask_assigner[stage].assign(
                            mask_for_assign, cls_for_assign, gt_masks[i][j],
                            frame_labels, img_meta=None)
                        assign_results.append(assign_result)
                    sampling_result = self.mask_sampler[stage].sample(
                        assign_results[flat_idx],
                        scaled_mask_preds[flat_idx],
                        gt_masks[i][j])
                    sampling_results.append(sampling_result)

            mask_targets = self.mask_head[stage].get_targets(
                sampling_results,
                self.train_cfg[stage],
                True,
                gt_sem_seg=gt_sem_seg,
                gt_sem_cls=gt_sem_cls)

            single_stage_loss = self.mask_head[stage].loss(
                object_feats,
                cls_score,
                scaled_mask_preds,
                *mask_targets,
                imgs_whwh=imgs_whwh)
            stage_weight = self.stage_loss_weights[stage]
            for key, value in single_stage_loss.items():
                all_stage_loss[f's{stage}_{key}'] = value * stage_weight

            if not self.post_assign:
                # the next stage assigns against this stage's output
                prev_mask_preds = scaled_mask_preds.detach()
                prev_cls_score = cls_score.detach()

        bs_nf, num_query, c, ks1, ks2 = object_feats.size()
        bs_nf2, c2, h, w = x.size()
        assert ks1 == ks2
        assert bs_nf == bs_nf2
        assert bs_nf == num_frames * num_imgs
        assert c == c2
        features = {
            "obj_feats": object_feats.reshape(
                (num_imgs, num_frames, num_query, c, ks1, ks2)),
            "x_feats": x.reshape((num_imgs, num_frames, c, h, w)),
            "cls_scores": cls_score.reshape(
                (num_imgs, num_frames, num_query, self.num_classes)),
            "masks": mask_preds.reshape(
                (num_imgs, num_frames, num_query, h, w)),
        }
        return all_stage_loss, features

    def simple_test(self,
                    x,
                    proposal_feats,
                    mask_preds,
                    cls_score,
                    img_metas,
                    ref_img_metas,
                    imgs_whwh=None,
                    rescale=False):
        """Run all stages at test time and decode per-frame instance results.

        Args:
            x (Tensor): Feature map, unpacked below as
                ``(num_imgs * num_frames, C, H, W)``.
            proposal_feats (Tensor): Initial kernels fed to stage 0.
            mask_preds (Tensor): Initial mask predictions fed to stage 0.
            cls_score: Overwritten by the stage outputs before it is read.
            img_metas (list): ``img_metas[img_id]`` is handed to
                ``get_seg_masks`` for every frame of that image.
            ref_img_metas (list[list]): Only its lengths are read, giving
                ``num_imgs`` and ``num_frames``.
            imgs_whwh, rescale: Not used by this method.

        Returns:
            tuple[list, dict]: One ``get_seg_masks`` result per frame (frames
            of image 0 first), and the last stage's ``obj_feats``,
            ``x_feats``, ``cls_scores`` and ``masks`` reshaped to lead with
            ``(num_imgs, num_frames, ...)``.

        Raises:
            NotImplementedError: If ``self.do_panoptic`` is set; the check
                happens after the stages have been run.
        """
        num_imgs = len(ref_img_metas)
        num_frames = len(ref_img_metas[0])

        object_feats = proposal_feats
        for stage in range(self.num_stages):
            stage_out = self._mask_forward(stage, x, object_feats, mask_preds)
            object_feats = stage_out['object_feats']
            cls_score = stage_out['cls_score']
            mask_preds = stage_out['mask_preds']
            scaled_mask_preds = stage_out['scaled_mask_preds']

        num_classes = self.mask_head[-1].num_classes

        # turn logits into scores; the softmax variant drops its last column
        if self.mask_head[-1].loss_cls.use_sigmoid:
            cls_score = cls_score.sigmoid()
        else:
            cls_score = cls_score.softmax(-1)[..., :-1]

        bs_nf, num_query, c, ks1, ks2 = object_feats.size()
        bs_nf2, c2, h, w = x.size()
        assert ks1 == ks2
        assert bs_nf == bs_nf2
        assert bs_nf == num_frames * num_imgs
        assert c == c2
        features = dict(
            obj_feats=object_feats.reshape(
                (num_imgs, num_frames, num_query, c, ks1, ks2)),
            x_feats=x.reshape((num_imgs, num_frames, c, h, w)),
            cls_scores=cls_score.reshape(
                (num_imgs, num_frames, num_query, self.num_classes)),
            masks=mask_preds.reshape((num_imgs, num_frames, num_query, h, w)),
        )

        if self.do_panoptic:
            raise NotImplementedError

        results = []
        for img_id in range(num_imgs):
            for frame_id in range(num_frames):
                flat_idx = img_id * num_frames + frame_id
                # Top-k runs over the flattened (query, class) grid, so one
                # query can be picked several times with different labels.
                flat_scores = cls_score[flat_idx].flatten(0, 1)
                scores_per_img, topk_indices = flat_scores.topk(
                    self.test_cfg.max_per_img, sorted=True)
                # Use the following when torch >= 1.9.0
                # mask_indices = torch.div(topk_indices, num_classes, rounding_mode='floor')
                mask_indices = topk_indices // num_classes
                labels_per_img = topk_indices % num_classes
                masks_per_img = scaled_mask_preds[flat_idx][mask_indices]
                results.append(self.mask_head[-1].get_seg_masks(
                    masks_per_img, labels_per_img, scores_per_img,
                    self.test_cfg, img_metas[img_id]))
        return results, features

    def aug_test(self, features, proposal_list, img_metas, rescale=False):
        """Test-time augmentation is unsupported; always raises.

        Raises:
            NotImplementedError: Unconditionally.
        """
        message = 'SparseMask does not support `aug_test`'
        raise NotImplementedError(message)

    def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
        """Dummy forward function when do the flops computing.

        The initial masks are the batched matrix product of
        ``proposal_feats`` with the spatially flattened ``x``.
        ``proposal_boxes`` is not used.

        NOTE(review): every stage receives the same initial
        ``proposal_feats`` and ``mask_preds``; stage outputs are not fed to
        the following stage here.

        Returns:
            list[dict]: The ``_mask_forward`` result of each stage.
        """
        batch = len(img_metas)
        num_kernels = proposal_feats.size(1)
        channels, height, width = x.shape[-3:]

        flat_x = x.view(batch, channels, -1)
        mask_preds = proposal_feats.bmm(flat_x).view(
            batch, num_kernels, height, width)
        object_feats = proposal_feats

        return [
            self._mask_forward(stage, x, object_feats, mask_preds, img_metas)
            for stage in range(self.num_stages)
        ]

    def get_panoptic(self, cls_scores, mask_preds, test_cfg, img_meta):
        """Combine thing and stuff predictions into one panoptic map.

        The first ``self.num_proposals`` rows of ``cls_scores`` are treated
        as thing predictions: each takes its best score over the thing
        classes. The remaining rows are treated as stuff predictions: row
        ``r`` takes the score of stuff class ``r`` (the diagonal of the stuff
        block).

        Returns:
            dict: ``pan_results`` holding the output of
            ``merge_stuff_thing``.
        """
        num_things = self.num_thing_classes

        thing_rows = cls_scores[:self.num_proposals]
        thing_scores, thing_labels = thing_rows[:, :num_things].max(dim=1)

        stuff_rows = cls_scores[self.num_proposals:]
        stuff_scores = stuff_rows[:, num_things:].diag()
        # stuff labels continue right after the thing labels
        stuff_labels = torch.arange(
            num_things, num_things + self.num_stuff_classes)
        stuff_labels = stuff_labels.to(thing_labels.device)

        # resize mask predictions back
        total_masks = self.mask_head[-1].rescale_masks(mask_preds, img_meta)
        total_scores = torch.cat([thing_scores, stuff_scores], dim=0)
        total_labels = torch.cat([thing_labels, stuff_labels], dim=0)

        merged = self.merge_stuff_thing(
            total_masks, total_labels, total_scores,
            test_cfg.merge_stuff_thing)
        return {'pan_results': merged}

    def merge_stuff_thing(self,
                          total_masks,
                          total_labels,
                          total_scores,
                          merge_cfg=None):
        """Paint score-ranked segments into a single panoptic label map.

        Each pixel belongs to the segment with the highest
        ``score * mask`` value. Segments are visited from highest to lowest
        score; a segment is written only if it owns at least one pixel, has
        at least one pixel with mask value >= 0.5, and the ratio of the two
        areas reaches ``merge_cfg.overlap_thr``. Thing segments scoring below
        ``merge_cfg.instance_score_thr`` are dropped first.

        Args:
            total_masks (Tensor): Masks, indexed per segment on dim 0.
            total_labels (Tensor): Label per segment.
            total_scores (Tensor): Score per segment.
            merge_cfg: Must provide ``instance_score_thr`` and
                ``overlap_thr``; the ``None`` default is not usable.

        Returns:
            numpy.ndarray: ``(H, W)`` int64 map. Written pixels hold
            ``label + segment_id * INSTANCE_OFFSET``; all others hold
            ``self.num_classes``.
        """
        height, width = total_masks.shape[-2:]
        panoptic_seg = total_masks.new_full(
            (height, width), self.num_classes, dtype=torch.long)

        weighted_masks = total_scores.view(-1, 1, 1) * total_masks
        owner_ids = weighted_masks.argmax(0)

        # sort instance outputs by scores
        ranking = torch.argsort(-total_scores)
        next_segment_id = 0

        for k in ranking:
            label = total_labels[k]
            is_thing = label.item() < self.num_thing_classes
            if is_thing and total_scores[k] < merge_cfg.instance_score_thr:
                continue

            region = owner_ids == k
            owned_area = region.sum().item()
            confident_area = (total_masks[k] >= 0.5).sum().item()
            if owned_area <= 0 or confident_area <= 0:
                continue
            if owned_area / confident_area < merge_cfg.overlap_thr:
                continue

            panoptic_seg[region] = label + next_segment_id * INSTANCE_OFFSET
            next_segment_id += 1

        return panoptic_seg.cpu().numpy()


================================================
FILE: knet_vis/tracker/kernel_update_head.py
================================================
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (ConvModule, bias_init_with_prob, build_activation_layer,
                      build_norm_layer)
from mmcv.cnn.bricks.transformer import (FFN, MultiheadAttention,
                                         build_transformer_layer)
from mmcv.runner import force_fp32

from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.dense_heads.atss_head import reduce_mean
from mmdet.models.losses import accuracy
from mmdet.utils import get_root_logger

from mmtrack.transform import outs2results

@HEADS.register_module()
class KernelUpdateHeadVideo(nn.Module):

    def __init__(self,
                 with_cls=True,
                 num_proposals=100,
                 num_classes=80,
                 num_ffn_fcs=2,
                 num_heads=8,
                 num_cls_fcs=1,
                 num_mask_fcs=3,
                 feedforward_channels=2048,
                 in_channels=256,
                 out_channels=256,
                 dropout=0.0,
                 mask_thr=0.5,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_act_cfg=dict(type='ReLU', inplace=True),
                 conv_kernel_size=3,
                 feat_transform_cfg=None,
                 hard_mask_thr=0.5,
                 kernel_init=False,
                 with_ffn=True,
                 mask_out_stride=4,
                 relative_coors=False,
                 relative_coors_off=False,
                 feat_gather_stride=1,
                 mask_transform_stride=1,
                 mask_upsample_stride=1,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 ignore_label=255,
                 thing_label_in_seg=0,
                 # query fusion
                 query_merge_method='mean',
                 kernel_updator_cfg=dict(
                     type='DynamicConv',
                     in_channels=256,
                     feat_channels=64,
                     out_channels=256,
                     input_feat_shape=1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN')),
                 loss_rank=None,
                 loss_mask=dict(
                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
                 loss_dice=dict(type='DiceLoss', loss_weight=3.0),
                 loss_cls=dict(
                     type='FocalLoss',
                     use_sigmoid=True,
                     gamma=2.0,
                     alpha=0.25,
                     loss_weight=2.0)):
        """Build the losses and layers of the video kernel-update head.

        Args:
            with_cls (bool): Build the classification branch (``cls_fcs``,
                ``fc_cls``) and, for the attention merge modes, the
                query-merge layers.
            num_proposals (int): Number of kernels ``forward`` expects.
            num_classes (int): Output size of ``fc_cls`` (plus one extra
                output when the classification loss is not sigmoid based).
            num_ffn_fcs (int): Number of FCs in the FFN.
            num_heads (int): Heads of the kernel self-attention.
            num_cls_fcs (int), num_mask_fcs (int): Number of
                Linear-LayerNorm-activation triples in the classification and
                mask branches.
            feedforward_channels (int): Hidden width of the FFN.
            in_channels (int), out_channels (int): Width of the branch layers
                and output width of ``fc_mask``.
            dropout (float): Dropout of the self-attention and the FFN.
            act_cfg (dict), ffn_act_cfg (dict): Activation configs of the
                branches and of the FFN.
            conv_kernel_size (int): Spatial size of each kernel; the
                self-attention works on ``in_channels * conv_kernel_size**2``
                features.
            feat_transform_cfg (dict | None): Extra ``ConvModule`` arguments
                for an optional feature transform; an optional
                ``kernel_size`` entry (default 1) selects its kernel size.
                The dict passed in is left unmodified.
            with_ffn (bool): Build the FFN and its norm.
            query_merge_method (str): ``'attention'`` and ``'attention_pos'``
                add the query-merge attention layers (only when ``with_cls``).
            kernel_updator_cfg (dict): Config of the kernel update layer.
            loss_rank (dict | None), loss_mask (dict), loss_dice (dict),
            loss_cls (dict): Loss configs; ``loss_rank`` is optional.

        The remaining arguments are only stored as attributes of the same
        name by this constructor.
        """
        super().__init__()
        self.num_proposals = num_proposals
        self.num_classes = num_classes
        self.loss_cls = build_loss(loss_cls)
        self.loss_mask = build_loss(loss_mask)
        self.loss_dice = build_loss(loss_dice)
        if loss_rank is not None:
            self.loss_rank = build_loss(loss_rank)
        else:
            self.loss_rank = loss_rank

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.mask_thr = mask_thr
        self.fp16_enabled = False
        self.dropout = dropout

        self.num_heads = num_heads
        self.hard_mask_thr = hard_mask_thr
        self.kernel_init = kernel_init
        self.with_ffn = with_ffn
        self.mask_out_stride = mask_out_stride
        self.relative_coors = relative_coors
        self.relative_coors_off = relative_coors_off
        self.conv_kernel_size = conv_kernel_size
        self.feat_gather_stride = feat_gather_stride
        self.mask_transform_stride = mask_transform_stride
        self.mask_upsample_stride = mask_upsample_stride

        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.ignore_label = ignore_label
        self.thing_label_in_seg = thing_label_in_seg

        self.attention = MultiheadAttention(in_channels * conv_kernel_size**2,
                                            num_heads, dropout)
        self.attention_norm = build_norm_layer(
            dict(type='LN'), in_channels * conv_kernel_size**2)[1]

        self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)

        if feat_transform_cfg is not None:
            # Work on a copy: popping from the caller's dict would strip
            # ``kernel_size`` for every other head built from the same
            # config object.
            feat_transform_cfg = feat_transform_cfg.copy()
            kernel_size = feat_transform_cfg.pop('kernel_size', 1)
            self.feat_transform = ConvModule(
                in_channels,
                in_channels,
                kernel_size,
                stride=feat_gather_stride,
                padding=int(feat_gather_stride // 2),
                **feat_transform_cfg)
        else:
            self.feat_transform = None

        if self.with_ffn:
            self.ffn = FFN(
                in_channels,
                feedforward_channels,
                num_ffn_fcs,
                act_cfg=ffn_act_cfg,
                ffn_drop=dropout)
            self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

        self.with_cls = with_cls
        if self.with_cls:
            self.cls_fcs = nn.ModuleList()
            for _ in range(num_cls_fcs):
                self.cls_fcs.append(
                    nn.Linear(in_channels, in_channels, bias=False))
                self.cls_fcs.append(
                    build_norm_layer(dict(type='LN'), in_channels)[1])
                self.cls_fcs.append(build_activation_layer(act_cfg))

            if self.loss_cls.use_sigmoid:
                self.fc_cls = nn.Linear(in_channels, self.num_classes)
            else:
                # one extra output for the non-sigmoid loss
                self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)

        # query fusion: both attention variants use the same layers; they
        # differ only in how ``forward`` calls the attention.
        self.query_merge_method = query_merge_method
        if (self.query_merge_method in ('attention', 'attention_pos')
                and self.with_cls):
            _num_head = 8
            _drop_out = 0.
            self.query_merge_attn = MultiheadAttention(
                self.in_channels, _num_head, _drop_out, batch_first=True)
            self.query_merge_norm = build_norm_layer(
                dict(type='LN'), self.in_channels)[1]
            self.query_merge_ffn = FFN(
                self.in_channels,
                self.in_channels * 8,
                num_ffn_fcs=2,
                act_cfg=dict(type='ReLU', inplace=True),
                ffn_drop=0.)
            self.query_merge_ffn_norm = build_norm_layer(
                dict(type='LN'), self.in_channels)[1]

        self.mask_fcs = nn.ModuleList()
        for _ in range(num_mask_fcs):
            self.mask_fcs.append(
                nn.Linear(in_channels, in_channels, bias=False))
            self.mask_fcs.append(
                build_norm_layer(dict(type='LN'), in_channels)[1])
            self.mask_fcs.append(build_activation_layer(act_cfg))

        self.fc_mask = nn.Linear(in_channels, out_channels)

    def init_weights(self):
        """Use xavier initialization for all weight parameter and set
        classification head bias as a specific value when use focal loss.

        Parameters with at most one dimension (biases, layer-norm weights)
        keep their default initialization. The classification bias is only
        touched when the classification branch exists (``with_cls``), since
        ``fc_cls`` is not built otherwise. With ``kernel_init`` the weight of
        ``fc_mask`` is re-drawn from a normal distribution with std 0.01.
        """
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

        if self.with_cls and self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            nn.init.constant_(self.fc_cls.bias, bias_init)

        if self.kernel_init:
            logger = get_root_logger()
            logger.info(
                'mask kernel in mask head is normal initialized by std 0.01')
            nn.init.normal_(self.fc_mask.weight, mean=0, std=0.01)

    def forward(self,
                x,
                proposal_feat,
                mask_preds,
                prev_cls_score=None,
                mask_shape=None,
                img_metas=None,
                pos=None):
        """Update the kernels from mask-gathered features and predict masks.

        Two modes are selected by the rank of ``proposal_feat``:

        * rank 6, per-frame kernels, leading dims
          ``(N, num_frames, num_proposals)``: requires ``with_cls=False``;
          no class score is produced.
        * any other rank, clip-level kernels, leading dims
          ``(N, num_proposals)``: requires ``with_cls=True``; the per-frame
          gathered features are merged over frames according to
          ``query_merge_method``.

        Args:
            x (Tensor): Features of shape ``(N, num_frames, C, H, W)``.
            proposal_feat (Tensor): Kernels, see above.
            mask_preds (Tensor): Masks of shape
                ``(N, num_frames, num_proposals, h, w)``; resized to
                ``(H, W)`` when the resolutions differ.
            prev_cls_score, img_metas: Not used by this method.
            mask_shape (tuple | None): Only ``None`` or a shape whose first
                entry equals ``H`` is supported.
            pos (Tensor | None): Positional encoding, read only by the
                ``'attention_pos'`` merge method.

        Returns:
            tuple: ``(cls_score, new_mask_preds, obj_feat)`` where
            ``new_mask_preds`` has shape
            ``(N, num_frames, num_proposals, H, W)``. In clip-level mode
            ``cls_score`` is ``(N, num_proposals, -1)`` and ``obj_feat`` is
            ``(N, num_proposals, in_channels, K, K)``; in per-frame mode
            ``cls_score`` is ``None`` and ``obj_feat`` is
            ``(N, num_frames, num_proposals, in_channels, K, K)``.

        Raises:
            NotImplementedError: For ``mask_transform_stride == 2``, for a
                ``mask_shape`` that requires resizing, or for an unknown
                ``query_merge_method``.
        """
        if len(proposal_feat.size()) == 6:
            assert not self.with_cls
            is_gather_query = False
            N, _, num_proposals = proposal_feat.shape[:3]
        else:
            assert self.with_cls
            is_gather_query = True
            N, num_proposals = proposal_feat.shape[:2]
        assert self.num_proposals == num_proposals
        _, num_frames, C, H, W = x.size()
        if self.feat_transform is not None:
            x = self.feat_transform(
                x.reshape((N * num_frames, C, H, W))).reshape(
                    (N, num_frames, C, H, W))

        mask_h, mask_w = mask_preds.shape[-2:]
        if mask_h != H or mask_w != W:
            # Fold the frame axis into the batch so that the 4-D bilinear
            # resize applies, then restore the 5-D layout.
            gather_mask = F.interpolate(
                mask_preds.reshape(
                    (N * num_frames, num_proposals, mask_h, mask_w)),
                (H, W), align_corners=False, mode='bilinear').reshape(
                    (N, num_frames, num_proposals, H, W))
        else:
            gather_mask = mask_preds

        # hard (binary) masks select the pixels each kernel gathers from
        sigmoid_masks = gather_mask.sigmoid()
        nonzero_inds = sigmoid_masks > self.hard_mask_thr
        sigmoid_masks = nonzero_inds.float()

        # einsum is faster than bmm by 30%
        # per-frame gathered features: [B, F, N, C]
        x_feat = torch.einsum('bfnhw,bfchw->bfnc', sigmoid_masks, x)
        if is_gather_query:
            if self.query_merge_method == 'mean':
                x_feat = x_feat.mean(1)
            elif self.query_merge_method in ('attention', 'attention_pos'):
                # the kernels attend over the gathered features of all frames
                x_feat = x_feat.reshape(
                    (N, num_frames * num_proposals, self.in_channels))
                assert proposal_feat.size()[-2:] == (1, 1), \
                    "Only supporting kernel size = 1"
                init_query = proposal_feat.reshape(
                    N, num_proposals, self.in_channels).detach()
                if self.query_merge_method == 'attention_pos':
                    query_pos = pos.repeat(N, 1, 1)
                    key_pos = query_pos.repeat(1, num_frames, 1)
                    x_feat = self.query_merge_attn(
                        query=init_query, key=x_feat, value=x_feat,
                        query_pos=query_pos, key_pos=key_pos)
                else:
                    x_feat = self.query_merge_attn(
                        query=init_query, key=x_feat, value=x_feat)
                x_feat = self.query_merge_norm(x_feat)
                x_feat = self.query_merge_ffn_norm(
                    self.query_merge_ffn(x_feat))
            else:
                raise NotImplementedError

        # obj_feat in shape [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C]
        if is_gather_query:
            proposal_feat = proposal_feat.reshape(
                N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
            obj_feat = self.kernel_update_conv(x_feat, proposal_feat)
        else:
            proposal_feat = proposal_feat.reshape(
                N * num_frames, num_proposals, self.in_channels,
                -1).permute(0, 1, 3, 2)
            obj_feat = self.kernel_update_conv(
                x_feat.reshape(N * num_frames, num_proposals, C),
                proposal_feat)
            # from here on every frame is treated as its own batch entry
            N *= num_frames

        # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C]
        obj_feat = obj_feat.reshape(N, num_proposals, -1).permute(1, 0, 2)
        obj_feat = self.attention_norm(self.attention(obj_feat))
        # [N, B, K*K*C] -> [B, N, K*K*C]
        obj_feat = obj_feat.permute(1, 0, 2)

        # obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
        obj_feat = obj_feat.reshape(N, num_proposals, -1, self.in_channels)

        # FFN
        if self.with_ffn:
            obj_feat = self.ffn_norm(self.ffn(obj_feat))

        mask_feat = obj_feat

        if is_gather_query:
            cls_feat = obj_feat.sum(-2)
            for cls_layer in self.cls_fcs:
                cls_feat = cls_layer(cls_feat)
            cls_score = self.fc_cls(cls_feat).view(N, num_proposals, -1)
        else:
            cls_score = None

        for reg_layer in self.mask_fcs:
            mask_feat = reg_layer(mask_feat)
        # [B, N, K*K, C] -> [B, N, C, K*K]
        mask_feat = self.fc_mask(mask_feat).permute(0, 1, 3, 2)

        if self.mask_transform_stride == 2:
            # not supported for the 5-D video features
            raise NotImplementedError
        mask_x = x
        # group conv is 5x faster than unfold and uses about 1/5 memory
        # Group conv vs. unfold vs. concat batch, 2.9ms :13.5ms :3.8ms
        # Group conv vs. unfold vs. concat batch, 278 : 1420 : 369
        # [B, N, C, K*K] -> [B, N, C, K, K]
        mask_feat = mask_feat.reshape(N, num_proposals, C,
                                      self.conv_kernel_size,
                                      self.conv_kernel_size)
        conv_padding = int(self.conv_kernel_size // 2)
        if is_gather_query:
            # the frames of a clip act as the conv batch: all of them are
            # convolved with the same clip-level kernels
            new_mask_preds = [
                F.conv2d(mask_x[i], mask_feat[i], padding=conv_padding)
                for i in range(N)
            ]
            new_mask_preds = torch.stack(new_mask_preds, dim=0)
            assert new_mask_preds.size() == (N, num_frames, num_proposals, H, W)
        else:
            N = N // num_frames
            new_mask_preds = []
            for i in range(N):
                for j in range(num_frames):
                    # every frame is convolved with its own kernels
                    new_mask_preds.append(
                        F.conv2d(
                            mask_x[i][j][None],
                            mask_feat[i * num_frames + j],
                            padding=conv_padding))
            new_mask_preds = torch.cat(new_mask_preds, dim=0)
            new_mask_preds = new_mask_preds.reshape(
                N, num_frames, num_proposals, H, W)
            assert new_mask_preds.size() == (N, num_frames, num_proposals, H, W)

        if mask_shape is not None and mask_shape[0] != H:
            # resizing the output masks is not supported
            raise NotImplementedError

        if is_gather_query:
            return cls_score, new_mask_preds, obj_feat.permute(
                0, 1, 3, 2).reshape(N, num_proposals, self.in_channels,
                                    self.conv_kernel_size,
                                    self.conv_kernel_size)
        return None, new_mask_preds, obj_feat.permute(0, 1, 3, 2).reshape(
            N, num_frames, num_proposals, self.in_channels,
            self.conv_kernel_size, self.conv_kernel_size)

    @force_fp32(apply_to=('cls_score', 'mask_pred'))
    def loss(self,
             object_feats,
             cls_score,
             mask_pred,
             labels,
             label_weights,
             mask_targets,
             mask_weights,
             imgs_whwh=None,
             reduction_override=None,
             **kwargs):
        """Compute classification, mask, dice and optional rank losses.

        Args:
            object_feats: Not used by this method.
            cls_score: Classification scores, or None to skip the
                classification terms. Its first two dimensions must equal
                those of ``mask_pred``.
            mask_pred: Mask predictions. The first two dimensions are
                flattened into one prediction axis; the last two are the
                spatial size.
            labels: Labels indexed like the flattened predictions. Values
                in ``[0, self.num_classes)`` mark positives.
            label_weights: Weights forwarded to ``self.loss_cls``.
            mask_targets: Mask targets indexed like ``labels``.
            mask_weights: Not used by this method.
            imgs_whwh: Not used by this method.
            reduction_override: Forwarded to ``self.loss_cls``.

        Returns:
            dict: ``loss_cls`` and ``pos_acc`` (only when ``cls_score`` is
            given and non-empty), ``loss_mask``, ``loss_dice`` and, when
            ``self.loss_rank`` is set, ``loss_rank``.
        """
        losses = dict()
        # Class index ``num_classes`` is background; everything in
        # [0, num_classes) is foreground (as in Sparse R-CNN, the number of
        # positives equals the number of matched ground truths).
        pos_inds = (labels >= 0) & (labels < self.num_classes)
        avg_factor = reduce_mean(pos_inds.sum().float()).clamp_(min=1.0)

        num_preds = mask_pred.shape[0] * mask_pred.shape[1]

        if cls_score is not None:
            assert mask_pred.shape[0] == cls_score.shape[0]
            assert mask_pred.shape[1] == cls_score.shape[1]
            if cls_score.numel() > 0:
                flat_scores = cls_score.view(num_preds, -1)
                losses['loss_cls'] = self.loss_cls(
                    flat_scores,
                    labels,
                    label_weights,
                    avg_factor=avg_factor,
                    reduction_override=reduction_override)
                losses['pos_acc'] = accuracy(flat_scores[pos_inds],
                                             labels[pos_inds])

        if mask_pred is None:
            return losses

        H, W = mask_pred.shape[-2:]
        if not pos_inds.any():
            # No positives: emit zero-valued terms derived from mask_pred.
            losses['loss_mask'] = mask_pred.sum() * 0
            losses['loss_dice'] = mask_pred.sum() * 0
            if self.loss_rank is not None:
                losses['loss_rank'] = mask_pred.sum() * 0
            return losses

        # Mask losses are computed on the positive predictions only.
        fg_selector = pos_inds.type(torch.bool)
        fg_pred = mask_pred.reshape(num_preds, H, W)[fg_selector]
        fg_target = mask_targets[fg_selector]
        losses['loss_mask'] = self.loss_mask(fg_pred, fg_target)
        losses['loss_dice'] = self.loss_dice(fg_pred, fg_target)

        if self.loss_rank is None:
            return losses

        # Rank target: every pixel covered by a positive's target mask gets
        # the index of that positive; all other pixels keep ignore_label.
        batch_size = mask_pred.size(0)
        rank_target = mask_targets.new_full((batch_size, H, W),
                                            self.ignore_label,
                                            dtype=torch.long)
        pos_locations = pos_inds.view(batch_size, -1).nonzero(as_tuple=False)
        per_img_targets = mask_targets.view(batch_size, -1, H, W).bool()
        for img_idx in range(batch_size):
            in_this_img = pos_locations[:, 0] == img_idx
            for query_idx in pos_locations[:, 1][in_this_img]:
                covered = per_img_targets[img_idx][query_idx]
                rank_target[img_idx][covered] = query_idx
        losses['loss_rank'] = self.loss_rank(
            mask_pred, rank_target, ignore_index=self.ignore_label)

        return losses

    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
                           cfg):
        """Build classification and mask targets for a single sample.

        Args:
            pos_inds (Tensor): Indices of the positive predictions.
            neg_inds (Tensor): Indices of the negative predictions.
            pos_mask (Tensor): Masks of the positive predictions. Only its
                size, dtype and device are used here.
            neg_mask (Tensor): Masks of the negative predictions. Only its
                first dimension is used here.
            pos_gt_mask (Tensor): Ground-truth masks matched to the
                positives.
            pos_gt_labels (Tensor): Ground-truth labels matched to the
                positives.
            gt_sem_seg (Tensor | None): Stuff masks, one per entry of
                ``gt_sem_cls``.
            gt_sem_cls (Tensor | None): Stuff class ids; they are shifted by
                ``self.num_thing_classes`` to index the stuff slots.
            cfg: Training config; only ``cfg.pos_weight`` is read.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights)``.
            When both ``gt_sem_seg`` and ``gt_sem_cls`` are given,
            ``self.num_stuff_classes`` extra rows are appended to each
            tensor and the stuff columns of the original ``label_weights``
            rows are zeroed.
        """
        num_pos = pos_mask.size(0)
        num_neg = neg_mask.size(0)
        num_samples = num_pos + num_neg
        H, W = pos_mask.shape[-2:]
        # Background uses class id ``num_classes``; foreground ids are
        # [0, num_classes - 1], so labels start filled with num_classes.
        labels = pos_mask.new_full((num_samples, ),
                                   self.num_classes,
                                   dtype=torch.long)
        label_weights = pos_mask.new_zeros((num_samples, self.num_classes))
        mask_targets = pos_mask.new_zeros(num_samples, H, W)
        mask_weights = pos_mask.new_zeros(num_samples, H, W)
        if num_pos > 0:
            labels[pos_inds] = pos_gt_labels
            # A non-positive configured weight falls back to 1.0.
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[pos_inds] = pos_weight
            mask_targets[pos_inds, ...] = pos_gt_mask
            mask_weights[pos_inds, ...] = 1

        if num_neg > 0:
            label_weights[neg_inds] = 1.0

        if gt_sem_cls is not None and gt_sem_seg is not None:
            sem_labels = pos_mask.new_full((self.num_stuff_classes, ),
                                           self.num_classes,
                                           dtype=torch.long)
            sem_targets = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            sem_weights = pos_mask.new_zeros(self.num_stuff_classes, H, W)
            # Each stuff row is only weighted on its own stuff class column.
            sem_stuff_weights = torch.eye(
                self.num_stuff_classes, device=pos_mask.device)
            sem_thing_weights = pos_mask.new_zeros(
                (self.num_stuff_classes, self.num_thing_classes))
            sem_label_weights = torch.cat(
                [sem_thing_weights, sem_stuff_weights], dim=-1)
            # Fixed: the comparison used to sit inside ``len(...)``.
            if len(gt_sem_cls) > 0:
                sem_inds = gt_sem_cls - self.num_thing_classes
                sem_inds = sem_inds.long()
                sem_labels[sem_inds] = gt_sem_cls.long()
                sem_targets[sem_inds] = gt_sem_seg
                sem_weights[sem_inds] = 1

            # The non-stuff rows do not supervise the stuff columns.
            label_weights[:, self.num_thing_classes:] = 0
            labels = torch.cat([labels, sem_labels])
            label_weights = torch.cat([label_weights, sem_label_weights])
            mask_targets = torch.cat([mask_targets, sem_targets])
            mask_weights = torch.cat([mask_weights, sem_weights])

        return labels, label_weights, mask_targets, mask_weights

    def get_targets(self,
                    sampling_results,
                    rcnn_train_cfg,
                    concat=True,
                    gt_sem_seg=None,
                    gt_sem_cls=None):
        """Collect per-sample targets for a batch of sampling results.

        Args:
            sampling_results (list): One sampling result per sample; the
                attributes ``pos_inds``, ``neg_inds``, ``pos_masks``,
                ``neg_masks``, ``pos_gt_masks`` and ``pos_gt_labels`` are
                read from each.
            rcnn_train_cfg: Passed to ``_get_target_single`` as ``cfg``.
            concat (bool): Concatenate the per-sample targets along
                dimension 0 when True; otherwise return per-sample lists.
            gt_sem_seg (list | None): Per-sample stuff masks. When None,
                both ``gt_sem_seg`` and ``gt_sem_cls`` are replaced by
                lists of None.
            gt_sem_cls (list | None): Per-sample stuff class ids.

        Returns:
            tuple: ``(labels, label_weights, mask_targets, mask_weights)``.
        """
        num_imgs = len(sampling_results)
        if gt_sem_seg is None:
            gt_sem_seg = [None] * num_imgs
            gt_sem_cls = [None] * num_imgs

        # Gather each required attribute across all sampling results.
        field_names = ('pos_inds', 'neg_inds', 'pos_masks', 'neg_masks',
                       'pos_gt_masks', 'pos_gt_labels')
        per_field = [[getattr(res, name) for res in sampling_results]
                     for name in field_names]

        labels, label_weights, mask_targets, mask_weights = multi_apply(
            self._get_target_single,
            *per_field,
            gt_sem_seg,
            gt_sem_cls,
            cfg=rcnn_train_cfg)

        targets = (labels, label_weights, mask_targets, mask_weights)
        if concat:
            targets = tuple(torch.cat(item, 0) for item in targets)
        return targets

    def rescale_masks(self, masks_per_img, img_meta):
        """Turn mask logits into probabilities at the original image size.

        The logits are passed through a sigmoid, resized bilinearly to
        ``img_meta['batch_input_shape']``, cropped to the height and width
        in ``img_meta['img_shape']`` and finally resized bilinearly to the
        first two entries of ``img_meta['ori_shape']``.

        Args:
            masks_per_img (Tensor): Mask logits of one image; a leading
                batch dimension is added and removed internally.
            img_meta (dict): Must provide ``img_shape``,
                ``batch_input_shape`` and ``ori_shape``.

        Returns:
            Tensor: Mask probabilities at the original image size.
        """
        img_h, img_w, _ = img_meta['img_shape']
        bilinear = dict(mode='bilinear', align_corners=False)

        probs = masks_per_img.unsqueeze(0).sigmoid()
        padded = F.interpolate(
            probs, size=img_meta['batch_input_shape'], **bilinear)

        # Drop the padded border before going back to the original size.
        cropped = padded[:, :, :img_h, :img_w]
        target_size = img_meta['ori_shape'][:2]
        return F.interpolate(cropped, size=target_size, **bilinear).squeeze(0)

    def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
                      test_cfg, img_meta):
        """Produce per-class box and mask results for one image.

        Args:
            masks_per_img: Mask predictions passed to ``rescale_masks``.
            labels_per_img: Predicted labels, one per mask.
            scores_per_img: Predicted scores, one per mask.
            test_cfg: Test config; ``test_cfg.mask_thr`` is the
                binarisation threshold.
            img_meta (dict): Image meta passed to ``rescale_masks``.

        Returns:
            tuple: ``(bbox_result, segm_result)`` from ``segm2result``.
        """
        # Resize predictions back, then binarise with the test threshold.
        probs = self.rescale_masks(masks_per_img, img_meta)
        binary_masks = probs > test_cfg.mask_thr
        boxes, segms = self.segm2result(binary_masks, labels_per_img,
                                        scores_per_img)
        return boxes, segms

    def segm2result(self, mask_preds, det_labels, cls_scores):
        """Group masks and placeholder boxes by predicted class.

        Args:
            mask_preds (Tensor): One mask per instance.
            det_labels (Tensor): Class index of each instance.
            cls_scores (Tensor): Score of each instance.

        Returns:
            tuple: ``(bbox_result, segm_result)``. ``bbox_result`` is a list
            with one ``(k, 5)`` float32 array per class whose first four
            columns are zero and whose last column is the score;
            ``segm_result`` is a list with one list of mask arrays per
            class.
        """
        masks_np = mask_preds.cpu().numpy()
        labels_np = det_labels.cpu().numpy()
        scores_np = cls_scores.cpu().numpy()
        class_range = range(self.num_classes)

        # Fake boxes: coordinates stay zero, only the score column is set.
        fake_boxes = np.zeros((masks_np.shape[0], 5), dtype=np.float32)
        fake_boxes[:, -1] = scores_np
        bbox_result = [fake_boxes[labels_np == cls_id, :]
                       for cls_id in class_range]

        segm_result = [[] for _ in class_range]
        for idx, mask in enumerate(masks_np):
            segm_result[labels_np[idx]].append(mask)
        return bbox_result, segm_result

    def get_seg_masks_tracking(self, masks_per_img, labels_per_img, scores_per_img, ids_per_img,
                      test_cfg, img_meta):
        """Produce tracking-style box and mask results for one image.

        Args:
            masks_per_img: Mask predictions passed to ``rescale_masks``.
            labels_per_img: Predicted labels, one per mask.
            scores_per_img: Predicted scores, written into the last column
                of the placeholder boxes.
            ids_per_img: Track ids, one per mask.
            test_cfg: Test config; ``test_cfg.mask_thr`` is the
                binarisation threshold.
            img_meta (dict): Image meta passed to ``rescale_masks``.

        Returns:
            tuple: The ``bbox_results`` and ``mask_results`` entries of the
            dict returned by ``outs2results``.
        """
        # Resize predictions back and binarise them.
        binary_masks = (
            self.rescale_masks(masks_per_img, img_meta) > test_cfg.mask_thr)

        # Fake boxes: zero coordinates, score in the last column.
        fake_boxes = torch.zeros((masks_per_img.shape[0], 5),
                                 dtype=torch.float32)
        fake_boxes[:, -1] = scores_per_img

        tracks = outs2results(
            bboxes=fake_boxes,
            labels=labels_per_img,
            masks=binary_masks,
            ids=ids_per_img,
            num_classes=self.num_classes,
        )
        return tracks['bbox_results'], tracks['mask_results']


================================================
FILE: knet_vis/tracker/mask_hungarian_assigner.py
================================================
import numpy as np
import torch

from mmdet.core import AssignResult, BaseAssigner
from mmdet.core.bbox.builder import BBOX_ASSIGNERS
from mmdet.core.bbox.match_costs.builder import build_match_cost

try:
    from scipy.optimize import linear_sum_assignment
except ImportError:
    linear_sum_assignment = None


@BBOX_ASSIGNERS.register_module()
class MaskHungarianAssignerVideo(BaseAssigner):
    """One-to-one Hungarian matching between query masks and GT instances.

    Every query predicts one mask per frame. The ground-truth masks that
    share an instance id are stacked over the frames (an all-zero mask is
    used for frames in which the instance does not appear) and matched
    against the per-query mask stacks. The matching cost is the sum of the
    classification, mask, dice and optional boundary costs.

    The targets do not include "no object", so there are generally more
    predictions than targets. Un-matched predictions are treated as
    background, so each query is assigned

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        cls_cost (dict): Config of the classification cost.
        mask_cost (dict): Config of the mask cost.
        dice_cost (dict): Config of the dice cost. It is handed to
            ``build_match_cost`` unchanged, so callers are expected to pass
            a complete config rather than rely on the empty default.
        boundary_cost (dict, optional): Config of the boundary cost. No
            boundary cost is used when None.
        topk (int): Number of matching rounds. With ``topk > 1`` the rows
            matched in a round get a cost of 1e10 before the next round and
            the matches of all rounds are concatenated.
    """

    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 mask_cost=dict(type='SigmoidCost', weight=1.0),
                 dice_cost=dict(),
                 boundary_cost=None,
                 topk=1):
        self.cls_cost = build_match_cost(cls_cost)
        self.mask_cost = build_match_cost(mask_cost)
        self.dice_cost = build_match_cost(dice_cost)
        if boundary_cost is not None:
            self.boundary_cost = build_match_cost(boundary_cost)
        else:
            self.boundary_cost = None
        self.topk = topk

    @staticmethod
    def _stack_gt_masks(instances, gt_bboxes, gt_labels, gt_instance_ids,
                        num_frames, h, w):
        """Stack the per-frame GT masks and the label of every instance."""
        # The per-frame lookups do not depend on the instance, so they are
        # computed once instead of once per instance.
        frame_instance_ids = []
        frame_label_ids = []
        for frame_id in range(num_frames):
            ids_in_frame = gt_instance_ids[gt_instance_ids[:, 0] == frame_id,
                                           1]
            labels_in_frame = gt_labels[gt_labels[:, 0] == frame_id, 1]
            # Instance ids and labels must describe the same objects.
            assert len(ids_in_frame) == len(labels_in_frame)
            frame_instance_ids.append(ids_in_frame)
            frame_label_ids.append(labels_in_frame)

        gt_masks = []
        gt_label_list = []
        for instance_id in instances:
            present_frames = gt_instance_ids[
                gt_instance_ids[:, 1] == instance_id, 0]
            instance_masks = []
            gt_label_id = None
            for frame_id in range(num_frames):
                if frame_id in present_frames:
                    gt_index = torch.nonzero(
                        frame_instance_ids[frame_id] == instance_id,
                        as_tuple=True)[0].item()
                    gt_mask_frame = gt_bboxes[frame_id][gt_index]
                    frame_label = frame_label_ids[frame_id][gt_index].item()
                    if gt_label_id is None:
                        gt_label_id = frame_label
                    # An instance must keep its class across frames.
                    assert gt_label_id == frame_label
                else:
                    # Instance absent in this frame: empty target mask.
                    gt_mask_frame = torch.zeros(
                        (h, w),
                        device=present_frames.device,
                        dtype=torch.float)
                instance_masks.append(gt_mask_frame)
            gt_masks.append(torch.stack(instance_masks))
            gt_label_list.append(gt_label_id)

        gt_masks = torch.stack(gt_masks)
        gt_labels_tensor = torch.tensor(
            gt_label_list, device=gt_masks.device, dtype=torch.long)
        return gt_masks, gt_labels_tensor

    def assign(self,
               bbox_pred,
               cls_pred,
               gt_bboxes,
               gt_labels,
               gt_instance_ids,
               img_meta=None,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Computes one-to-one matching based on the weighted costs.

        The parameter names follow the box assigner interface, but the
        inputs handled here are masks. In ``assigned_gt_inds`` -1 means
        don't care, 0 means negative sample and a positive number is the
        1-based index of the assigned gt. The steps are:

        1. assign every prediction to -1
        2. compute the weighted costs
        3. do Hungarian matching on CPU based on the costs
        4. assign all to 0 (background) first, then give each matched
           prediction its gt index (plus 1) and label.

        Args:
            bbox_pred (Tensor): Predicted masks, indexed as
                ``[frame, query, h, w]``.
            cls_pred (Tensor | None): Classification scores of the queries.
                The classification cost is skipped when None.
            gt_bboxes (Sequence[Tensor]): Per-frame ground-truth masks,
                indexed as ``gt_bboxes[frame][object]``.
            gt_labels (Tensor): Rows of ``(frame_id, label)``.
            gt_instance_ids (Tensor): Rows of ``(frame_id, instance_id)``,
                in the same per-frame order as ``gt_labels``.
            img_meta (dict, optional): Not used.
            gt_bboxes_ignore (None): Must be None.
            eps (float): Not used.

        Returns:
            tuple: ``(assign_result, gt_masks_match)`` where
            ``assign_result`` is an :obj:`AssignResult` and
            ``gt_masks_match`` holds the stacked gt masks reshaped to
            ``(num_gts, -1, w)``. The same 2-tuple is returned when there
            are no gts or no predictions.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        instances = torch.unique(gt_instance_ids[:, 1])
        num_frames = bbox_pred.size(0)
        h, w = bbox_pred.shape[-2:]
        num_gts, num_bboxes = len(instances), bbox_pred.size(1)

        if num_gts > 0:
            gt_masks, gt_labels_tensor = self._stack_gt_masks(
                instances, gt_bboxes, gt_labels, gt_instance_ids, num_frames,
                h, w)
            gt_masks_match = gt_masks.reshape((num_gts, -1, w))
        else:
            # torch.stack cannot handle an empty list, so the empty target
            # is built directly with an explicit shape.
            gt_labels_tensor = None
            gt_masks_match = bbox_pred.new_zeros((0, num_frames * h, w),
                                                 dtype=torch.float)

        # 1. assign -1 by default
        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
                                              -1,
                                              dtype=torch.long)
        assigned_labels = bbox_pred.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)
        if num_gts == 0 or num_bboxes == 0:
            # No ground truth or predictions, return empty assignment
            if num_gts == 0:
                # No ground truth, assign all to background
                assigned_gt_inds[:] = 0
            # Same 2-tuple as the regular path so callers can unpack it.
            return AssignResult(
                num_gts, assigned_gt_inds, None,
                labels=assigned_labels), gt_masks_match

        # 2. compute the weighted costs
        # Each query's masks over all frames are stacked along the height
        # so they line up with gt_masks_match.
        pred_masks_match = torch.einsum('fqhw->qfhw', bbox_pred).reshape(
            (num_bboxes, -1, w))
        if self.cls_cost.weight != 0 and cls_pred is not None:
            cls_cost = self.cls_cost(cls_pred, gt_labels_tensor)
        else:
            cls_cost = 0
        if self.mask_cost.weight != 0:
            reg_cost = self.mask_cost(pred_masks_match, gt_masks_match)
        else:
            reg_cost = 0
        if self.dice_cost.weight != 0:
            dice_cost = self.dice_cost(pred_masks_match, gt_masks_match)
        else:
            dice_cost = 0
        if self.boundary_cost is not None and self.boundary_cost.weight != 0:
            b_cost = self.boundary_cost(pred_masks_match, gt_masks_match)
        else:
            b_cost = 0
        cost = cls_cost + reg_cost + dice_cost + b_cost

        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu()
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        if self.topk == 1:
            matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
        else:
            topk_matched_row_inds = []
            topk_matched_col_inds = []
            for _ in range(self.topk):
                matched_row_inds, matched_col_inds = linear_sum_assignment(
                    cost)
                topk_matched_row_inds.append(matched_row_inds)
                topk_matched_col_inds.append(matched_col_inds)
                # Make the already matched rows unattractive next round.
                cost[matched_row_inds] = 1e10
            matched_row_inds = np.concatenate(topk_matched_row_inds)
            matched_col_inds = np.concatenate(topk_matched_col_inds)

        matched_row_inds = torch.from_numpy(matched_row_inds).to(
            bbox_pred.device)
        matched_col_inds = torch.from_numpy(matched_col_inds).to(
            bbox_pred.device)

        # 4. assign backgrounds and foregrounds
        # assign all indices to backgrounds first
        assigned_gt_inds[:] = 0
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gt_labels_tensor[matched_col_inds]
        return AssignResult(
            num_gts, assigned_gt_inds, None,
            labels=assigned_labels), gt_masks_match


================================================
FILE: knet_vis/tracker/positional_encoding.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
"""
Various positional encodings for the transformer.
"""
import math

import torch

from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING
from mmcv.runner import BaseModule


@POSITIONAL_ENCODING.register_module()
class PositionEmbeddingSine3D(BaseModule):
    """Sinusoidal position embedding over time, height and width.

    This follows the sine/cosine embedding of "Attention is all you need",
    extended to 5-dimensional ``(b, t, c, h, w)`` inputs. The height and
    width embeddings use ``num_feats`` channels each and are concatenated;
    the temporal embedding uses ``2 * num_feats`` channels and is added on
    top.

    Args:
        num_feats (int): Channels of the height and of the width embedding.
        temperature (int): Base of the frequency schedule.
        normalize (bool): Scale the cumulative positions to ``[0, scale]``.
        scale (float, optional): Only valid with ``normalize=True``;
            defaults to ``2 * pi``.
    """

    def __init__(self, num_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_feats
        self.temperature = temperature
        self.normalize = normalize
        if normalize is False and scale is not None:
            raise ValueError("normalize should be True if scale is passed")
        self.scale = 2 * math.pi if scale is None else scale

    def _frequencies(self, num_channels, device):
        """Return the per-channel divisors; channel pairs share a value."""
        index = torch.arange(num_channels, dtype=torch.float32, device=device)
        return self.temperature ** (2 * (index // 2) / num_channels)

    @staticmethod
    def _sin_cos(coords, dim_t):
        """Interleave sin (even channels) and cos (odd channels)."""
        scaled = coords[..., None] / dim_t
        pair = (scaled[..., 0::2].sin(), scaled[..., 1::2].cos())
        return torch.stack(pair, dim=5).flatten(4)

    def forward(self, x, mask=None):
        """Compute the embedding for ``x``.

        Args:
            x (Tensor): Input indexed as ``(b, t, c, h, w)``; only its
                shape and device are used.
            mask (Tensor, optional): Boolean ``(b, t, h, w)`` tensor whose
                True entries are excluded from the position count. Defaults
                to all False.

        Returns:
            Tensor: Embedding indexed as ``(b, t, 2 * num_feats, h, w)``.
        """
        assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead"
        if mask is None:
            batch, frames, _, height, width = x.shape
            mask = torch.zeros((batch, frames, height, width),
                               device=x.device,
                               dtype=torch.bool)

        # Position of each valid location along time, height and width.
        visible = ~mask
        z_embed = visible.cumsum(1, dtype=torch.float32)
        y_embed = visible.cumsum(2, dtype=torch.float32)
        x_embed = visible.cumsum(3, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            z_embed = z_embed / (z_embed[:, -1:] + eps) * self.scale
            y_embed = y_embed / (y_embed[:, :, -1:] + eps) * self.scale
            x_embed = x_embed / (x_embed[..., -1:] + eps) * self.scale

        spatial_dim_t = self._frequencies(self.num_pos_feats, x.device)
        temporal_dim_t = self._frequencies(self.num_pos_feats * 2, x.device)

        pos_x = self._sin_cos(x_embed, spatial_dim_t)
        pos_y = self._sin_cos(y_embed, spatial_dim_t)
        pos_z = self._sin_cos(z_embed, temporal_dim_t)

        pos = torch.cat((pos_y, pos_x), dim=4) + pos_z
        # (b, t, h, w, c) -> (b, t, c, h, w)
        return pos.permute(0, 1, 4, 2, 3)


================================================
FILE: knet_vis/tracker/semantic_fpn_wrapper3D.py
================================================
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, normal_init
from mmdet.models.builder import NECKS
from mmcv.cnn.bricks.transformer import build_positional_encoding
from mmdet.utils import get_root_logger


@NECKS.register_module()
class SemanticFPNWrapper3D(nn.Module):
    """Semantic FPN (as used in Panoptic FPN) with optional 3D encoding.

    Each input level in ``[start_level, end_level]`` goes through its own
    stack of 3x3 convs (and 2x bilinear upsamples), the per-level outputs
    are fused by summation or concatenation, and an optional 1x1 conv
    produces the final feature.

    Args:
        in_channels (int): Channels of every input feature level.
        feat_channels (int): Channels produced by the per-level 3x3 convs.
        out_channels (int): Channels produced by the 1x1 prediction conv
            and by the auxiliary 1x1 convs.
        start_level (int): First input level that is used.
        end_level (int): Last input level that is used (inclusive).
        cat_coors (bool, optional): Concatenate two coordinate channels in
            ``[-1, 1]`` to the input of level ``cat_coors_level``.
            Defaults to False.
        positional_encoding (dict, optional): Config of a positional
            encoding whose class name must end with ``3D``; it is added to
            the input of level ``cat_coors_level``. Defaults to None.
        cat_coors_level (int, optional): Level that receives the
            coordinates and/or positional encoding. Defaults to 3.
        fuse_by_cat (bool, optional): Concatenate the per-level features
            along the channel axis instead of summing them. Defaults to
            False.
        return_list (bool, optional): Wrap the single output in a list.
            Defaults to False.
        upsample_times (int, optional): Controls how many 2x upsamples a
            level gets: in level ``i > 0`` conv ``j`` is followed by an
            upsample while ``j < upsample_times - (end_level - i)``. For
            level 0 it selects between one plain conv and
            ``end_level - upsample_times`` stride-2 convs. Defaults to 3.
        with_pred (bool, optional): Apply the 1x1 prediction conv to the
            fused feature. Defaults to True.
        num_aux_convs (int, optional): Number of extra 1x1 convs applied to
            the fused feature; when positive, ``forward`` returns a list.
            Defaults to 0.
        act_cfg (dict, optional): Activation config of the 3x3 convs.
        out_act_cfg (dict, optional): Activation config of the 1x1 convs.
        conv_cfg (dict, optional): Conv config. Defaults to None.
        norm_cfg (dict, optional): Norm config. Defaults to None.
    """

    def __init__(self,
                 in_channels,
                 feat_channels,
                 out_channels,
                 start_level,
                 end_level,
                 cat_coors=False,
                 positional_encoding=None,
                 cat_coors_level=3,
                 fuse_by_cat=False,
                 return_list=False,
                 upsample_times=3,
                 with_pred=True,
                 num_aux_convs=0,
                 act_cfg=dict(type='ReLU', inplace=True),
                 out_act_cfg=dict(type='ReLU'),
                 conv_cfg=None,
                 norm_cfg=None):
        super().__init__()

        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.start_level = start_level
        self.end_level = end_level
        assert start_level >= 0 and end_level >= start_level
        self.out_channels = out_channels
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.cat_coors = cat_coors
        self.cat_coors_level = cat_coors_level
        self.fuse_by_cat = fuse_by_cat
        self.return_list = return_list
        self.upsample_times = upsample_times
        self.with_pred = with_pred
        if positional_encoding is not None:
            self.positional_encoding = build_positional_encoding(
                positional_encoding)
        else:
            self.positional_encoding = None

        self.convs_all_levels = nn.ModuleList()
        for i in range(self.start_level, self.end_level + 1):
            convs_per_level = nn.Sequential()
            # The level that receives coordinates has two extra channels.
            if i == self.cat_coors_level and self.cat_coors:
                first_chn = self.in_channels + 2
            else:
                first_chn = self.in_channels

            if i == 0:
                if upsample_times == self.end_level - i:
                    convs_per_level.add_module('conv' + str(i),
                                               self._conv3x3(first_chn))
                else:
                    # Downsample with stride-2 convs. A dedicated index is
                    # used so the level index is not shadowed, and only the
                    # first conv sees the raw input channels.
                    for k in range(self.end_level - upsample_times):
                        chn = first_chn if k == 0 else self.feat_channels
                        convs_per_level.add_module(
                            'conv' + str(k), self._conv3x3(chn, stride=2))
                self.convs_all_levels.append(convs_per_level)
                continue

            for j in range(i):
                chn = first_chn if j == 0 else self.feat_channels
                convs_per_level.add_module('conv' + str(j),
                                           self._conv3x3(chn))
                if j < upsample_times - (self.end_level - i):
                    one_upsample = nn.Upsample(
                        scale_factor=2, mode='bilinear', align_corners=False)
                    convs_per_level.add_module('upsample' + str(j),
                                               one_upsample)

            self.convs_all_levels.append(convs_per_level)

        if fuse_by_cat:
            in_channels = self.feat_channels * len(self.convs_all_levels)
        else:
            in_channels = self.feat_channels

        if self.with_pred:
            self.conv_pred = ConvModule(
                in_channels,
                self.out_channels,
                1,
                padding=0,
                conv_cfg=self.conv_cfg,
                act_cfg=out_act_cfg,
                norm_cfg=self.norm_cfg)

        self.num_aux_convs = num_aux_convs
        self.aux_convs = nn.ModuleList()
        for _ in range(num_aux_convs):
            self.aux_convs.append(
                ConvModule(
                    in_channels,
                    self.out_channels,
                    1,
                    padding=0,
                    conv_cfg=self.conv_cfg,
                    act_cfg=out_act_cfg,
                    norm_cfg=self.norm_cfg))

    def _conv3x3(self, in_channels, stride=1):
        """Build one 3x3 ConvModule that outputs ``feat_channels``."""
        return ConvModule(
            in_channels,
            self.feat_channels,
            3,
            padding=1,
            stride=stride,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg,
            inplace=False)

    def init_weights(self):
        """Initialise every ``nn.Conv2d`` with a normal distribution."""
        logger = get_root_logger()
        logger.info('Use normal intialization for semantic FPN')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                normal_init(m, std=0.01)

    def generate_coord(self, input_feat):
        """Return ``(x, y)`` coordinate channels in ``[-1, 1]``.

        The result has the batch size and spatial size of ``input_feat``
        and two channels, x first.
        """
        x_range = torch.linspace(
            -1, 1, input_feat.shape[-1], device=input_feat.device)
        y_range = torch.linspace(
            -1, 1, input_feat.shape[-2], device=input_feat.device)
        y, x = torch.meshgrid(y_range, x_range)
        y = y.expand([input_feat.shape[0], 1, -1, -1])
        x = x.expand([input_feat.shape[0], 1, -1, -1])
        coord_feat = torch.cat([x, y], 1)
        return coord_feat

    def forward(self, inputs, num_imgs, num_frames):
        """Fuse the selected feature levels.

        Args:
            inputs (Sequence[Tensor]): Multi-level features, indexed by
                level; the leading dimension is ``num_imgs * num_frames``.
            num_imgs (int): Number of clips in the batch.
            num_frames (int): Number of frames per clip.

        Returns:
            Tensor | list[Tensor]: The fused feature. A list
            ``[out, aux_0, ...]`` when ``num_aux_convs > 0``, ``[out]``
            when ``return_list`` is set, otherwise the tensor itself.
        """
        mlvl_feats = []
        for i in range(self.start_level, self.end_level + 1):
            input_p = inputs[i]
            if i == self.cat_coors_level:
                if self.positional_encoding is not None:
                    # Un-flatten to (num_imgs, num_frames, ...) so the
                    # encoding can use the temporal axis, then flatten back.
                    input_p = input_p.view(num_imgs, num_frames, *input_p.size()[1:])
                    assert self.positional_encoding.__class__.__name__.endswith('3D')
                    positional_encoding = self.positional_encoding(input_p)
                    input_p = (input_p + positional_encoding).reshape(num_imgs * num_frames, *input_p.size()[2:])
                if self.cat_coors:
                    coord_feat = self.generate_coord(input_p)
                    input_p = torch.cat([input_p, coord_feat], 1)

            # convs_all_levels is filled starting at start_level, so the
            # level index has to be shifted before indexing it.
            level_convs = self.convs_all_levels[i - self.start_level]
            mlvl_feats.append(level_convs(input_p))

        if self.fuse_by_cat:
            feature_add_all_level = torch.cat(mlvl_feats, dim=1)
        else:
            feature_add_all_level = sum(mlvl_feats)

        if self.with_pred:
            out = self.conv_pred(feature_add_all_level)
        else:
            out = feature_add_all_level

        if self.num_aux_convs > 0:
            outs = [out]
            for conv in self.aux_convs:
                outs.append(conv(feature_add_all_level))
            return outs

        if self.return_list:
            return [out]
        else:
            return out


================================================
FILE: knet_vis/tracker/track.py
================================================
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F

from mmdet.models.builder import DETECTORS
from mmdet.models.detectors import TwoStageDetector
from mmdet.utils import get_root_logger
from mmdet.models import build_head

from knet_vis.det.utils import sem2ins_masks


@DETECTORS.register_module()
class KNetTrack(TwoStageDetector):
    """K-Net detector extended with one or more tracker heads for clips.

    For every clip the frames are flattened into one batch, passed through
    ``extract_feat`` and ``rpn_head``, optionally through ``roi_head``, and
    finally through ``tracker`` (followed by ``tracker_extra`` heads when
    ``tracker_num > 1``).

    Args:
        num_thing_classes (int): Number of thing classes, forwarded to
            ``sem2ins_masks``. Default: 80.
        num_stuff_classes (int): Number of stuff classes (stored only).
            Default: 53.
        mask_assign_stride (int): Ground-truth masks are resized to
            ``batch_input_shape // mask_assign_stride``. Default: 4.
        thing_label_in_seg (int): Stored only. Default: 0.
        direct_tracker (bool): If True the tracker consumes the ``rpn_head``
            outputs directly; otherwise it consumes the ``features`` dict
            returned by ``roi_head``. Default: False.
        tracker_num (int): Total number of tracker heads. Default: 1.
        tracker (dict | None): Config used to build the tracker head(s).
        train_cfg (optional): Training config; its ``tracker`` attribute is
            handed to the tracker head(s).
        test_cfg (optional): Testing config; its ``tracker`` attribute is
            handed to the tracker head(s).
    """

    def __init__(self,
                 *args,
                 num_thing_classes=80,
                 num_stuff_classes=53,
                 mask_assign_stride=4,
                 thing_label_in_seg=0,
                 direct_tracker=False,
                 tracker_num=1,
                 tracker=None,
                 train_cfg=None,
                 test_cfg=None,
                 **kwargs):
        # Pre-set so that `self.roi_head is not None` checks below are valid
        # even if the parent constructor does not assign a roi_head.
        self.roi_head = None  # init roi_head with None
        super().__init__(*args, **kwargs, train_cfg=train_cfg, test_cfg=test_cfg)
        assert self.with_rpn, 'KNet does not support external proposals'
        self.num_thing_classes = num_thing_classes
        self.num_stuff_classes = num_stuff_classes
        self.mask_assign_stride = mask_assign_stride
        self.thing_label_in_seg = thing_label_in_seg
        self.direct_tracker = direct_tracker
        self.tracker_num = tracker_num
        if tracker is not None:
            rcnn_train_cfg = train_cfg.tracker if train_cfg is not None else None
            tracker.update(train_cfg=rcnn_train_cfg)
            tracker.update(test_cfg=test_cfg.tracker)
            self.tracker = build_head(tracker)
            if self.tracker_num > 1:
                # The extra heads are built from the same config as the first.
                self.tracker_extra = nn.ModuleList(
                    [build_head(tracker) for _ in range(tracker_num - 1)]
                )
        logger = get_root_logger()
        logger.info(f'Model: \n{self}')

    @staticmethod
    def _pad_mask_to_batch(gt_mask, device, pad_H, pad_W):
        """Convert a mask container to a float tensor zero-padded on the
        right/bottom up to ``(pad_H, pad_W)``."""
        mask_tensor = gt_mask.to_tensor(torch.float, device)
        if gt_mask.width != pad_W or gt_mask.height != pad_H:
            pad_wh = (0, pad_W - gt_mask.width, 0, pad_H - gt_mask.height)
            mask_tensor = F.pad(mask_tensor, pad_wh, value=0)
        return mask_tensor

    @staticmethod
    def _resize_to_assign(mask_tensor, assign_size):
        """Bilinearly resize ``(N, H, W)`` masks to ``assign_size``; an empty
        stack yields an empty tensor of the target spatial size."""
        if mask_tensor.shape[0] == 0:
            return mask_tensor.new_zeros((mask_tensor.size(0), *assign_size))
        return F.interpolate(
            mask_tensor[None], assign_size,
            mode='bilinear',
            align_corners=False)[0]

    def gt_transform(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
        """Pad and downsample per-image ground truth to the assign size.

        Args:
            img_metas (list[dict]): ``img_metas[0]['batch_input_shape']``
                gives the padded batch size.
            gt_masks (list): One mask container per image.
            gt_labels (list[Tensor]): Only used to pick the target device.
            gt_semantic_seg (Sequence | None): Per-image semantic maps.

        Returns:
            tuple: ``(gt_masks_tensor, gt_sem_seg, gt_sem_cls)``. The last two
            are ``None`` when ``gt_semantic_seg`` is ``None`` and at least one
            image was processed.
        """
        # gt_masks and gt_semantic_seg are not padded when forming batch
        gt_masks_tensor = []
        gt_sem_seg = []
        gt_sem_cls = []
        # batch_input_shape should be the same across images
        pad_H, pad_W = img_metas[0]['batch_input_shape']
        assign_size = (pad_H // self.mask_assign_stride,
                       pad_W // self.mask_assign_stride)

        for i, gt_mask in enumerate(gt_masks):
            mask_tensor = self._pad_mask_to_batch(
                gt_mask, gt_labels[0].device, pad_H, pad_W)

            if gt_semantic_seg is not None:
                # gt_semantic seg is padded by 255 and
                # zero indicating the first class
                sem_labels, sem_seg = sem2ins_masks(
                    gt_semantic_seg[i],
                    num_thing_classes=self.num_thing_classes)
                if sem_seg.shape[0] == 0:
                    gt_sem_seg.append(
                        mask_tensor.new_zeros(
                            (mask_tensor.size(0), *assign_size))
                    )
                else:
                    gt_sem_seg.append(
                        F.interpolate(
                            sem_seg[None], assign_size,
                            mode='bilinear',
                            align_corners=False)[0]
                    )
                gt_sem_cls.append(sem_labels)

            else:
                gt_sem_seg = None
                gt_sem_cls = None

            gt_masks_tensor.append(
                self._resize_to_assign(mask_tensor, assign_size))
        return gt_masks_tensor, gt_sem_seg, gt_sem_cls

    def ref_gt_transform(self, ref_img_metas, ref_gt_masks, ref_gt_labels, ref_gt_semantic_seg=None ):
        """Pad and downsample the masks of every frame of every clip.

        Args:
            ref_img_metas (list[dict]): ``ref_img_metas[0]
                ['batch_input_shape']`` gives the padded batch size.
            ref_gt_masks (Sequence[Sequence]): Outer index is the batch item,
                inner index the frame; each entry is a mask container.
            ref_gt_labels (Sequence[Tensor]): Entry ``bs_i`` supplies the
                device for batch item ``bs_i``.
            ref_gt_semantic_seg (None): Must be ``None``; semantic maps are
                not supported for reference frames.

        Returns:
            tuple: ``(ref_gt_masks_tensor, None, None)`` where the first item
            is a nested list mirroring ``ref_gt_masks``.
        """
        # gt_masks and gt_semantic_seg are not padded when forming batch
        ref_gt_masks_tensor = []
        assert ref_gt_semantic_seg is None
        ref_gt_sem_seg = None
        ref_gt_sem_cls = None
        # batch_input_shape should be the same across images
        pad_H, pad_W = ref_img_metas[0]['batch_input_shape']
        assign_size = (pad_H // self.mask_assign_stride,
                       pad_W // self.mask_assign_stride)

        for bs_i, gt_mask_frame in enumerate(ref_gt_masks):
            device = ref_gt_labels[bs_i].device
            batch_cur_gt_masks_tensor = []
            for gt_mask in gt_mask_frame:
                mask_tensor = self._pad_mask_to_batch(
                    gt_mask, device, pad_H, pad_W)
                batch_cur_gt_masks_tensor.append(
                    self._resize_to_assign(mask_tensor, assign_size))
            ref_gt_masks_tensor.append(batch_cur_gt_masks_tensor)

        return ref_gt_masks_tensor, ref_gt_sem_seg, ref_gt_sem_cls

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes=None,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None,
                      gt_semantic_seg=None,
                      gt_instance_ids=None,
                      # references
                      ref_img=None,
                      ref_img_metas=None,
                      ref_gt_bboxes=None,
                      ref_gt_labels=None,
                      ref_gt_bboxes_ignore=None,
                      ref_gt_masks=None,
                      ref_gt_instance_ids=None,
                      **kwargs):
        """Compute the training losses for a batch of clips.

        Only the ``ref_*`` inputs are used for supervision; ``ref_img`` must
        be a 5-D tensor ``(bs, num_frame, C, h, w)``.

        Returns:
            dict: Losses from ``rpn_head``, ``roi_head`` (if configured) and
            the tracker head(s). Losses of extra trackers are stored under
            ``extra_m{i}_{key}``.

        Raises:
            RuntimeError: If ``direct_tracker`` is False but no ``roi_head``
                produced the features the tracker needs.
        """
        # Deliberately skips TwoStageDetector.forward_train and calls the
        # next implementation in the MRO.
        super(TwoStageDetector, self).forward_train(img, img_metas)
        assert proposals is None, 'KNet does not support external proposals'
        assert gt_masks is not None

        # `img_metas` (not `ref_img_metas`) supplies batch_input_shape here.
        ref_gt_masks, ref_gt_sem_seg, ref_gt_sem_cls  = \
            self.ref_gt_transform(img_metas, ref_gt_masks, ref_gt_labels, ref_gt_semantic_seg=None)
        bs, num_frame, channels, h, w = ref_img.size()
        # Fold the frame axis into the batch axis for the backbone.
        x = self.extract_feat(ref_img.reshape(bs * num_frame, channels, h, w))

        losses = dict()

        rpn_losses, proposal_feats, x_feats, mask_preds, cls_scores = \
            self.rpn_head.forward_train(x, img_metas, ref_img_metas, ref_gt_masks, ref_gt_labels,
                                        ref_gt_instance_ids, ref_gt_sem_seg, ref_gt_sem_cls)
        losses.update(rpn_losses)

        features = None
        if self.roi_head is not None:
            roi_losses, features = self.roi_head.forward_train(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                ref_img_metas,
                ref_gt_masks,
                ref_gt_labels,
                gt_bboxes_ignore=ref_gt_bboxes_ignore,
                gt_bboxes=ref_gt_bboxes,
                gt_sem_seg=ref_gt_sem_seg,
                gt_sem_cls=ref_gt_sem_cls,
                imgs_whwh=None)
            losses.update(roi_losses)

        if self.direct_tracker:
            # Start the tracker from the initial kernels of the rpn head,
            # shared (expanded, not copied) across the batch.
            proposal_feats = self.rpn_head.init_kernels.weight.clone()
            proposal_feats = proposal_feats[None].expand(bs, *proposal_feats.size())
            if mask_preds.shape[0] == bs * num_frame:
                # Unfold the frame axis again.
                mask_preds = mask_preds.reshape((bs, num_frame, *mask_preds.size()[1:]))
                x_feats = x_feats.reshape((bs, num_frame, *x_feats.size()[1:]))
            else:
                assert mask_preds.size()[:2] == (bs, num_frame)
                assert x_feats.size()[:2] == (bs, num_frame)

            tracker_losses, features = self.tracker.forward_train(
                x=x_feats,
                ref_img_metas=ref_img_metas,
                cls_scores=None,
                masks=mask_preds,
                obj_feats=proposal_feats,
                ref_gt_masks=ref_gt_masks,
                ref_gt_labels=ref_gt_labels,
                ref_gt_instance_ids=ref_gt_instance_ids,
            )
            if self.tracker_num > 1:
                # Each extra tracker refines the output of the previous one.
                for i in range(self.tracker_num - 1):
                    _tracker_losses, features = self.tracker_extra[i].forward_train(
                        x=features['x_feats'],
                        ref_img_metas=ref_img_metas,
                        cls_scores=None,
                        masks=features['masks'],
                        obj_feats=features['obj_feats'],
                        ref_gt_masks=ref_gt_masks,
                        ref_gt_labels=ref_gt_labels,
                        ref_gt_instance_ids=ref_gt_instance_ids,
                    )
                    for key, value in _tracker_losses.items():
                        tracker_losses[f'extra_m{i}_{key}'] = value
        else:
            if features is None:
                raise RuntimeError(
                    'KNetTrack requires a roi_head when direct_tracker is '
                    'False: the tracker consumes the roi_head features.')
            tracker_losses, _ = self.tracker.forward_train(
                x=features['x_feats'],
                ref_img_metas=ref_img_metas,
                cls_scores=features['cls_scores'],
                masks=features['masks'],
                obj_feats=features['obj_feats'],
                ref_gt_masks=ref_gt_masks,
                ref_gt_labels=ref_gt_labels,
                ref_gt_instance_ids=ref_gt_instance_ids,
            )

        losses.update(tracker_losses)
        return losses

    def forward_test(self, imgs, img_metas, **kwargs):
        """
        Args:
            imgs (List[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains all images in the batch.
            img_metas (List[List[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch.
            **kwargs: Must contain ``ref_img`` and ``ref_img_metas`` (lists
                indexed by augmentation) when a single augmentation is used.
        """
        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
            if not isinstance(var, list):
                raise TypeError(f'{name} must be a list, but got {type(var)}')

        num_augs = len(imgs)
        if num_augs != len(img_metas):
            raise ValueError(f'num of augmentations ({len(imgs)}) '
                             f'!= num of image meta ({len(img_metas)})')

        # NOTE the batched image size information may be useful, e.g.
        # in DETR, this is needed for the construction of masks, which is
        # then used for the transformer_head.
        for img, img_meta in zip(imgs, img_metas):
            batch_size = len(img_meta)
            for img_id in range(batch_size):
                img_meta[img_id]['batch_input_shape'] = tuple(img.size()[-2:])

        if num_augs == 1:
            # proposals (List[List[Tensor]]): the outer list indicates
            # test-time augs (multiscale, flip, etc.) and the inner list
            # indicates images in a batch.
            # The Tensor should have a shape Px4, where P is the number of
            # proposals.
            if 'proposals' in kwargs:
                kwargs['proposals'] = kwargs['proposals'][0]
            # Strip the augmentation axis from the reference inputs as well.
            kwargs['ref_img_metas'] = kwargs['ref_img_metas'][0]
            kwargs['ref_img'] = kwargs['ref_img'][0]
            return self.simple_test(imgs[0], img_metas[0], **kwargs)
        else:
            assert imgs[0].size(0) == 1, 'aug test does not support ' \
                                         'inference with batch size ' \
                                         f'{imgs[0].size(0)}'
            # TODO: support test augmentation for predefined proposals
            assert 'proposals' not in kwargs
            return self.aug_test(imgs, img_metas, **kwargs)

    def simple_test(self, imgs, img_metas, **kwargs):
        """Run inference on one batch of clips without augmentation.

        Args:
            imgs: Unused; the frames are read from ``kwargs['ref_img']``.
            img_metas (list[dict]): Meta information of the key images.
            **kwargs: Must contain ``ref_img`` (5-D tensor
                ``(bs, num_frame, C, h, w)``) and ``ref_img_metas``.

        Returns:
            The segmentation results of the last tracker head.

        Raises:
            RuntimeError: If ``direct_tracker`` is False but no ``roi_head``
                produced the features the tracker needs.
        """
        ref_img = kwargs['ref_img']
        ref_img_metas = kwargs['ref_img_metas']
        # Step 1 extract features and get masks
        bs, num_frame, channels, h, w = ref_img.size()
        x = self.extract_feat(ref_img.reshape(bs * num_frame, channels, h, w))

        proposal_feats, x_feats, mask_preds, cls_scores, seg_preds = \
            self.rpn_head.simple_test_rpn(x, img_metas, ref_img_metas)

        features = None
        if self.roi_head is not None:
            segm_results_single_frame, features = self.roi_head.simple_test(
                x_feats,
                proposal_feats,
                mask_preds,
                cls_scores,
                img_metas,
                ref_img_metas,
                imgs_whwh=None,
                rescale=True
            )

        if self.direct_tracker:
            # Start the tracker from the initial kernels of the rpn head.
            proposal_feats = self.rpn_head.init_kernels.weight.clone()
            proposal_feats = proposal_feats[None].expand(bs, *proposal_feats.size())
            if mask_preds.shape[0] == bs * num_frame:
                mask_preds = mask_preds.reshape((bs, num_frame, *mask_preds.size()[1:]))
                x_feats = x_feats.reshape((bs, num_frame, *x_feats.size()[1:]))
            else:
                assert mask_preds.size()[:2] == (bs, num_frame)
                assert x_feats.size()[:2] == (bs, num_frame)
            segm_results, features = self.tracker.simple_test(
                x=x_feats,
                img_metas=img_metas,
                ref_img_metas=ref_img_metas,
                cls_scores=None,
                masks=mask_preds,
                obj_feats=proposal_feats,
            )
            if self.tracker_num > 1:
                # Only the results of the last tracker are returned.
                for i in range(self.tracker_num - 1):
                    segm_results, features = self.tracker_extra[i].simple_test(
                        x=features['x_feats'],
                        img_metas=img_metas,
                        ref_img_metas=ref_img_metas,
                        cls_scores=None,
                        masks=features['masks'],
                        obj_feats=features['obj_feats'],
                    )
        else:
            if features is None:
                raise RuntimeError(
                    'KNetTrack requires a roi_head when direct_tracker is '
                    'False: the tracker consumes the roi_head features.')
            segm_results, _ = self.tracker.simple_test(
                x=features['x_feats'],
                img_metas=img_metas,
                ref_img_metas=ref_img_metas,
                cls_scores=features['cls_scores'],
                masks=features['masks'],
                obj_feats=features['obj_feats'],
            )

        return segm_results

    def forward_dummy(self, img):
        """Used for computing network flops.

        See `mmdetection/tools/get_flops.py`
        """
        # NOTE(review): simple_test() calls simple_test_rpn with three
        # arguments and treats roi_head as optional; this method passes two
        # arguments and uses roi_head unconditionally - confirm it still
        # works with the heads used by this detector.
        # backbone
        x = self.extract_feat(img)
        # rpn
        num_imgs = len(img)
        dummy_img_metas = [
            dict(img_shape=(800, 1333, 3)) for _ in range(num_imgs)
        ]
        rpn_results = self.rpn_head.simple_test_rpn(x, dummy_img_metas)
        (proposal_feats, x_feats, mask_preds, cls_scores,
         seg_preds) = rpn_results
        # roi_head
        roi_outs = self.roi_head.forward_dummy(x_feats, proposal_feats, dummy_img_metas)
        return roi_outs

    def init_weights(self):
        """Initialise weights; with a ``Pretrained`` ``init_cfg`` every
        tracker head is additionally initialised from the checkpoint
        weights stored under the ``roi_head`` prefix."""
        super().init_weights()
        if self.init_cfg is not None and self.init_cfg['type'] == 'Pretrained':
            tracker_heads = [self.tracker]
            if self.tracker_num > 1:
                tracker_heads.extend(self.tracker_extra)
            for head in tracker_heads:
                assert head.init_cfg is None
                head.init_cfg = copy.deepcopy(self.init_cfg)
                head.init_cfg['prefix'] = 'roi_head'
                head.init_weights()


================================================
FILE: mmtrack/datasets/coco_video_dataset.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import random

import numpy as np
from mmcv.utils import print_log
from mmdet.datasets import DATASETS, CocoDataset
from terminaltables import AsciiTable

from mmdet.utils import get_root_logger
from .parsers import CocoVID


@DATASETS.register_module()
class CocoVideoDataset(CocoDataset):
    """Base coco video dataset for VID, MOT and SOT tasks.

    Args:
        load_as_video (bool): If True, using COCOVID class to load dataset,
            otherwise, using COCO class. Default: True.
        key_img_sampler (dict): Configuration of sampling key images.
        ref_img_sampler (dict): Configuration of sampling ref images.
        test_load_ann (bool): If True, loading annotations during testing,
            otherwise, not loading. Default: False.
    """

    CLASSES = None

    def __init__(self,
                 load_as_video=True,
                 key_img_sampler=dict(interval=1),
                 ref_img_sampler=dict(
                     frame_range=10,
                     stride=1,
                     num_ref_imgs=1,
                     filter_key_img=True,
                     method='uniform',
                     return_key_img=True),
                 test_load_ann=False,
                 load_all_frames=False,
                 *args,
                 **kwargs):
        """Store the sampling options and build the underlying dataset.

        Args:
            load_as_video (bool): Load the annotation file with ``CocoVID``
                instead of the parent's loader.
            key_img_sampler (dict | None): Keyword arguments for
                ``key_img_sampling``.
            ref_img_sampler (dict | None): Keyword arguments for
                ``ref_img_sampling``.
            test_load_ann (bool): Load annotations in test mode as well.
            load_all_frames (bool): Treat every video as one sample; requires
                ``ref_img_sampler=None``.
        """
        self.load_as_video = load_as_video
        # Keep private shallow copies: `ref_img_sampling` writes into
        # `self.ref_img_sampler`, which must not leak into the shared default
        # argument or into the caller's config.
        self.key_img_sampler = (
            key_img_sampler.copy() if key_img_sampler is not None else None)
        self.ref_img_sampler = (
            ref_img_sampler.copy() if ref_img_sampler is not None else None)
        self.test_load_ann = test_load_ann
        self.load_all_frames = load_all_frames
        assert not (self.load_all_frames and ref_img_sampler is not None), "load all frames indicate no sampler"
        super().__init__(*args, **kwargs)
        self.logger = get_root_logger()

    def load_annotations(self, ann_file):
        """Read the annotation file in COCOVID or plain COCO style.

        Args:
            ann_file (str): Path of annotation file.

        Returns:
            list[dict]: Annotation information from COCO/COCOVID api.
        """
        if self.load_as_video:
            return self.load_video_anns(ann_file)
        # Non-video data falls back to the parent's loader.
        return super().load_annotations(ann_file)

    def load_video_anns(self, ann_file):
        """Build ``data_infos`` from a COCOVID style annotation file.

        Also sets ``self.coco``, ``self.cat_ids``, ``self.cat2label``,
        ``self.vid_ids`` and ``self.img_ids`` (``None`` when all frames of a
        video are loaded as one sample).

        Args:
            ann_file (str): Path of annotation file.

        Returns:
            list: One image-info dict per key image, or - with
            ``load_all_frames`` - one list of image-info dicts per video whose
            first entry is repeated once at the front.
        """
        self.coco = CocoVID(ann_file)
        self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES)
        self.cat2label = {
            cat_id: label for label, cat_id in enumerate(self.cat_ids)
        }
        self.vid_ids = self.coco.get_vid_ids()
        self.img_ids = None if self.load_all_frames else []

        data_infos = []
        for vid_id in self.vid_ids:
            frame_img_ids = self.coco.get_img_ids_from_vid(vid_id)
            if self.key_img_sampler is not None:
                frame_img_ids = self.key_img_sampling(frame_img_ids,
                                                      **self.key_img_sampler)

            if not self.load_all_frames:
                # One sample per (sampled) key image.
                self.img_ids.extend(frame_img_ids)
                for frame_img_id in frame_img_ids:
                    frame_info = self.coco.load_imgs([frame_img_id])[0]
                    frame_info['filename'] = frame_info['file_name']
                    data_infos.append(frame_info)
                continue

            # One sample per video: the first frame is listed twice.
            clip_infos = self.coco.load_imgs(frame_img_ids)
            clip_infos = [clip_infos[0], *clip_infos]
            for frame_info in clip_infos:
                frame_info['filename'] = frame_info['file_name']
            data_infos.append(clip_infos)
        return data_infos

    def key_img_sampling(self, img_ids, interval=1):
        """Keep every ``interval``-th image id, starting with the first."""
        every_nth = slice(None, None, interval)
        return img_ids[every_nth]

    def ref_img_sampling(self,
                         img_info,
                         frame_range,
                         stride=1,
                         num_ref_imgs=1,
                         filter_key_img=True,
                         method='uniform',
                         return_key_img=True):
        """Sampling reference frames in the same video for key frame.

        Args:
            img_info (dict): The information of key frame.
            frame_range (List(int) | int): The sampling range of reference
                frames in the same video for key frame.
            stride (int): The sampling frame stride when sampling reference
                images. Default: 1.
            num_ref_imgs (int): The number of sampled reference images.
                Default: 1.
            filter_key_img (bool): If False, the key image will be in the
                sampling reference candidates, otherwise, it is exclude.
                Default: True.
            method (str): The sampling method. Options are 'uniform',
                'bilateral_uniform', 'test_with_adaptive_stride',
                'test_with_fix_stride'. 'uniform' denotes reference images are
                randomly sampled from the nearby frames of key frame.
                'bilateral_uniform' denotes reference images are randomly
                sampled from the two sides of the nearby frames of key frame.
                'test_with_adaptive_stride' is only used in testing, and
                denotes the sampling frame stride is equal to (video length /
                the number of reference images). test_with_fix_stride is only
                used in testing with sampling frame stride equalling to
                `stride`. Default: 'uniform'.
            return_key_img (bool): If True, the information of key frame is
                returned, otherwise, not returned. Default: True.

        Returns:
            list(dict) | None: `img_info` and the reference images information
            or only the reference images information. ``None`` is returned by
            the 'uniform' method when the number of candidate frames differs
            from ``num_ref_imgs``.

        Raises:
            TypeError: If ``frame_range`` is neither an int nor a list.
            NotImplementedError: If ``method`` is unknown.
        """
        assert isinstance(img_info, dict)
        if isinstance(frame_range, int):
            assert frame_range >= 0, 'frame_range can not be a negative value.'
            frame_range = [-frame_range, frame_range]
        elif isinstance(frame_range, list):
            assert len(frame_range) == 2, 'The length must be 2.'
            assert frame_range[0] <= 0 and frame_range[1] >= 0
            for i in frame_range:
                assert isinstance(i, int), 'Each element must be int.'
        else:
            raise TypeError('The type of frame_range must be int or list.')

        if 'test' in method and \
                (frame_range[1] - frame_range[0]) != num_ref_imgs:
            print_log(
                'Warning:'
                "frame_range[1] - frame_range[0] isn't equal to num_ref_imgs."
                'Set num_ref_imgs to frame_range[1] - frame_range[0].',
                logger=self.logger)
            # Apply the corrected value to this call as well, not only to the
            # stored config used by later calls.
            num_ref_imgs = frame_range[1] - frame_range[0]
            self.ref_img_sampler['num_ref_imgs'] = num_ref_imgs

        if (not self.load_as_video) or img_info.get('frame_id', -1) < 0 \
                or (frame_range[0] == 0 and frame_range[1] == 0):
            # No temporal context available: repeat the key frame.
            ref_img_infos = [img_info.copy() for _ in range(num_ref_imgs)]
        else:
            vid_id, img_id, frame_id = img_info['video_id'], img_info[
                'id'], img_info['frame_id']
            img_ids = self.coco.get_img_ids_from_vid(vid_id)
            left = max(0, frame_id + frame_range[0])
            right = min(frame_id + frame_range[1], len(img_ids) - 1)

            ref_img_ids = []
            if method == 'uniform':
                valid_ids = img_ids[left:right + 1]
                if filter_key_img and img_id in valid_ids:
                    valid_ids.remove(img_id)

                # Samples without exactly `num_ref_imgs` candidates are
                # rejected; the caller receives None.
                if num_ref_imgs != len(valid_ids):
                    return None

                ref_img_ids.extend(random.sample(valid_ids, num_ref_imgs))
            elif method == 'bilateral_uniform':
                assert num_ref_imgs % 2 == 0, \
                    'only support load even number of ref_imgs.'
                for mode in ['left', 'right']:
                    if mode == 'left':
                        valid_ids = img_ids[left:frame_id + 1]
                    else:
                        valid_ids = img_ids[frame_id:right + 1]
                    if filter_key_img and img_id in valid_ids:
                        valid_ids.remove(img_id)
                    num_samples = min(num_ref_imgs // 2, len(valid_ids))
                    sampled_inds = random.sample(valid_ids, num_samples)
                    ref_img_ids.extend(sampled_inds)
            elif method == 'test_with_adaptive_stride':
                # References are only produced for the first frame of a video.
                if frame_id == 0:
                    # NOTE(review): num_ref_imgs == 1 divides by zero here.
                    stride = float(len(img_ids) - 1) / (num_ref_imgs - 1)
                    for i in range(num_ref_imgs):
                        ref_id = round(i * stride)
                        ref_img_ids.append(img_ids[ref_id])
            elif method == 'test_with_fix_stride':
                if frame_id == 0:
                    # Left side is filled with copies of the first frame.
                    for i in range(frame_range[0], 1):
                        ref_img_ids.append(img_ids[0])
                    for i in range(1, frame_range[1] + 1):
                        ref_id = min(round(i * stride), len(img_ids) - 1)
                        ref_img_ids.append(img_ids[ref_id])
                elif frame_id % stride == 0:
                    ref_id = min(
                        round(frame_id + frame_range[1] * stride),
                        len(img_ids) - 1)
                    ref_img_ids.append(img_ids[ref_id])
                # frame_range is always a two-element list at this point.
                img_info['num_left_ref_imgs'] = abs(frame_range[0])
                img_info['frame_stride'] = stride
            else:
                raise NotImplementedError

            ref_img_infos = []
            for ref_img_id in ref_img_ids:
                ref_img_info = self.coco.load_imgs([ref_img_id])[0]
                ref_img_info['filename'] = ref_img_info['file_name']
                ref_img_infos.append(ref_img_info)
            ref_img_infos = sorted(ref_img_infos, key=lambda i: i['frame_id'])

        if return_key_img:
            return [img_info, *ref_img_infos]
        else:
            return ref_img_infos

    def get_ann_info(self, img_info):
        """Collect and parse the COCO annotations of one image.

        Args:
            img_info (dict): Information of image; its ``'id'`` selects the
                annotations.

        Returns:
            dict: Annotation information of `img_info`.
        """
        # Restrict the lookup to this image and to the dataset's categories.
        matching_ids = self.coco.get_ann_ids(
            img_ids=[img_info['id']], cat_ids=self.cat_ids)
        raw_anns = self.coco.load_anns(matching_ids)
        return self._parse_ann_info(img_info, raw_anns)

    def prepare_results(self, img_info):
        """Assemble the pipeline input dict for a single image.

        The dict holds ``img_info``, optionally ``ann_info`` (training, or
        testing with ``test_load_ann``) and ``proposals``, whatever the
        parent's ``pre_pipeline`` adds, and the ``is_video_data`` flag.
        """
        results = {'img_info': img_info}
        wants_annotations = (not self.test_mode) or self.test_load_ann
        if wants_annotations:
            results['ann_info'] = self.get_ann_info(img_info)
        if self.proposals is not None:
            # Proposals are aligned with `self.img_ids`.
            position = self.img_ids.index(img_info['id'])
            results['proposals'] = self.proposals[position]

        super().pre_pipeline(results)
        results['is_video_data'] = self.load_as_video
        return results

    def prepare_data(self, idx):
        """Run the pipeline on the sample at ``idx``.

        Args:
            idx (int): Index of data.

        Returns:
            The pipeline output for the sample, or ``None`` when reference
            sampling rejects it. The pipeline receives a list of result
            dicts when reference frames or whole videos are loaded, a single
            dict otherwise.
        """
        entry = self.data_infos[idx]
        if self.ref_img_sampler is not None:
            sampled_infos = self.ref_img_sampling(entry, **self.ref_img_sampler)
            if sampled_infos is None:
                return None
            results = list(map(self.prepare_results, sampled_infos))
        elif self.load_all_frames:
            # `entry` is already the list of frames of one video.
            results = list(map(self.prepare_results, entry))
        else:
            results = self.prepare_results(entry)
        return self.pipeline(results)

    def prepare_train_img(self, idx):
        """Fetch one training sample; delegates to ``prepare_data``.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Training data and annotations after pipeline with new keys
            introduced by pipeline.
        """
        train_sample = self.prepare_data(idx)
        return train_sample

    def prepare_test_img(self, idx):
        """Fetch one testing sample; delegates to ``prepare_data``.

        Args:
            idx (int): Index of data.

        Returns:
            dict: Testing data after pipeline with new keys introduced by
            pipeline.
        """
        test_sample = self.prepare_data(idx)
        return test_sample

    def _parse_ann_info(self, img_info, ann_info):
        """Parse bbox and mask annotations.

        Args:
            img_anfo (dict): Information of image.
            ann_info (list[dict]): Annotation information of image.

        Returns:
            dict: A dict containing the following keys: bboxes, bboxes_ignore,
            labels, instance_ids, masks, seg_map. "masks" are raw
            annotations and not decoded into binary masks.
        """
        gt_bboxes = []
        gt_labels = []
        gt_bboxes_ignore = []
        gt_masks = []
        gt_instance_ids = []

        for i, ann in enumerate(ann_info):
            if ann.get('ignore', False):
                continue
            x1, y1, w, h = ann['bbox']
            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
            if inter_w * inter_h == 0:
                continue
            if ann['area'] <= 0 or w < 1 or h < 1:
                continue
            if ann['category_id'] not in self.cat_ids:
                continue
            bbox = [x1, y1, x1 + w, y1 + h]
            if ann.get('iscrowd', False):
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_labels.append(self.cat2label[ann['category_id']])
                if 'segmentation' in ann:
                    gt_masks.append(ann['segmentation'])
                if 'instance_id' in ann:
                    gt_instance_ids.append(ann['instance_id'])

        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        seg_map = img_info['filename'].replace('jpg', 'png')

        ann = dict(
            bboxes=gt_bboxes,
            labels=gt_labels,
            bboxes_ignore=gt_bboxes_ignore,
            masks=gt_masks,
            seg_map=seg_map)

        if self.load_as_video:
            ann['instance_ids'] = np.array(gt_instance_ids).astype(np.int)
        else:
            ann['instance_ids'] = np.arange(len(gt_labels))

        return ann

    def evaluate(self,
                 results,
                 metric=['bbox', 'track'],
                 logger=None,
                 bbox_kwargs=dict(
                     classwise=False,
                     proposal_nums=(100, 300, 1000),
                     iou_thrs=None,
                     metric_items=None),
                 track_kwargs=dict(
                     iou_thr=0.5,
                     ignore_iof_thr=0.5,
                     ignore_by_classes=False,
                     nproc=4)):
        """Evaluate detection results; tracking evaluation is not available.

        Args:
            results (dict | list): Testing results of the dataset. A dict is
                read through the keys 'track_bboxes', 'det_bboxes' and
                'det_masks'; a list is forwarded unchanged to the parent
                class evaluation.
            metric (str | list[str]): Metrics to be evaluated. Options are
                'bbox', 'segm', 'track'.
            logger (logging.Logger | str | None): Logger forwarded to the
                parent class evaluation. Default: None.
            bbox_kwargs (dict): Keyword arguments forwarded to the parent
                class ``evaluate`` for the 'bbox' / 'segm' metrics.
            track_kwargs (dict): Configuration intended for CLEAR MOT
                evaluation. Currently unused because the track branch
                raises before any evaluation happens.

        Returns:
            dict[str, float]: Evaluation results returned by the parent class
            for the requested 'bbox' / 'segm' metrics.

        Raises:
            TypeError: If ``metric`` is neither a list nor a str, or if
                ``results`` is neither a dict nor a list when 'bbox'/'segm'
                is requested.
            KeyError: If a requested metric is not supported.
            NotImplementedError: If 'track' is requested.
        """
        if isinstance(metric, str):
            metrics = [metric]
        elif isinstance(metric, list):
            metrics = metric
        else:
            raise TypeError('metric must be a list or a str.')

        allowed_metrics = ['bbox', 'segm', 'track']
        for name in metrics:
            if name not in allowed_metrics:
                raise KeyError(f'metric {name} is not supported.')

        eval_results = dict()
        if 'track' in metrics:
            assert len(self.data_infos) == len(results['track_bboxes'])
            # A video starts at every frame whose frame_id is 0; the final
            # sentinel closes the last video.
            inds = [
                pos for pos, info in enumerate(self.data_infos)
                if info['frame_id'] == 0
            ]
            inds.append(len(self.data_infos))
            video_spans = list(zip(inds[:-1], inds[1:]))

            track_bboxes = [
                results['track_bboxes'][start:stop]
                for start, stop in video_spans
            ]
            ann_infos = [self.get_ann_info(info) for info in self.data_infos]
            ann_infos = [ann_infos[start:stop] for start, stop in video_spans]
            # The per-video results/annotations above are what a CLEAR MOT
            # evaluator (eval_mot) would consume once it is wired in.
            raise NotImplementedError("eval_mot is not implemented yet.")

        # evaluate for detectors without tracker
        super_metrics = [
            name for name in metrics if name in ['bbox', 'segm']
        ]
        if not super_metrics:
            return eval_results

        if isinstance(results, dict):
            if 'bbox' in super_metrics and 'segm' in super_metrics:
                # Pair every frame's boxes with its masks.
                super_results = list(
                    zip(results['det_bboxes'], results['det_masks']))
            else:
                super_results = results['det_bboxes']
        elif isinstance(results, list):
            super_results = results
        else:
            raise TypeError('Results must be a dict or a list.')

        eval_results.update(super().evaluate(
            results=super_results,
            metric=super_metrics,
            logger=logger,
            **bbox_kwargs))
        return eval_results

    def __repr__(self):
        """Return a summary with the image count and a per-class table.

        The table lists, for every class index, how many annotated instances
        carry that label, plus a trailing "background" entry counting the
        images that have no labels at all.
        """
        split_name = 'Test' if self.test_mode else 'Train'
        summary = (f'\n{self.__class__.__name__} {split_name} dataset '
                   f'with number of images {len(self)}, '
                   f'and instance counts: \n')
        if self.CLASSES is None:
            return summary + 'Category names are not provided. \n'

        # One slot per class plus a final slot for label-free images.
        instance_count = np.zeros(len(self.CLASSES) + 1).astype(int)
        for idx in range(len(self)):
            labels = self.get_ann_info(self.data_infos[idx])['labels']
            present, occurrences = np.unique(labels, return_counts=True)
            if len(present) == 0:
                # background is the last index
                instance_count[-1] += 1
            else:
                instance_count[present] += occurrences

        # Flatten to [name, count, name, count, ...] and cut into rows of
        # five (name, count) pairs.
        cells = []
        for cls, count in enumerate(instance_count):
            if cls < len(self.CLASSES):
                cells += [f'{cls} [{self.CLASSES[cls]}]', f'{count}']
            else:
                cells += ['-1 background', f'{count}']

        complete = len(cells) - len(cells) % 10
        table_data = [['category', 'count'] * 5]
        for start in range(0, complete, 10):
            table_data.append(cells[start:start + 10])

        remainder = cells[complete:]
        if len(remainder) >= 2:
            # Drop a trailing zero count from the partial last row.
            if remainder[-1] == '0':
                remainder = remainder[:-2]
            if len(remainder) >= 2:
                table_data.append([])
                table_data.append(remainder)

        return summary + AsciiTable(table_data).table


================================================
FILE: mmtrack/datasets/parsers/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .coco_video_parser import CocoVID

__all__ = ['CocoVID']


================================================
FILE: mmtrack/datasets/parsers/coco_video_parser.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from collections import defaultdict

import numpy as np
from mmdet.datasets.api_wrappers import COCO
from pycocotools.coco import _isArrayLike


class CocoVID(COCO):
    """Inherit official COCO class in order to parse the annotations of bbox-
    related video tasks.

    On top of the image-level COCO indices this class builds video-level
    lookups: ``videos``, ``vidToImgs``, ``vidToInstances`` and
    ``instancesToImgs``.

    Args:
        annotation_file (str): location of annotation file. Defaults to None.
        load_img_as_vid (bool): If True, convert image data to video data,
            which means each image is converted to a video. Defaults to False.
    """

    def __init__(self, annotation_file=None, load_img_as_vid=False):
        assert annotation_file, 'Annotation file must be provided.'
        # Must be set before the parent constructor runs, because
        # ``createIndex`` reads it.
        self.load_img_as_vid = load_img_as_vid
        super(CocoVID, self).__init__(annotation_file=annotation_file)

    def convert_img_to_vid(self, dataset):
        """Convert image data to video data.

        Every image becomes a single-frame video (``video_id`` equal to the
        image id, ``frame_id`` 0) and every annotation becomes its own
        instance. The passed ``dataset`` is modified in place.

        Args:
            dataset (dict): COCO-style dataset dict.

        Returns:
            dict: The same ``dataset`` object with video fields added.
        """
        if 'images' in dataset:
            videos = []
            for img in dataset['images']:
                videos.append(dict(id=img['id'], name=img['file_name']))
                img['video_id'] = img['id']
                img['frame_id'] = 0
            dataset['videos'] = videos

        if 'annotations' in dataset:
            for ann in dataset['annotations']:
                ann['video_id'] = ann['image_id']
                ann['instance_id'] = ann['id']
        return dataset

    def createIndex(self, use_ext=False):
        """Create the image-level and video-level lookup tables.

        Args:
            use_ext (bool): Unused here; kept so the signature stays
                compatible with callers that pass it.
        """
        print('creating index...')
        anns, cats, imgs, vids = {}, {}, {}, {}
        (imgToAnns, catToImgs, vidToImgs, vidToInstances,
         instancesToImgs) = defaultdict(list), defaultdict(list), defaultdict(
            list), defaultdict(list), defaultdict(list)

        if 'videos' not in self.dataset and self.load_img_as_vid:
            self.dataset = self.convert_img_to_vid(self.dataset)

        if 'videos' in self.dataset:
            for video in self.dataset['videos']:
                vids[video['id']] = video

        if 'annotations' in self.dataset:
            for ann in self.dataset['annotations']:
                imgToAnns[ann['image_id']].append(ann)
                anns[ann['id']] = ann
                if 'instance_id' in ann:
                    instancesToImgs[ann['instance_id']].append(ann['image_id'])
                    # Record each instance once per video.
                    if 'video_id' in ann and \
                            ann['instance_id'] not in \
                            vidToInstances[ann['video_id']]:
                        vidToInstances[ann['video_id']].append(
                            ann['instance_id'])

        if 'images' in self.dataset:
            for img in self.dataset['images']:
                vidToImgs[img['video_id']].append(img)
                imgs[img['id']] = img

        if 'categories' in self.dataset:
            for cat in self.dataset['categories']:
                cats[cat['id']] = cat

        if 'annotations' in self.dataset and 'categories' in self.dataset:
            for ann in self.dataset['annotations']:
                catToImgs[ann['category_id']].append(ann['image_id'])

        print('index created!')

        self.anns = anns
        self.imgToAnns = imgToAnns
        self.catToImgs = catToImgs
        self.imgs = imgs
        self.cats = cats
        self.videos = vids
        self.vidToImgs = vidToImgs
        self.vidToInstances = vidToInstances
        self.instancesToImgs = instancesToImgs

    def get_vid_ids(self, vidIds=[]):
        """Get video ids that satisfy given filter conditions.

        Default return all video ids.

        Args:
            vidIds (list[int]): The given video ids. Defaults to [].

        Returns:
            list[int]: Video ids. When ``vidIds`` is given, duplicates are
            removed and the order is not preserved (a set is used).
        """
        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]

        if len(vidIds) == 0:
            ids = self.videos.keys()
        else:
            ids = set(vidIds)

        return list(ids)

    def get_img_ids_from_vid(self, vidId):
        """Get image ids from given video id.

        Args:
            vidId (int): The given video id.

        Returns:
            list[int]: Image ids of given video id, placed at the position
            given by each image's ``frame_id``. Positions that no image
            claims keep their initial value 0.
        """
        img_infos = self.vidToImgs[vidId]
        ids = list(np.zeros([len(img_infos)], dtype=np.int64))
        for img_info in img_infos:
            ids[img_info['frame_id']] = img_info['id']
        return ids

    def get_ins_ids_from_vid(self, vidId):
        """Get instance ids from given video id.

        Args:
            vidId (int): The given video id.

        Returns:
            list[int]: Instance ids of given video id.
        """
        return self.vidToInstances[vidId]

    def get_img_ids_from_ins_id(self, insId):
        """Get image ids from given instance id.

        Args:
            insId (int): The given instance id.

        Returns:
            list[int]: Image ids of given instance id.
        """
        return self.instancesToImgs[insId]

    def load_vids(self, ids=[]):
        """Get video information of given video ids.

        Default return all videos information.

        Args:
            ids (list[int] | int): The given video ids, or a single video id
                (a Python int or a NumPy integer). Defaults to [].

        Returns:
            list[dict] | None: List of video information. ``None`` is
            returned when ``ids`` is neither array-like nor an integer.
        """
        if _isArrayLike(ids):
            return [self.videos[vid_id] for vid_id in ids]
        # isinstance (rather than an exact type comparison) so that NumPy
        # integer scalars are accepted as a single id as well.
        if isinstance(ids, (int, np.integer)):
            return [self.videos[ids]]
        return None


================================================
FILE: mmtrack/datasets/youtube_vis_dataset.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os.path
import os.path as osp
import tempfile
import zipfile

import mmcv
import numpy as np
from mmcv.utils import print_log
from mmdet.datasets import DATASETS

from .coco_video_dataset import CocoVideoDataset


def results2outs(bbox_results=None,
                 mask_results=None,
                 mask_shape=None,
                 **kwargs):
    """Flatten per-category results back into model-forward style arrays.

    Args:
        bbox_results (list[np.ndarray]): One array per category holding the
            boxes of that category, with 5 columns, or 6 columns when the
            first column is a track id.
        mask_results (list[list[np.ndarray]]): Each outer list denotes masks
            of one category. Each inner list denotes one mask belonging to
            the category. Each mask has shape (h, w).
        mask_shape (tuple[int]): The shape (h, w) of mask. Required when
            ``mask_results`` is given.
        **kwargs: Accepted and ignored.

    Returns:
        dict: Results gathered over all categories. It may contain the keys:

        - labels (np.ndarray): shape (n, ), category index of each box
        - bboxes (np.ndarray): shape (n, 5)
        - ids (np.ndarray): shape (n, ), only for 6-column inputs
        - masks (np.ndarray): shape (n, h, w)

    Raises:
        NotImplementedError: If the boxes have neither 5 nor 6 columns.
    """
    outputs = dict()

    if bbox_results is not None:
        # Every box inherits the index of the category list it came from.
        outputs['labels'] = np.array(
            [
                cls_idx for cls_idx, dets in enumerate(bbox_results)
                for _ in range(dets.shape[0])
            ],
            dtype=np.int64)

        stacked = np.concatenate(bbox_results, axis=0).astype(np.float32)
        num_cols = stacked.shape[1]
        if num_cols == 6:
            # Leading column is the track id; the rest is the box itself.
            track_ids = stacked[:, 0].astype(np.int64)
            outputs['bboxes'] = stacked[:, 1:]
            outputs['ids'] = track_ids
        elif num_cols == 5:
            outputs['bboxes'] = stacked
        else:
            raise NotImplementedError(
                f'Not supported bbox shape: (N, {num_cols})')

    if mask_results is not None:
        assert mask_shape is not None
        mask_height, mask_width = mask_shape
        flat_masks = mmcv.concat_list(mask_results)
        if len(flat_masks) != 0:
            outputs['masks'] = np.stack(flat_masks, axis=0)
        else:
            outputs['masks'] = np.zeros(
                (0, mask_height, mask_width)).astype(bool)

    return outputs


@DATASETS.register_module()
class YouTubeVISDataset(CocoVideoDataset):
    """YouTube VIS dataset for video instance segmentation.

    Args:
        dataset_version (str): Which class list to use, '2019' or '2021'.
        *args: Forwarded to the parent dataset constructor.
        **kwargs: Forwarded to the parent dataset constructor.
    """

    CLASSES_2019_version = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
                            'sedan', 'ape', 'dog', 'snake', 'monkey',
                            'hand', 'rabbit', 'duck', 'cat', 'cow',
                            'fish', 'train', 'horse', 'turtle', 'bear',
                            'motorbike', 'giraffe', 'leopard', 'fox', 'deer',
                            'owl', 'surfboard', 'airplane', 'truck', 'zebra',
                            'tiger', 'elephant', 'snowboard', 'boat', 'shark',
                            'mouse', 'frog', 'eagle', 'earless_seal', 'tennis_racket')

    CLASSES_2021_version = ('airplane', 'bear', 'bird', 'boat', 'car', 'cat',
                            'cow', 'deer', 'dog', 'duck', 'earless_seal',
                            'elephant', 'fish', 'flying_disc', 'fox', 'frog',
                            'giant_panda', 'giraffe', 'horse', 'leopard',
                            'lizard', 'monkey', 'motorbike', 'mouse', 'parrot',
                            'person', 'rabbit', 'shark', 'skateboard', 'snake',
                            'snowboard', 'squirrel', 'surfboard',
                            'tennis_racket', 'tiger', 'train', 'truck',
                            'turtle', 'whale', 'zebra')

    def __init__(self, dataset_version, *args, **kwargs):
        # CLASSES has to be in place before the parent constructor runs.
        self.set_dataset_classes(dataset_version)
        super().__init__(*args, **kwargs)

    @classmethod
    def set_dataset_classes(cls, dataset_version):
        """Select the class list matching ``dataset_version``.

        NOTE: ``CLASSES`` is assigned on the class itself, so the choice is
        shared by every instance of this class.

        Args:
            dataset_version (str): '2019' or '2021'.

        Raises:
            NotImplementedError: If the version is not supported.
        """
        if dataset_version == '2019':
            cls.CLASSES = cls.CLASSES_2019_version
        elif dataset_version == '2021':
            cls.CLASSES = cls.CLASSES_2021_version
        else:
            raise NotImplementedError('Not supported YouTubeVIS dataset '
                                      f'version: {dataset_version}')

    def format_results(self,
                       _results,
                       resfile_path=None,
                       metrics=['track_segm']):
        """Format the results to a zip file (standard format for YouTube-VIS
        Challenge).

        Args:
            _results (list): Per-frame testing results. Item ``[0]`` of every
                entry is used as the track bboxes of that frame and item
                ``[1]`` as its track masks.
            resfile_path (str, optional): Directory to save the formatted
                results in. A temporary directory is created when it is
                None. Defaults to None.
            metrics (str | list[str], optional): The results of the specific
                metrics will be formatted; must contain 'track_segm'.
                Defaults to ['track_segm'].

        Returns:
            tuple: (resfiles, tmp_dir), resfiles is the path of the result
            json file, tmp_dir is the temporal directory created for saving
            files (None when ``resfile_path`` was given).
        """
        results = {
            'track_bboxes': [item[0] for item in _results],
            'track_masks': [item[1] for item in _results]
        }
        # Flatten the per-entry infos, skipping the first element of each
        # entry (presumably a non-frame header item -- TODO confirm against
        # how ``self.data_infos`` is built).
        data_infos = []
        for item in self.data_infos:
            data_infos.extend(item[1:])
        if isinstance(metrics, str):
            metrics = [metrics]
        assert 'track_segm' in metrics
        if resfile_path is None:
            tmp_dir = tempfile.TemporaryDirectory()
            resfile_path = tmp_dir.name
        else:
            tmp_dir = None
            os.makedirs(resfile_path, exist_ok=True)
        resfiles = osp.join(resfile_path, 'results.json')

        # A video starts at every frame whose frame_id is 0; the trailing
        # sentinel closes the last video.
        inds = [i for i, _ in enumerate(data_infos) if _['frame_id'] == 0]
        num_vids = len(inds)
        assert num_vids == len(self.vid_ids)
        inds.append(len(data_infos))
        vid_infos = self.coco.load_vids(self.vid_ids)

        json_results = []
        for i in range(num_vids):
            video_id = vid_infos[i]['id']
            # collect data for each instances in a video.
            collect_data = dict()
            for frame_id, (bbox_res, mask_res) in enumerate(
                    zip(results['track_bboxes'][inds[i]:inds[i + 1]],
                        results['track_masks'][inds[i]:inds[i + 1]])):
                outs_track = results2outs(bbox_results=bbox_res)
                bboxes = outs_track['bboxes']
                labels = outs_track['labels']
                ids = outs_track['ids']
                masks = mmcv.concat_list(mask_res)
                assert len(masks) == len(bboxes)
                for j, track_id in enumerate(ids):
                    if track_id not in collect_data:
                        collect_data[track_id] = dict(
                            category_ids=[], scores=[], segmentations=dict())
                    track_data = collect_data[track_id]
                    track_data['category_ids'].append(labels[j])
                    track_data['scores'].append(bboxes[j][4])
                    # json cannot serialise bytes, so decode the counts.
                    if isinstance(masks[j]['counts'], bytes):
                        masks[j]['counts'] = masks[j]['counts'].decode()
                    track_data['segmentations'][frame_id] = masks[j]

            # transform the collected data into official format
            for track_data in collect_data.values():
                output = dict()
                output['video_id'] = video_id
                output['score'] = np.array(track_data['scores']).mean().item()
                # majority voting for sequence category
                output['category_id'] = np.bincount(
                    np.array(track_data['category_ids'])).argmax().item() + 1
                # One entry per frame of the video; None where the instance
                # has no mask in that frame.
                output['segmentations'] = [
                    track_data['segmentations'].get(frame_id)
                    for frame_id in range(inds[i + 1] - inds[i])
                ]
                json_results.append(output)
        mmcv.dump(json_results, resfiles)

        # zip the json file in order to submit to the test server.
        zip_file_name = osp.join(resfile_path, 'submission_file.zip')
        # The context manager closes the archive even if writing fails.
        with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zf:
            print_log(f"zip the 'results.json' into '{zip_file_name}', "
                      'please submmit the zip file to the test server')
            zf.write(resfiles, 'results.json')

        return resfiles, tmp_dir


================================================
FILE: mmtrack/pipelines/__init__.py
================================================
from .formatting import *
from .loading import *
from .test_time_aug import *
from .transforms import *

================================================
FILE: mmtrack/pipelines/formatting.py
================================================
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import to_tensor


@PIPELINES.register_module()
class ConcatVideoReferences(object):
    """Merge all reference-frame dicts into a single dict.

    The first dict of the input list is passed through untouched. Every
    following dict is folded into the second dict of the list, which is
    modified in place:

    - 'img' arrays are stacked along a new trailing axis;
    - 'img_metas' and 'gt_masks' are gathered into lists;
    - 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels' and
      'gt_instance_ids' get a leading column holding the 0-based index of
      the reference frame they came from and are concatenated row-wise;
    - 'gt_semantic_seg' gets two trailing axes and is stacked along the last.

    Args:
        results (list[dict]): List of dict that contain keys such as 'img',
            'img_metas', 'gt_masks','proposals', 'gt_bboxes',
            'gt_bboxes_ignore', 'gt_labels','gt_semantic_seg',
            'gt_instance_ids'.

    Returns:
        list[dict]: The first dict of outputs is the same as the first
        dict of `results`. The second dict of outputs (present only when the
        input has at least two dicts) concats the dicts in `results[1:]`.
    """

    def __call__(self, results):
        assert isinstance(results, list), 'results must be list'
        outs = results[:1]
        integer_keys = ('gt_labels', 'gt_instance_ids')
        array_keys = ('proposals', 'gt_bboxes', 'gt_bboxes_ignore',
                      'gt_labels', 'gt_instance_ids')

        for ref_idx, ref in enumerate(results[1:]):
            # The first reference dict becomes the accumulator (outs[1]).
            is_first = ref_idx == 0

            if 'img' in ref:
                frame = ref['img']
                if len(frame.shape) < 3:
                    frame = np.expand_dims(frame, -1)
                frame = np.expand_dims(frame, -1)
                if is_first:
                    ref['img'] = frame
                else:
                    outs[1]['img'] = np.concatenate(
                        (outs[1]['img'], frame), axis=-1)

            for key in ('img_metas', 'gt_masks'):
                if key not in ref:
                    continue
                if is_first:
                    ref[key] = [ref[key]]
                else:
                    outs[1][key].append(ref[key])

            for key in array_keys:
                if key not in ref:
                    continue
                rows = ref[key]
                if rows.ndim == 1:
                    rows = rows[:, None]
                column_dtype = int if key in integer_keys else np.float32
                frame_column = np.full(
                    (rows.shape[0], 1), ref_idx, dtype=column_dtype)
                rows = np.concatenate((frame_column, rows), axis=1)
                if is_first:
                    ref[key] = rows
                else:
                    outs[1][key] = np.concatenate(
                        (outs[1][key], rows), axis=0)

            if 'gt_semantic_seg' in ref:
                seg = ref['gt_semantic_seg'][..., None, None]
                if is_first:
                    ref['gt_semantic_seg'] = seg
                else:
                    outs[1]['gt_semantic_seg'] = np.concatenate(
                        (outs[1]['gt_semantic_seg'], seg), axis=-1)

            if is_first:
                outs.append(ref)
        return outs


@PIPELINES.register_module()
class ConcatVideos(object):
    """Merge every frame dict of a clip, the first one included, into one.

    All dicts of the input list are folded into the first dict, which is
    modified in place:

    - 'img' arrays are stacked along a new trailing axis;
    - 'img_metas' and 'gt_masks' are gathered into lists;
    - 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels' and
      'gt_instance_ids' get a leading column holding the 0-based index of
      the frame they came from and are concatenated row-wise;
    - 'gt_semantic_seg' gets two trailing axes and is stacked along the last.

    Args:
        results (list[dict]): List of dict that contain keys such as 'img',
            'img_metas', 'gt_masks','proposals', 'gt_bboxes',
            'gt_bboxes_ignore', 'gt_labels','gt_semantic_seg',
            'gt_instance_ids'.

    Returns:
        list[dict]: A one-element list holding the merged dict.
    """

    def __call__(self, results):
        assert isinstance(results, list), 'results must be list'
        outs = results[:1]
        integer_keys = ('gt_labels', 'gt_instance_ids')
        array_keys = ('proposals', 'gt_bboxes', 'gt_bboxes_ignore',
                      'gt_labels', 'gt_instance_ids')

        for frame_idx, frame_res in enumerate(results):
            # The first frame dict becomes the accumulator (outs[1]).
            is_first = frame_idx == 0

            if 'img' in frame_res:
                frame = frame_res['img']
                if len(frame.shape) < 3:
                    frame = np.expand_dims(frame, -1)
                frame = np.expand_dims(frame, -1)
                if is_first:
                    frame_res['img'] = frame
                else:
                    outs[1]['img'] = np.concatenate(
                        (outs[1]['img'], frame), axis=-1)

            for key in ('img_metas', 'gt_masks'):
                if key not in frame_res:
                    continue
                if is_first:
                    frame_res[key] = [frame_res[key]]
                else:
                    outs[1][key].append(frame_res[key])

            for key in array_keys:
                if key not in frame_res:
                    continue
                rows = frame_res[key]
                if rows.ndim == 1:
                    rows = rows[:, None]
                column_dtype = int if key in integer_keys else np.float32
                frame_column = np.full(
                    (rows.shape[0], 1), frame_idx, dtype=column_dtype)
                rows = np.concatenate((frame_column, rows), axis=1)
                if is_first:
                    frame_res[key] = rows
                else:
                    outs[1][key] = np.concatenate(
                        (outs[1][key], rows), axis=0)

            if 'gt_semantic_seg' in frame_res:
                seg = frame_res['gt_semantic_seg'][..., None, None]
                if is_first:
                    frame_res['gt_semantic_seg'] = seg
                else:
                    outs[1]['gt_semantic_seg'] = np.concatenate(
                        (outs[1]['gt_semantic_seg'], seg), axis=-1)

            if is_first:
                outs.append(frame_res)

        # Only the accumulated dict is handed on.
        return [outs[1]]


@PIPELINES.register_module()
class MultiImagesToTensor(object):
    """Convert the images of a list of result dicts to tensors.

    1. Transpose and convert image/multi-images to Tensor.
    2. Flatten the list into one dict: keys of the first dict are kept as
    they are, keys of the second dict are prefixed with ``ref_prefix``.

    Args:
        ref_prefix (str): The prefix of key added to the second dict of inputs.
            Defaults to 'ref'.
    """

    def __init__(self, ref_prefix='ref'):
        self.ref_prefix = ref_prefix

    def __call__(self, results):
        """Convert every dict and merge them into a single dict.

        Args:
            results (list[dict]): List of two dicts.

        Returns:
            dict: Each key in the first dict of `results` remains unchanged.
            Each key in the second dict of `results` adds `self.ref_prefix`
            as prefix. The second dict is merged only when exactly two dicts
            are given.
        """
        converted = [self.images_to_tensor(item) for item in results]

        data = dict(converted[0])
        if len(converted) == 2:
            data.update({
                f'{self.ref_prefix}_{key}': value
                for key, value in converted[1].items()
            })
        return data

    def images_to_tensor(self, results):
        """Transpose and convert images/multi-images to Tensor.

        The dict is modified in place and returned.
        """
        if 'img' in results:
            img = results['img']
            # (H, W, 3) to (3, H, W), otherwise (H, W, 3, N) to (N, 3, H, W)
            axes = (2, 0, 1) if len(img.shape) == 3 else (3, 2, 0, 1)
            results['img'] = to_tensor(
                np.ascontiguousarray(img.transpose(*axes)))
        if 'proposals' in results:
            results['proposals'] = to_tensor(results['proposals'])
        if 'img_metas' in results:
            results['img_metas'] = DC(results['img_metas'], cpu_only=True)
        return results


@PIPELINES.register_module()
class SeqDefaultFormatBundle(object):
    """Sequence Default formatting bundle.

    It simplifies the pipeline of formatting common fields, including "img",
    "img_metas", "proposals", "gt_bboxes", "gt_instance_ids",
    "gt_match_indices", "gt_bboxes_ignore", "gt_labels", "gt_masks" and
    "gt_semantic_seg". These fields are formatted as follows.

    - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True)
    - img_metas: (1) to DataContainer (cpu_only=True)
    - proposals: (1) to tensor, (2) to DataContainer
    - gt_bboxes: (1) to tensor, (2) to DataContainer
    - gt_instance_ids: (1) to tensor, (2) to DataContainer
    - gt_match_indices: (1) to tensor, (2) to DataContainer
    - gt_bboxes_ignore: (1) to tensor, (2) to DataContainer
    - gt_labels: (1) to tensor, (2) to DataContainer
    - gt_masks: (1) to DataContainer (cpu_only=True)
    - gt_semantic_seg: (1) unsqueeze dim-0 (2) to tensor, \
                       (3) to DataContainer (stack=True)

    Args:
        ref_prefix (str | None): The prefix of key added to the second dict
            of input list. With None only the first dict is returned.
            Defaults to 'ref'.
    """

    def __init__(self, ref_prefix='ref'):
        self.ref_prefix = ref_prefix

    def __call__(self, results):
        """Format every dict of the list and merge them into one dict.

        Args:
            results (list[dict]): List of one or two dicts.

        Returns:
            dict: The result dict contains the data that is formatted with
            default bundle. Each key in the second dict of the input list
            adds `self.ref_prefix` as prefix. An empty dict is returned when
            `self.ref_prefix` is neither 'ref' nor None.
        """
        formatted = [self.default_format_bundle(item) for item in results]

        prefix = self.ref_prefix
        if prefix != 'ref' and prefix is not None:
            return {}

        # origin frames
        data = dict(formatted[0])
        # reference frames, only in 'ref' mode and only if they exist
        if prefix == 'ref' and len(formatted) != 1:
            for key, value in formatted[1].items():
                data[f'{prefix}_{key}'] = value
        return data

    def default_format_bundle(self, results):
        """Transform and format common fields in results.

        Args:
            results (dict): Result dict contains the data to convert. It is
                modified in place.

        Returns:
            dict: The result dict contains the data that is formatted with
            default bundle.
        """
        if 'img' in results:
            img = results['img']
            # Channel-first for one image, otherwise the trailing frame axis
            # is moved to the front as well.
            axes = (2, 0, 1) if len(img.shape) == 3 else (3, 2, 0, 1)
            img = np.ascontiguousarray(img.transpose(*axes))
            results['img'] = DC(to_tensor(img), stack=True)

        tensor_keys = ('proposals', 'gt_bboxes', 'gt_bboxes_ignore',
                       'gt_labels', 'gt_instance_ids', 'gt_match_indices')
        for key in tensor_keys:
            if key in results:
                results[key] = DC(to_tensor(results[key]))

        for key in ('img_metas', 'gt_masks'):
            if key not in results:
                continue
            results[key] = DC(results[key], cpu_only=True)

        if 'gt_semantic_seg' in results:
            seg = results['gt_semantic_seg']
            if len(seg.shape) != 2:
                seg = np.ascontiguousarray(seg.transpose(3, 2, 0, 1))
            else:
                seg = seg[None, ...]
            results['gt_semantic_seg'] = DC(to_tensor(seg), stack=True)
        return results

    def __repr__(self):
        return type(self).__name__


@PIPELINES.register_module()
class VideoCollect(object):
    """Gather the requested keys and the per-frame meta information.

    Args:
        keys (Sequence[str]): Keys of results to be collected in ``data``.
        meta_keys (str | tuple[str], optional): Extra meta keys appended to
            ``default_meta_keys``. Defaults to None.
        reject_empty (bool): If True, ``__call__`` returns None when the
            first frame has an empty ``gt_labels``. Defaults to False.
        num_ref_imgs (int): When positive, ``__call__`` raises
            ``NotImplementedError`` unless it receives exactly
            ``num_ref_imgs + 1`` frames. Defaults to 0.
        default_meta_keys (tuple): Default meta keys. Defaults to ('filename',
            'ori_filename', 'ori_shape', 'img_shape', 'pad_shape',
            'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg',
            'video_id', 'frame_id', 'is_video_data', 'no_obj_class').
    """

    def __init__(self,
                 keys,
                 meta_keys=None,
                 reject_empty=False,
                 num_ref_imgs=0,
                 # no_obj_class is added for handling non-0  no-obj class
                 default_meta_keys=('filename', 'ori_filename', 'ori_shape',
                                    'img_shape', 'pad_shape', 'scale_factor',
                                    'flip', 'flip_direction', 'img_norm_cfg',
                                    'video_id',
                                    'frame_id', 'is_video_data', 'no_obj_class')):
        self.keys = keys
        self.reject_empty = reject_empty
        self.num_ref_imgs = num_ref_imgs

        self.meta_keys = default_meta_keys
        if isinstance(meta_keys, str):
            # A single key is promoted to a one-element tuple.
            meta_keys = (meta_keys,)
        elif meta_keys is not None:
            assert isinstance(meta_keys, tuple), \
                'meta_keys must be str or tuple'
        if meta_keys is not None:
            self.meta_keys += meta_keys

    def __call__(self, results):
        """Collect ``self.keys`` and the meta keys from every frame.

        For a single dict input, ``img_metas`` is additionally wrapped in a
        ``DataContainer`` with ``cpu_only=True``; for a list input it is
        left as a plain dict per frame.

        Args:
            results (list[dict] | dict): List of dict or dict which contains
                the data to collect.

        Returns:
            list[dict] | dict | None: Collected data in the same container
            kind as the input, each dict holding the keys in ``self.keys``
            plus ``img_metas``. None when ``reject_empty`` is set and the
            first frame has no ``gt_labels``.

        Raises:
            NotImplementedError: If ``num_ref_imgs`` is positive and the
                number of frames differs from ``num_ref_imgs + 1``.
        """
        single_input = isinstance(results, dict)
        frames = [results] if single_input else results

        collected = []
        for frame in frames:
            with_defaults = self._add_default_meta_keys(frame)
            collected.append(self._collect_meta_keys(with_defaults))

        if single_input:
            first = collected[0]
            first['img_metas'] = DC(first['img_metas'], cpu_only=True)

        if self.reject_empty and len(frames[0]['gt_labels']) == 0:
            return None
        if self.num_ref_imgs > 0 and len(frames) != self.num_ref_imgs + 1:
            raise NotImplementedError

        if single_input:
            return collected[0]
        return collected

    def _collect_meta_keys(self, results):
        """Collect `self.keys` and `self.meta_keys` from `results` (dict).

        A meta key is taken from ``results`` first and from
        ``results['img_info']`` second; keys found in neither are skipped.
        """
        meta = {}
        for name in self.meta_keys:
            if name in results:
                meta[name] = results[name]
            elif name in results['img_info']:
                meta[name] = results['img_info'][name]

        collected = {'img_metas': meta}
        for name in self.keys:
            collected[name] = results[name]
        return collected

    def _add_default_meta_keys(self, results):
        """Fill in meta keys that earlier transforms may not have set.

        `pad_shape`, `scale_factor` and `img_norm_cfg` get fallback values
        so the pipeline still works when no `Resize`, `Normalize` or `Pad`
        was applied. Existing values are never overwritten.

        Args:
            results (dict): Result dict contains the data to convert.

        Returns:
            results (dict): The same dict with the missing keys filled in.
        """
        frame = results['img']
        results.setdefault('pad_shape', frame.shape)
        results.setdefault('scale_factor', 1.0)

        # Images with fewer than three dims are treated as single-channel.
        channels = frame.shape[2] if len(frame.shape) >= 3 else 1
        identity_norm = dict(
            mean=np.zeros(channels, dtype=np.float32),
            std=np.ones(channels, dtype=np.float32),
            to_rgb=False)
        results.setdefault('img_norm_cfg', identity_norm)
        return results


@PIPELINES.register_module()
class ToList(object):
    """Wrap every value of the input dict in a single-element list.

    Args:
        results (dict): Result dict contains the data to convert.

    Returns:
        dict: A new dict with the same keys, each value wrapped in a list.
    """

    def __call__(self, results):
        return {key: [value] for key, value in results.items()}


@PIPELINES.register_module()
class ReIDFormatBundle(object):
    """ReID formatting bundle.

    Gathers "img" and "gt_label" (stacking them when several result dicts
    are given) and then formats them as follows.

    - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True)
    - gt_label: (1) to tensor, (2) to DataContainer (stack=True,
      pad_dims=None)
    """

    def __init__(self, *args, **kwargs):
        super().__init__()

    def __call__(self, results):
        """ReID formatting bundle call function.

        Args:
            results (list[dict] or dict): List of dicts or dict. A list must
                hold more than one item; its images are stacked along a new
                axis 3 and its labels along a new axis 0.

        Returns:
            dict: A new dict with the formatted ``img`` and ``gt_label``.

        Raises:
            TypeError: If ``results`` is neither a list nor a dict.
        """
        if not isinstance(results, (list, dict)):
            raise TypeError('results must be a list or a dict.')

        inputs = dict()
        if isinstance(results, dict):
            inputs['img'] = results['img']
            inputs['gt_label'] = results['gt_label']
        else:
            assert len(results) > 1, (
                "the 'results' only have one item, "
                "please directly use normal pipeline not 'Seq' pipeline.")
            images = [item['img'] for item in results]
            inputs['img'] = np.stack(images, axis=3)
            labels = [item['gt_label'] for item in results]
            inputs['gt_label'] = np.stack(labels, axis=0)

        return self.reid_format_bundle(inputs)

    def reid_format_bundle(self, results):
        """Convert ``img`` and ``gt_label`` to stacked ``DataContainer``.

        Args:
            results (dict): Result dict contains the data to convert.

        Returns:
            dict: The same dict, updated in place.

        Raises:
            KeyError: If ``results`` holds a key other than ``img`` or
                ``gt_label``.
        """
        for field in results:
            value = results[field]
            if field == 'gt_label':
                results[field] = DC(
                    to_tensor(value), stack=True, pad_dims=None)
            elif field == 'img':
                axes = (2, 0, 1) if value.ndim == 3 else (3, 2, 0, 1)
                value = np.ascontiguousarray(value.transpose(*axes))
                results['img'] = DC(to_tensor(value), stack=True)
            else:
                raise KeyError(f'key {field} is not supported')
        return results


@PIPELINES.register_module()
class ImageToTensorWithRef(object):
    """Convert image entries, including ``ref_img``, to tensors.

    Every image is reordered with ``transpose(2, 0, 1)`` before conversion.
    ``ref_img`` may be a single image or a list of images; a list is
    converted to one array (one entry per image) first.

    Args:
        keys (Sequence[str]): Keys of ``results`` to convert.
    """

    def __init__(self, keys):
        self.keys = keys

    def __call__(self, results):
        """Replace each entry named in ``self.keys`` by its tensor form."""
        for key in self.keys:
            value = results[key]
            if key != 'ref_img':
                results[key] = to_tensor(value.transpose(2, 0, 1))
            elif isinstance(value, list):
                stacked = np.array([
                    np.ascontiguousarray(frame.transpose(2, 0, 1))
                    for frame in value
                ])
                results[key] = to_tensor(stacked)
            else:
                single = np.ascontiguousarray(value.transpose(2, 0, 1))
                results[key] = to_tensor(single)
        return results

    def __repr__(self):
        return f'{self.__class__.__name__}(keys={self.keys})'

@PIPELINES.register_module()
class LabelConsistentChecker:
    """Reject samples whose reference instance ids differ between frames.

    ``results['ref_gt_instance_ids'].data`` is read row by row; column 0 is
    compared with the frame index and column 1 with the id stored for the
    same slot in frame 0. The rows are expected frame after frame, with the
    same number of rows per frame.

    Args:
        num_frames (int): Number of frames the rows are split into.
            Defaults to 5.
    """
    def __init__(self, num_frames=5):
        self.num_frames = num_frames

    def __call__(self, results):
        """Return ``results`` if the ids are consistent, otherwise None."""
        id_rows = results['ref_gt_instance_ids'].data
        total_rows = id_rows.size(0)
        num_ins, leftover = divmod(total_rows, self.num_frames)
        if leftover != 0:
            # The rows cannot be split evenly over the frames.
            return None

        first_frame_ids = torch.zeros((num_ins,), dtype=torch.float)
        for frame_idx in range(self.num_frames):
            for slot in range(num_ins):
                row = id_rows[frame_idx * num_ins + slot]
                if row[0] != frame_idx:
                    return None
                if frame_idx == 0:
                    # Frame 0 defines the reference id of every slot.
                    first_frame_ids[slot] = row[1]
                elif row[1] != first_frame_ids[slot]:
                    return None
        return results

@PIPELINES.register_module()
class MM2CLIP:
    """Reject clips whose instance annotations are not aligned across frames.

    ``results[1]['gt_instance_ids']`` is read as rows of
    ``(frame index, instance id)``. A clip is kept only when the rows are
    laid out frame after frame and every frame lists the same instance ids
    in the same order as frame 0.

    Args:
        num_frames (int): Stored on the instance. ``__call__`` does not read
            it; the frame count is taken from ``results[1]['img_metas']``.
            Defaults to 5.
    """
    def __init__(self, num_frames=5):
        self.num_frames = num_frames

    def __call__(self, results):
        """Return ``results`` if the clip is consistent, otherwise None.

        Args:
            results (Sequence[dict]): Only ``results[1]`` is inspected; it
                must provide ``gt_instance_ids`` and ``img_metas``.

        Returns:
            Sequence[dict] | None: The unchanged input, or None when the
            annotations are inconsistent.
        """
        instance_rows = results[1]['gt_instance_ids']
        num_ins = len(np.unique(instance_rows[:, 1]))
        num_frames = len(results[1]['img_metas'])
        expected_rows = num_ins * num_frames
        # An instance missing from some frame leaves fewer rows than the
        # loop below visits; reject the clip instead of indexing out of
        # range.
        if len(instance_rows) < expected_rows:
            return None

        ins_id_bucket = np.zeros((num_ins,), dtype=float)
        for i in range(expected_rows):
            frame_cur, ins_cur = divmod(i, num_ins)
            frame_idx, ins_id = instance_rows[i][0], instance_rows[i][1]
            if frame_idx != frame_cur:
                return None
            if frame_cur == 0:
                # Frame 0 defines the reference id of every slot.
                ins_id_bucket[ins_cur] = ins_id
            elif ins_id != ins_id_bucket[ins_cur]:
                return None
        return results


================================================
FILE: mmtrack/pipelines/loading.py
================================================
import os.path as osp
import numpy as np

import mmcv
from mmdet.core import BitmapMasks

from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile


@PIPELINES.register_module()
class LoadMultiImagesFromFile(LoadImageFromFile):
    """Load multi images from file.
    Please refer to `mmdet.datasets.pipelines.loading.py:LoadImageFromFile`
    for detailed docstring.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Apply the parent loader to every dict of ``results`` in order.
        Args:
            results (list[dict]): List of dict from
                :obj:`mmtrack.CocoVideoDataset`.
        Returns:
            list[dict]: One parent-loader output per input dict.
        """
        # Bind the parent call once; zero-argument ``super()`` is resolved
        # here, in the method body, rather than inside the comprehension.
        load_single = super().__call__
        return [load_single(frame) for frame in results]


@PIPELINES.register_module()
class SeqLoadAnnotations(LoadAnnotations):
    """Sequence load annotations.
    Please refer to `mmdet.datasets.pipelines.loading.py:LoadAnnotations`
    for detailed docstring.
    Args:
        with_track (bool): If True, load instance ids of bboxes.
    """

    def __init__(self, with_track=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.with_track = with_track

    def _load_track(self, results):
        """Private function to load instance ids.
        Args:
            results (dict): Result dict from :obj:`mmtrack.CocoVideoDataset`.
        Returns:
            dict: The dict with ``gt_instance_ids`` set to a copy of
            ``results['ann_info']['instance_ids']``.
        """
        instance_ids = results['ann_info']['instance_ids']
        results['gt_instance_ids'] = instance_ids.copy()
        return results

    def __call__(self, results):
        """Load the annotations of every dict in ``results``.
        The parent `LoadAnnotations` call runs first on each dict; instance
        ids are added afterwards when ``with_track`` is set.
        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.
        Returns:
            list[dict]: List of dict that contains loaded annotations, such as
            bounding boxes, labels, instance ids, masks and semantic
            segmentation annotations.
        """
        load_single = super().__call__
        outs = []
        for frame in results:
            loaded = load_single(frame)
            outs.append(
                self._load_track(loaded) if self.with_track else loaded)
        return outs


@PIPELINES.register_module()
class LoadRefImageFromFile(object):
    """Load an image together with its reference frame.

    Per the original author note, this loader is specific to the
    Cityscapes-VPS, Cityscapes, and VIPER datasets.

    Args:
        sample (bool): Stored on the instance; not read by this class.
            Defaults to True.
        to_float32 (bool): Cast both images to ``np.float32`` after
            reading. Defaults to False.
    """

    def __init__(self, sample=True, to_float32=False):
        self.to_float32 = to_float32
        self.sample = sample

    def __call__(self, results):
        """Read ``img`` and ``ref_img`` and record their meta information.

        Raises:
            NotImplementedError: If ``results['img_info']`` has no
                ``ref_filename`` entry.
        """
        # requires dirname for ref images
        assert results['ref_prefix'] is not None, 'ref_prefix must be specified.'

        filename = osp.join(results['img_prefix'],
                            results['img_info']['filename'])
        img = mmcv.imread(filename)

        img_info = results['img_info']
        # The reference frame must be named by the annotation entry itself.
        if 'ref_filename' not in img_info:
            raise NotImplementedError('We need this implementation.')
        ref_path = osp.join(results['ref_prefix'], img_info['ref_filename'])
        ref_img = mmcv.imread(ref_path)  # [1024, 2048, 3]

        if self.to_float32:
            img, ref_img = img.astype(np.float32), ref_img.astype(np.float32)

        results['filename'] = filename
        results['ori_filename'] = img_info['filename']
        results['img'] = img
        results['img_shape'] = img.shape
        results['ori_shape'] = img.shape
        results['ref_img'] = ref_img
        results['iid'] = img_info['id']
        return results

    def __repr__(self):
        return f'{self.__class__.__name__}(to_float32={self.to_float32})'


def bitmasks2bboxes(bitmasks):
    """Compute one tight box per mask from the ``masks`` stack of ``bitmasks``.

    Args:
        bitmasks: Object whose ``masks`` attribute is an array indexed as
            (mask, row, column).

    Returns:
        np.ndarray: ``float32`` array of shape (num_masks, 4) holding
        ``(x_min, y_min, x_max, y_max)`` with inclusive maxima. Rows of
        empty masks stay all zero.
    """
    mask_stack = bitmasks.masks
    boxes = np.zeros((mask_stack.shape[0], 4), dtype=np.float32)
    # Collapse rows to find occupied columns, and columns to find occupied
    # rows.
    col_hits = np.any(mask_stack, axis=1)
    row_hits = np.any(mask_stack, axis=2)
    for box, cols, rows in zip(boxes, col_hits, row_hits):
        xs = np.flatnonzero(cols)
        ys = np.flatnonzero(rows)
        if xs.size == 0 or ys.size == 0:
            continue
        # ``box`` is a view into ``boxes``, so this writes in place.
        box[:] = (xs[0], ys[0], xs[-1], ys[-1])
    return boxes


@PIPELINES.register_module()
class LoadAnnotationsInstanceMasks:
    """Load instance masks, labels, boxes and semantic maps from id images.

    The instance image is read from ``results['ann_info']['inst_map']``.
    Pixel values of at least 10000 are treated as instances whose label is
    ``value // 1000`` (presumably a ``class_id * 1000 + index`` encoding --
    confirm against the dataset converter).

    Args:
        with_mask (bool): Produce ``gt_masks``, ``gt_labels`` and
            ``gt_bboxes``. Defaults to True.
        with_seg (bool): Load ``gt_semantic_seg`` from
            ``results['ann_info']['seg_map']``. Defaults to True.
        with_inst (bool): Store the id image as ``gt_instance_map``.
            Defaults to False.
        cherry (container, optional): If given, only instances whose label
            is contained in it are kept. Defaults to None.
        file_client_args (dict): Arguments for ``mmcv.FileClient``; copied
            at construction. Defaults to ``dict(backend='disk')``.
    """

    def __init__(self,
                 with_mask=True,
                 with_seg=True,
                 with_inst=False,
                 cherry=None,
                 file_client_args=dict(backend='disk')):
        self.with_mask = with_mask
        self.with_seg = with_seg
        self.with_inst = with_inst
        self.file_client_args = file_client_args.copy()
        self.cherry = cherry
        # Created lazily on the first ``__call__``.
        self.file_client = None

    def _load_masks(self, results):
        """Private function to load mask annotations.
        Args:
            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
        Returns:
            dict | None: The dict with the loaded annotations. With
                ``with_inst`` it gains ``gt_instance_map``; with
                ``with_mask`` it gains ``gt_masks`` (:obj:`BitmapMasks`),
                ``gt_labels`` and ``gt_bboxes``. None when ``with_mask`` is
                set and no instance is kept.
        """

        img_bytes = self.file_client.get(results['ann_info']['inst_map'])
        inst_mask = mmcv.imfrombytes(img_bytes, flag='unchanged').squeeze()
        if self.with_inst:
            results['gt_instance_map'] = inst_mask.copy().astype(int)
            # Values below the instance threshold are scaled by 1000.
            results['gt_instance_map'][inst_mask < 10000] *= 1000
        if not self.with_mask:
            return results
        masks = []
        labels = []
        for inst_id in np.unique(inst_mask):
            if inst_id < 10000:
                continue
            label = inst_id // 1000
            if self.cherry is not None and label not in self.cherry:
                continue
            masks.append((inst_mask == inst_id).astype(int))
            labels.append(label)
        if len(masks) == 0:
            return None
        gt_masks = BitmapMasks(masks, height=inst_mask.shape[0], width=inst_mask.shape[1])
        results['gt_masks'] = gt_masks
        results['mask_fields'].append('gt_masks')
        results['gt_labels'] = np.array(labels)

        boxes = bitmasks2bboxes(gt_masks)
        results['gt_bboxes'] = boxes
        results['bbox_fields'].append('gt_bboxes')
        return results

    def _load_semantic_seg(self, results):
        """Private function to load semantic segmentation annotations.
        Args:
            results (dict): Result dict from :obj:`dataset`.
        Returns:
            dict: The dict contains loaded semantic segmentation annotations.
        """
        img_bytes = self.file_client.get(results['ann_info']['seg_map'])
        results['gt_semantic_seg'] = mmcv.imfrombytes(
            img_bytes, flag='unchanged').squeeze()
        results['seg_fields'].append('gt_semantic_seg')
        return results

    def __call__(self, results):
        """Call function to load multiple types annotations.
        Args:
            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
        Returns:
            dict | None: The dict contains loaded bounding box, label, mask
                and semantic segmentation annotations, or None when mask
                loading rejected the sample.
        """
        if self.file_client is None:
            self.file_client = mmcv.FileClient(**self.file_client_args)
        if self.with_mask or self.with_inst:
            results = self._load_masks(results)
            if results is None:
                return None
        if self.with_seg:
            results = self._load_semantic_seg(results)
        return results

    def __repr__(self):
        # The previous form had no parentheses and ended in a dangling
        # ", "; it also left out with_inst and cherry.
        repr_str = self.__class__.__name__
        repr_str += f'(with_mask={self.with_mask}, '
        repr_str += f'with_seg={self.with_seg}, '
        repr_str += f'with_inst={self.with_inst}, '
        repr_str += f'cherry={self.cherry})'
        return repr_str


================================================
FILE: mmtrack/pipelines/test_time_aug.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings

import mmcv

from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import Compose


@PIPELINES.register_module()
class MultiScaleFlipAugVideo:
    """Test-time augmentation with multiple scales and flipping.

    Video variant: ``__call__`` receives a list of per-frame result dicts
    and passes a list of copies to the wrapped transforms for every
    (scale, flip) combination.

    An example configuration is as followed:
    .. code-block::
        img_scale=[(1333, 400), (1333, 800)],
        flip=True,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ]
    After MultiScaleFLipAug with above configuration, the results are wrapped
    into lists of the same length as followed:
    .. code-block::
        dict(
            img=[...],
            img_shape=[...],
            scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)]
            flip=[False, True, False, True]
            ...
        )
    Args:
        transforms (list[dict]): Transforms to apply in each augmentation.
        img_scale (tuple | list[tuple] | None): Images scales for resizing.
        scale_factor (float | list[float] | None): Scale factors for resizing.
        flip (bool): Whether apply flip augmentation. Default: False.
        flip_direction (str | list[str]): Flip augmentation directions,
            options are "horizontal", "vertical" and "diagonal". If
            flip_direction is a list, multiple flip augmentations will be
            applied. It has no effect when flip == False. Default:
            "horizontal".
    """

    # Transform types accepted as the flip step; the sequence variant is
    # included because this class feeds lists of frames to the transforms.
    _FLIP_TRANSFORM_TYPES = ('RandomFlip', 'SeqRandomFlip')

    def __init__(self,
                 transforms,
                 img_scale=None,
                 scale_factor=None,
                 flip=False,
                 flip_direction='horizontal'):
        self.transforms = Compose(transforms)
        assert (img_scale is None) ^ (scale_factor is None), (
            'Must have but only one variable can be set')
        if img_scale is not None:
            self.img_scale = img_scale if isinstance(img_scale,
                                                     list) else [img_scale]
            self.scale_key = 'scale'
            assert mmcv.is_list_of(self.img_scale, tuple)
        else:
            self.img_scale = scale_factor if isinstance(
                scale_factor, list) else [scale_factor]
            self.scale_key = 'scale_factor'

        self.flip = flip
        self.flip_direction = flip_direction if isinstance(
            flip_direction, list) else [flip_direction]
        assert mmcv.is_list_of(self.flip_direction, str)
        if not self.flip and self.flip_direction != ['horizontal']:
            warnings.warn(
                'flip_direction has no effect when flip is set to False')
        # Only dict configs carry a 'type'; other entries (e.g. callables)
        # are ignored by this check instead of raising.
        has_flip_transform = any(
            isinstance(t, dict)
            and t.get('type') in self._FLIP_TRANSFORM_TYPES
            for t in transforms)
        if self.flip and not has_flip_transform:
            warnings.warn(
                'flip has no effect when RandomFlip is not in transforms')

    def __call__(self, results):
        """Call function to apply test time augment transforms on results.
        Args:
            results (list[dict]): Per-frame result dicts to transform. Each
                dict is shallow-copied before the scale and flip settings
                are written into it.
        Returns:
           dict[str: list]: The augmented data, where each value is wrapped
               into a list.
        """

        aug_data = []
        flip_args = [(False, None)]
        if self.flip:
            flip_args += [(True, direction)
                          for direction in self.flip_direction]
        for scale in self.img_scale:
            for flip, direction in flip_args:
                _results = []
                for results_single in results:
                    _results_single = results_single.copy()
                    _results_single[self.scale_key] = scale
                    _results_single['flip'] = flip
                    _results_single['flip_direction'] = direction
                    _results.append(_results_single)
                data = self.transforms(_results)
                aug_data.append(data)
        # list of dict to dict of list
        aug_data_dict = {key: [] for key in aug_data[0]}
        for data in aug_data:
            for key, val in data.items():
                aug_data_dict[key].append(val)
        return aug_data_dict

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(transforms={self.transforms}, '
        repr_str += f'img_scale={self.img_scale}, flip={self.flip}, '
        repr_str += f'flip_direction={self.flip_direction})'
        return repr_str

================================================
FILE: mmtrack/pipelines/transforms.py
================================================
import cv2
import mmcv
import numpy as np
import warnings
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import Normalize, Pad, RandomFlip, Resize


@PIPELINES.register_module()
class SeqColorAug(object):
    """Color augmentation for images.
    Args:
        prob (list[float]): The probability to perform color augmentation
            for each image, indexed by the image position in the input
            list. Defaults to [1.0, 1.0].
        rgb_var (list[list]]): The values of color augmentation. Defaults to
            [[-0.55919361, 0.98062831, -0.41940627],
            [1.72091413, 0.19879334, -1.82968581],
            [4.64467907, 4.73710203, 4.88324118]].
    """

    def __init__(self,
                 prob=[1.0, 1.0],
                 rgb_var=[[-0.55919361, 0.98062831, -0.41940627],
                          [1.72091413, 0.19879334, -1.82968581],
                          [4.64467907, 4.73710203, 4.88324118]]):
        self.prob = prob
        self.rgb_var = np.array(rgb_var, dtype=np.float32)

    def __call__(self, results):
        """Randomly shift the colors of each image in ``results``.
        A per-channel offset is drawn by multiplying ``rgb_var`` with a
        standard normal vector, reversed in channel order and subtracted
        from the image, which is then cast to ``float32``.
        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.
        Returns:
            list[dict]: The input dicts, with ``img`` replaced where the
            augmentation fired.
        """
        outs = []
        for idx, frame in enumerate(results):
            image = frame['img']
            threshold = self.prob[idx]

            if threshold > np.random.random():
                noise = np.random.randn(3, 1)
                # bgr to rgb
                offset = np.dot(self.rgb_var, noise)[::-1].reshape(3)
                image = (image - offset).astype(np.float32)

            frame['img'] = image
            outs.append(frame)
        return outs


@PIPELINES.register_module()
class SeqBlurAug(object):
    """Blur augmentation for images.
    Args:
        prob (list[float]): The probability to perform blur augmentation
            for each image, indexed by the image position in the input
            list. Defaults to [0.0, 0.2].
    """

    def __init__(self, prob=[0.0, 0.2]):
        self.prob = prob

    def __call__(self, results):
        """Randomly blur each image in ``results``.
        When the augmentation fires, an odd kernel size between 5 and 45 is
        drawn and the image is filtered with a cross-shaped kernel: the
        centre column carries a random share ``wx`` of the weight and the
        centre row the remaining ``1 - wx``.
        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.
        Returns:
            list[dict]: The input dicts, with ``img`` replaced where the
            augmentation fired.
        """
        outs = []
        for idx, frame in enumerate(results):
            image = frame['img']
            threshold = self.prob[idx]

            if threshold > np.random.random():
                size = np.random.choice(np.arange(5, 46, 2))
                kernel = np.zeros((size, size))
                centre = int(size / 2)
                wx = np.random.random()
                unit = 1. / size
                kernel[:, centre] += unit * wx
                kernel[centre, :] += unit * (1 - wx)
                image = cv2.filter2D(image, -1, kernel)

            frame['img'] = image
            outs.append(frame)
        return outs


@PIPELINES.register_module()
class SeqResize(Resize):
    """Resize images.
    Please refer to `mmdet.datasets.pipelines.transfroms.py:Resize` for
    detailed docstring.
    Args:
        share_params (bool): If True, share the resize parameters for all
            images. Defaults to True.
    """

    def __init__(self, share_params=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def __call__(self, results):
        """Resize every dict of ``results`` with the parent `Resize`.
        With ``share_params``, the ``scale`` produced for the first dict is
        written into each later dict before it is resized.
        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.
        Returns:
            list[dict]: List of dict that contains resized results,
            'img_shape', 'pad_shape', 'scale_factor', 'keep_ratio' keys
            are added into result dict.
        """
        resize_single = super().__call__
        outs = []
        shared_scale = None
        for idx, frame in enumerate(results):
            is_first = idx == 0
            if self.share_params and not is_first:
                frame['scale'] = shared_scale
            frame = resize_single(frame)
            if self.share_params and is_first:
                # Remember the first frame's scale for the rest of the clip.
                shared_scale = frame['scale']
            outs.append(frame)
        return outs


@PIPELINES.register_module()
class SeqNormalize(Normalize):
    """Normalize images.
    Please refer to `mmdet.datasets.pipelines.transfroms.py:Normalize` for
    detailed docstring.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Apply the parent `Normalize` to every dict of ``results`` in order.
        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.
        Returns:
            list[dict]: List of dict that contains normalized results,
            'img_norm_cfg' key is added into result dict.
        """
        # Bind the parent call once; zero-argument ``super()`` is resolved
        # here, in the method body, rather than inside the comprehension.
        normalize_single = super().__call__
        return [normalize_single(frame) for frame in results]


@PIPELINES.register_module()
class SeqRandomFlip(RandomFlip):
    """Randomly flip for images.
    Please refer to `mmdet.datasets.pipelines.transfroms.py:RandomFlip` for
    detailed docstring.
    Args:
        share_params (bool): If True, share the flip parameters for all images.
            Defaults to True.
    """

    def __init__(self, share_params=True, *args, **kwargs):
        # ``share_params`` now has the default the docstring promises; it
        # was a required argument before, so existing calls are unaffected.
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def __call__(self, results):
        """Call function.
        For each dict in results, call `RandomFlip` to randomly flip image.
        With ``share_params``, one flip decision is drawn here and written
        into every dict as 'flip' / 'flip_direction' before the parent
        `RandomFlip` is called on each of them.
        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.
        Returns:
            list[dict]: List of dict that contains flipped results, 'flip',
            'flip_direction' keys are added into the dict.
        """
        if self.share_params:
            if isinstance(self.direction, list):
                # None means non-flip
                direction_list = self.direction + [None]
            else:
                # None means non-flip
                direction_list = [self.direction, None]

            if isinstance(self.flip_ratio, list):
                non_flip_ratio = 1 - sum(self.flip_ratio)
                flip_ratio_list = self.flip_ratio + [non_flip_ratio]
            else:
                non_flip_ratio = 1 - self.flip_ratio
                # exclude non-flip
                num_directions = len(direction_list) - 1
                # A scalar ratio is split evenly over the flip directions.
                single_ratio = self.flip_ratio / num_directions
                flip_ratio_list = ([single_ratio] * num_directions +
                                   [non_flip_ratio])

            cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
            flip = cur_dir is not None

            for _results in results:
                _results['flip'] = flip
                _results['flip_direction'] = cur_dir

        outs = []
        for _results in results:
            _results = super().__call__(_results)
            outs.append(_results)
        return outs


@PIPELINES.register_module()
class SeqPad(Pad):
    """Pad images.
    Please refer to `mmdet.datasets.pipelines.transfroms.py:Pad` for detailed
    docstring.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Apply the parent `Pad` to every dict of ``results`` in order.
        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.
        Returns:
            list[dict]: List of dict that contains padding results,
            'pad_shape', 'pad_fixed_size' and 'pad_size_divisor' keys are
            added into the dict.
        """
        # Bind the parent call once; zero-argument ``super()`` is resolved
        # here, in the method body, rather than inside the comprehension.
        pad_single = super().__call__
        return [pad_single(frame) for frame in results]


@PIPELINES.register_module()
class SeqRandomCrop(object):
    """Randomly crop every frame of a sequence (images, bboxes, masks, seg).

    The crop window always has the absolute size ``crop_size``; only its
    top-left offset is random. Offsets are drawn once for the whole sequence
    when ``share_params`` is True, otherwise independently for each frame.

    Args:
        crop_size (tuple): Crop window size in pixels as (height, width).
            Both values must be positive.
        allow_negative_crop (bool, optional): Whether to allow a crop that does
            not contain any bbox area. Default False.
        share_params (bool, optional): Whether to share the cropping offsets
            across all frames. Default False.
        bbox_clip_border (bool, optional): Whether to clip the boxes to the
            border of the cropped image. Defaults to True.
        check_id_match (bool, optional): For two-frame inputs, whether to
            reject the sample (return None) when none of the instance ids of
            the first frame appears in the second frame. Default True.

    Note:
        - Along any axis where the image is smaller than the crop window the
          image is left uncropped on that axis.
        - The keys for bboxes, labels and masks must be aligned. That is,
          `gt_bboxes` corresponds to `gt_labels`, `gt_instance_ids` and
          `gt_masks`, and `gt_bboxes_ignore` corresponds to
          `gt_labels_ignore`, `gt_instance_ids_ignore` and `gt_masks_ignore`.
        - Bbox fields without an entry in these mappings (for example
          proposals) are cropped and filtered on their own.
        - If the crop does not contain any gt-bbox region and
          `allow_negative_crop` is set to False, the sample is skipped.
    """

    def __init__(self,
                 crop_size,
                 allow_negative_crop=False,
                 share_params=False,
                 bbox_clip_border=True,
                 check_id_match=True
                 ):
        assert crop_size[0] > 0 and crop_size[1] > 0
        self.crop_size = crop_size
        self.allow_negative_crop = allow_negative_crop
        self.share_params = share_params
        self.bbox_clip_border = bbox_clip_border
        self.check_id_match = check_id_match
        # The key correspondence from bboxes to labels and masks.
        self.bbox2label = {
            'gt_bboxes': ['gt_labels', 'gt_instance_ids'],
            'gt_bboxes_ignore': ['gt_labels_ignore', 'gt_instance_ids_ignore']
        }
        self.bbox2mask = {
            'gt_bboxes': 'gt_masks',
            'gt_bboxes_ignore': 'gt_masks_ignore'
        }

    def get_offsets(self, img):
        """Randomly draw the top-left crop offsets for ``img``.

        Args:
            img (ndarray): Image whose first two axes are (height, width).

        Returns:
            tuple: ``(offset_h, offset_w)``; each is 0 when the image is not
            larger than the crop window along that axis.
        """
        margin_h = max(img.shape[0] - self.crop_size[0], 0)
        margin_w = max(img.shape[1] - self.crop_size[1], 0)
        offset_h = np.random.randint(0, margin_h + 1)
        offset_w = np.random.randint(0, margin_w + 1)
        return offset_h, offset_w

    def random_crop(self, results, offsets=None):
        """Crop the images, bounding boxes, masks and semantic segmentation
        maps of a single frame.

        Args:
            results (dict): Result dict of one frame. It must contain an
                ``img_info`` dict; the offsets used are recorded there under
                ``'crop_offsets'``.
            offsets (tuple, optional): Pre-defined ``(offset_h, offset_w)``.
                When None the offsets are drawn with :meth:`get_offsets`.
                Default to None.

        Returns:
            dict | None: The cropped results with ``'img_shape'`` updated, or
            None when no ``gt_bboxes`` survive and ``allow_negative_crop`` is
            False.
        """

        for key in results.get('img_fields', ['img']):
            img = results[key]
            if offsets is not None:
                offset_h, offset_w = offsets
            else:
                offset_h, offset_w = self.get_offsets(img)
            results['img_info']['crop_offsets'] = (offset_h, offset_w)
            crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0]
            crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1]

            # crop the image
            img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
            img_shape = img.shape
            results[key] = img
        # The window of the last image field drives all annotation crops.
        results['img_shape'] = img_shape

        # crop bboxes accordingly and clip to the image boundary
        for key in results.get('bbox_fields', []):
            # e.g. gt_bboxes and gt_bboxes_ignore
            bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
                                   dtype=np.float32)
            bboxes = results[key] - bbox_offset
            if self.bbox_clip_border:
                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
            # A box is kept only if it still has positive width and height.
            valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & (
                    bboxes[:, 3] > bboxes[:, 1])
            # If the crop does not contain any gt-bbox area and
            # self.allow_negative_crop is False, skip this image.
            if (key == 'gt_bboxes' and not valid_inds.any()
                    and not self.allow_negative_crop):
                return None
            results[key] = bboxes[valid_inds, :]
            # label fields. e.g. gt_labels and gt_labels_ignore
            # Bbox fields without a mapping (e.g. proposals) have no aligned
            # label arrays, so fall back to an empty tuple instead of
            # iterating over None.
            for label_key in self.bbox2label.get(key, ()):
                if label_key in results:
                    results[label_key] = results[label_key][valid_inds]

            # mask fields, e.g. gt_masks and gt_masks_ignore
            # NOTE(review): the mask container is expected to support integer
            # array indexing and a ``crop(bbox)`` method -- confirm the type
            # produced by the loading pipeline.
            mask_key = self.bbox2mask.get(key)
            if mask_key in results:
                results[mask_key] = results[mask_key][
                    valid_inds.nonzero()[0]].crop(
                    np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))

        # crop semantic seg
        for key in results.get('seg_fields', []):
            results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]
        return results

    def __call__(self, results):
        """Crop every frame of the sequence.

        Args:
            results (list[dict]): One result dict per frame.

        Returns:
            list[dict] | None: The cropped frames, or None when any frame is
            rejected by :meth:`random_crop`, or when the two-frame id check
            (see ``check_id_match``) finds no shared instance id.
        """
        if self.share_params:
            # One offset pair, derived from the first frame, for all frames.
            offsets = self.get_offsets(results[0]['img'])
        else:
            offsets = None

        outs = []
        for _results in results:
            _results = self.random_crop(_results, offsets)
            if _results is None:
                return None
            outs.append(_results)

        if len(outs) == 2 and self.check_id_match:
            ref_result, result = outs[1], outs[0]
            if self.check_match(ref_result, result):
                return None
        return outs

    def check_match(self, ref_results, results):
        """Tell whether two frames share no instance id at all.

        Args:
            ref_results (dict): Reference frame; must hold
                ``'gt_instance_ids'``.
            results (dict): Key frame; must hold ``'gt_instance_ids'``.

        Returns:
            bool: True when no id of ``results`` occurs in ``ref_results``
            (also True when ``results`` has no ids).
        """
        ref_ids = ref_results['gt_instance_ids'].tolist()
        gt_ids = results['gt_instance_ids'].tolist()
        # Index of each key-frame id inside the reference ids, -1 if absent.
        gt_pids = [ref_ids.index(i) if i in ref_ids else -1 for i in gt_ids]
        nomatch = (np.array(gt_pids) == -1).all()
        return nomatch


@PIPELINES.register_module()
class SeqPhotoMetricDistortion(object):
    """Photometric distortion applied to every frame of a sequence.

    Each of the following steps is enabled independently by a fair coin flip
    when the parameters are sampled; contrast is applied either right after
    brightness or right after the conversion back to BGR:

    1. random brightness
    2. random contrast (when ``contrast_first`` is truthy)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (when ``contrast_first`` is falsy)
    8. randomly swap channels

    Args:
        share_params (bool): Whether one parameter set is sampled for the
            whole sequence (True) or one per frame (False).
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
    """

    def __init__(self,
                 share_params=True,
                 brightness_delta=32,
                 contrast_range=(0.5, 1.5),
                 saturation_range=(0.5, 1.5),
                 hue_delta=18):
        self.share_params = share_params
        self.brightness_delta = brightness_delta
        self.hue_delta = hue_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range

    def get_params(self):
        """Sample one set of distortion parameters.

        Returns:
            dict: Keys ``delta``, ``contrast_first``, ``alpha``,
            ``saturation``, ``hue`` and ``permutation``. A value of None
            means the corresponding step is skipped.
        """
        # NOTE: the order of the random draws below is significant for
        # reproducibility under a fixed seed; do not reorder.
        delta = None
        if np.random.randint(2):
            delta = np.random.uniform(-self.brightness_delta,
                                      self.brightness_delta)

        # Contrast goes before the HSV round trip only when the draw is 1.
        contrast_first = 0
        if np.random.randint(2) == 1:
            contrast_first = True

        alpha = None
        if np.random.randint(2):
            alpha = np.random.uniform(self.contrast_lower,
                                      self.contrast_upper)

        saturation = None
        if np.random.randint(2):
            saturation = np.random.uniform(self.saturation_lower,
                                           self.saturation_upper)

        hue = None
        if np.random.randint(2):
            hue = np.random.uniform(-self.hue_delta, self.hue_delta)

        permutation = None
        if np.random.randint(2):
            permutation = np.random.permutation(3)

        return dict(
            delta=delta,
            contrast_first=contrast_first,
            alpha=alpha,
            saturation=saturation,
            hue=hue,
            permutation=permutation)

    def photo_metric_distortion(self, results, params=None):
        """Distort the image of a single frame.

        Args:
            results (dict): Result dict of one frame. Must contain a float32
                ``'img'`` and an ``'img_info'`` dict; the parameters used are
                stored in ``results['img_info']['color_jitter']``.
            params (dict, optional): Parameters as produced by
                :meth:`get_params`. Sampled here when None. Default to None.

        Returns:
            dict: The same dict with ``'img'`` replaced by the distorted
            image.
        """
        if params is None:
            params = self.get_params()
        results['img_info']['color_jitter'] = params

        if 'img_fields' in results:
            assert results['img_fields'] == ['img'], \
                'Only single img_fields is allowed'
        img = results['img']
        assert img.dtype == np.float32, \
            'PhotoMetricDistortion needs the input image of dtype np.float32,' \
            ' please set "to_float32=True" in "LoadImageFromFile" pipeline'

        # Brightness: in-place shift of the incoming array.
        brightness = params['delta']
        if brightness is not None:
            img += brightness

        # Early contrast (before the HSV round trip).
        contrast_first = params['contrast_first']
        if contrast_first and params['alpha'] is not None:
            img *= params['alpha']

        img = mmcv.bgr2hsv(img)

        # Saturation lives in channel 1 of the HSV image.
        saturation = params['saturation']
        if saturation is not None:
            img[..., 1] *= saturation

        # Hue lives in channel 0; wrap the shifted values back into [0, 360].
        hue_shift = params['hue']
        if hue_shift is not None:
            hue_channel = img[..., 0]
            hue_channel += hue_shift
            hue_channel[hue_channel > 360] -= 360
            hue_channel[hue_channel < 0] += 360

        img = mmcv.hsv2bgr(img)

        # Late contrast (after the HSV round trip).
        if not contrast_first and params['alpha'] is not None:
            img *= params['alpha']

        # Channel shuffle.
        channel_order = params['permutation']
        if channel_order is not None:
            img = img[..., channel_order]

        results['img'] = img
        return results

    def __call__(self, results):
        """Distort every frame of the sequence.

        Args:
            results (list[dict]): One result dict per frame.

        Returns:
            list[dict]: The frames with their images distorted.
        """
        shared = self.get_params() if self.share_params else None
        return [
            self.photo_metric_distortion(frame, shared) for frame in results
        ]

    def __repr__(self):
        contrast = (self.contrast_lower, self.contrast_upper)
        saturation = (self.saturation_lower, self.saturation_upper)
        fields = [
            f'brightness_delta={self.brightness_delta},',
            f'contrast_range={contrast},',
            f'saturation_range={saturation},',
            f'hue_delta={self.hue_delta})',
        ]
        return self.__class__.__name__ + '(\n' + '\n'.join(fields)


@PIPELINES.register_module()
class ResizeWithRef(object):
    """Resize an image, its optional reference image, and their bboxes and
    masks.

    The target scale is taken from ``results['scale']`` when that key is
    already present; otherwise it is chosen from the constructor arguments:

    - ``ratio_range`` is not None: a ratio is sampled uniformly from the
      range and multiplied with the single ``img_scale``.
    - a single ``img_scale`` and no ``ratio_range``: that scale is used.
    - ``multiscale_mode == "range"``: long and short edges are sampled
      between the two given scales.
    - ``multiscale_mode == "value"``: one of the given scales is picked.

    Semantic segmentation maps are not resized by this transform.

    Args:
        img_scale (tuple or list[tuple]): Images scales for resizing.
        multiscale_mode (str): Either "range" or "value".
        ratio_range (tuple[float]): (min_ratio, max_ratio)
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image.
    """

    def __init__(self,
                 img_scale=None,
                 multiscale_mode='range',
                 ratio_range=None,
                 keep_ratio=True):
        if img_scale is None:
            self.img_scale = None
        else:
            # Normalise a single scale to a one-element list.
            self.img_scale = (
                img_scale if isinstance(img_scale, list) else [img_scale])
            assert mmcv.is_list_of(self.img_scale, tuple)

        if ratio_range is None:
            # Multiple scales or a range of scales.
            assert multiscale_mode in ['value', 'range']
        else:
            # One base scale combined with a ratio range.
            assert len(self.img_scale) == 1

        self.multiscale_mode = multiscale_mode
        self.ratio_range = ratio_range
        self.keep_ratio = keep_ratio

    @staticmethod
    def random_select(img_scales):
        """Pick one scale at random.

        Returns:
            tuple: ``(img_scale, scale_idx)``.
        """
        assert mmcv.is_list_of(img_scales, tuple)
        chosen = np.random.randint(len(img_scales))
        return img_scales[chosen], chosen

    @staticmethod
    def random_sample(img_scales):
        """Sample long and short edges between two scales.

        Returns:
            tuple: ``((long_edge, short_edge), None)``.
        """
        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
        long_edges = [max(scale) for scale in img_scales]
        short_edges = [min(scale) for scale in img_scales]
        # The long edge is drawn before the short edge.
        long_edge = np.random.randint(min(long_edges), max(long_edges) + 1)
        short_edge = np.random.randint(min(short_edges), max(short_edges) + 1)
        return (long_edge, short_edge), None

    @staticmethod
    def random_sample_ratio(img_scale, ratio_range):
        """Scale ``img_scale`` by a ratio drawn uniformly from
        ``ratio_range``.

        Returns:
            tuple: ``(scale, None)`` with both edges truncated to int.
        """
        assert isinstance(img_scale, tuple) and len(img_scale) == 2
        low, high = ratio_range
        assert low <= high
        ratio = np.random.random_sample() * (high - low) + low
        return (int(img_scale[0] * ratio), int(img_scale[1] * ratio)), None

    def _random_scale(self, results):
        """Choose the target scale and store it in ``results['scale']`` and
        ``results['scale_idx']``."""
        if self.ratio_range is not None:
            picked = self.random_sample_ratio(self.img_scale[0],
                                              self.ratio_range)
        elif len(self.img_scale) == 1:
            picked = (self.img_scale[0], 0)
        elif self.multiscale_mode == 'range':
            picked = self.random_sample(self.img_scale)
        elif self.multiscale_mode == 'value':
            picked = self.random_select(self.img_scale)
        else:
            raise NotImplementedError

        results['scale'], results['scale_idx'] = picked

    def _resize_img(self, results):
        """Resize ``ref_img`` (when present) and ``img`` to
        ``results['scale']`` and record shape and scale factor."""
        targets = ['ref_img', 'img'] if 'ref_img' in results else ['img']
        for name in targets:
            if self.keep_ratio:
                resized, factor = mmcv.imrescale(
                    results[name], results['scale'], return_scale=True)
            else:
                resized, w_scale, h_scale = mmcv.imresize(
                    results[name], results['scale'], return_scale=True)
                factor = np.array([w_scale, h_scale, w_scale, h_scale],
                                  dtype=np.float32)
            results[name] = resized
        # 'img' is always processed last, so it defines the recorded values.
        results['img_shape'] = resized.shape
        results['pad_shape'] = resized.shape  # valid when nothing pads later
        results['scale_factor'] = factor
        results['keep_ratio'] = self.keep_ratio

    def _resize_bboxes(self, results):
        """Scale all boxes by ``results['scale_factor']`` and clip them to
        the resized image."""
        if 'ref_bbox_fields' in results:
            groups = ['ref_bbox_fields', 'bbox_fields']
        else:
            groups = ['bbox_fields']
        height, width = results['img_shape'][0], results['img_shape'][1]
        for group in groups:
            for key in results.get(group, []):
                scaled = results[key] * results['scale_factor']
                scaled[:, 0::2] = np.clip(scaled[:, 0::2], 0, width - 1)
                scaled[:, 1::2] = np.clip(scaled[:, 1::2], 0, height - 1)
                results[key] = scaled

    def _resize_masks(self, results):
        """Resize every mask with nearest-neighbour interpolation."""
        if 'ref_mask_fields' in results:
            groups = ['ref_mask_fields', 'mask_fields']
        else:
            groups = ['mask_fields']
        for group in groups:
            for key in results.get(group, []):
                if results[key] is None:
                    continue
                if self.keep_ratio:
                    resized = [
                        mmcv.imrescale(
                            mask, results['scale_factor'],
                            interpolation='nearest')
                        for mask in results[key]
                    ]
                else:
                    # mmcv expects the target size as (width, height).
                    target = (results['img_shape'][1],
                              results['img_shape'][0])
                    resized = [
                        mmcv.imresize(mask, target, interpolation='nearest')
                        for mask in results[key]
                    ]
                results[key] = resized

    def __call__(self, results):
        """Resize images, bboxes and masks in ``results``.

        Args:
            results (dict): Result dict from the loading pipeline.

        Returns:
            dict: The same dict, updated in place.
        """
        if 'scale' not in results:
            self._random_scale(results)
        for step in (self._resize_img, self._resize_bboxes,
                     self._resize_masks):
            step(results)
        return results

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'(img_scale={self.img_scale}, '
                f'multiscale_mode={self.multiscale_mode}, '
                f'ratio_range={self.ratio_range}, '
                f'keep_ratio={self.keep_ratio})')


@PIPELINES.register_module()
class RandomFlipWithRef(object):
    """Horizontally flip the image, the reference image, bboxes and masks.

    If the input dict contains the key "flip", then the flag will be used,
    otherwise it will be randomly decided by a ratio specified in the init
    method.

    Args:
        flip_ratio (float, optional): The flipping probability in [0, 1].
            When None (the default) no random flip is ever drawn, so an
            image is flipped only if ``results['flip']`` was set to a truthy
            value beforehand.
    """

    def __init__(self, flip_ratio=None):
        self.flip_ratio = flip_ratio
        if flip_ratio is not None:
            assert flip_ratio >= 0 and flip_ratio <= 1

    def bbox_flip(self, bboxes, img_shape):
        """Flip bboxes horizontally.

        Args:
            bboxes(ndarray): shape (..., 4*k)
            img_shape(tuple): (height, width)

        Returns:
            ndarray: A flipped copy; the input array is not modified.
        """
        assert bboxes.shape[-1] % 4 == 0
        w = img_shape[1]
        flipped = bboxes.copy()
        # x1 and x2 swap roles after mirroring around the vertical axis.
        flipped[..., 0::4] = w - bboxes[..., 2::4] - 1
        flipped[..., 2::4] = w - bboxes[..., 0::4] - 1
        return flipped

    def __call__(self, results):
        """Flip the contents of ``results`` when its "flip" flag is set.

        Args:
            results (dict): Result dict from the loading pipeline. Must
                contain ``'img'``, and ``'img_shape'`` when boxes are flipped.

        Returns:
            dict: The same dict, with ``'flip'`` set and the flipped data
            written back.
        """
        if 'flip' not in results:
            # Without a configured ratio there is nothing to sample from;
            # comparing a float with None would raise, so default to no flip.
            if self.flip_ratio is None:
                results['flip'] = False
            else:
                results['flip'] = bool(np.random.rand() < self.flip_ratio)
        if results['flip']:
            # flip image
            results['img'] = mmcv.imflip(results['img'])
            if 'ref_img' in results:
                results['ref_img'] = mmcv.imflip(results['ref_img'])
            # flip bboxes
            for key in results.get('bbox_fields', []):
                results[key] = self.bbox_flip(results[key],
                                              results['img_shape'])
            for key in results.get('ref_bbox_fields', []):
                results[key] = self.bbox_flip(results[key],
                                              results['img_shape'])
            # flip masks (reverse the column axis of each mask)
            for key in results.get('mask_fields', []):
                results[key] = [mask[:, ::-1] for mask in results[key]]
            for key in results.get('ref_mask_fields', []):
                results[key] = [mask[:, ::-1] for mask in results[key]]
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(flip_ratio={})'.format(
            self.flip_ratio)


@PIPELINES.register_module()
class PadWithRef(object):
    """Pad the image, the optional reference image and their masks.

    There are two padding modes: (1) pad to a fixed size and (2) pad to the
    minimum size that is divisible by some number.

    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_val (float, optional): Padding value, 0 by default. Used for
            images and masks in both padding modes.
    """

    def __init__(self, size=None, size_divisor=None, pad_val=0):
        self.size = size
        self.size_divisor = size_divisor
        self.pad_val = pad_val
        # only one of size and size_divisor should be valid
        assert size is not None or size_divisor is not None
        assert size is None or size_divisor is None

    def _pad_img(self, results):
        """Pad ``ref_img`` (when present) and ``img``.

        Writes ``'pad_shape'``, ``'pad_fixed_size'`` and
        ``'pad_size_divisor'`` into ``results``; ``'pad_shape'`` is the shape
        of the padded ``img``, which is processed last.
        """
        els = ['ref_img', 'img'] if 'ref_img' in results else ['img']
        for el in els:
            if self.size is not None:
                # Pad each entry from its own source; ``shape`` is passed by
                # keyword, matching the call style used by PadFutureMMDet.
                padded_img = mmcv.impad(
                    results[el], shape=self.size, pad_val=self.pad_val)
            elif self.size_divisor is not None:
                padded_img = mmcv.impad_to_multiple(
                    results[el], self.size_divisor, pad_val=self.pad_val)
            results[el] = padded_img
        results['pad_shape'] = padded_img.shape
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def _pad_masks(self, results):
        """Pad every mask to ``results['pad_shape'][:2]`` and stack the
        masks of each field into one array with the masks on axis 0."""
        els = ['ref_mask_fields', 'mask_fields'] if 'ref_mask_fields' in results else ['mask_fields']
        pad_shape = tuple(results['pad_shape'][:2])
        for el in els:
            for key in results.get(el, []):
                padded_masks = [
                    mmcv.impad(mask, shape=pad_shape, pad_val=self.pad_val)
                    for mask in results[key]
                ]
                if padded_masks:
                    results[key] = np.stack(padded_masks, axis=0)
                else:
                    # np.stack rejects an empty list; keep an empty array of
                    # the padded spatial size instead.
                    # NOTE(review): uint8 is assumed for empty mask arrays --
                    # confirm against the mask dtype used by the loader.
                    results[key] = np.empty((0, ) + pad_shape, dtype=np.uint8)

    def __call__(self, results):
        """Pad images and masks in ``results``.

        Args:
            results (dict): Result dict from the loading pipeline.

        Returns:
            dict: The same dict, updated in place.
        """
        self._pad_img(results)
        self._pad_masks(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(size={}, size_divisor={}, pad_val={})'.format(
            self.size, self.size_divisor, self.pad_val)
        return repr_str


@PIPELINES.register_module()
class NormalizeWithRef(object):
    """Normalize the image and, when present, the reference image.

    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB,
            default is true.
    """

    def __init__(self, mean, std, to_rgb=True):
        # Statistics are stored as float32 arrays.
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_rgb = to_rgb

    def __call__(self, results):
        """Normalize ``results['img']`` and ``results['ref_img']`` (if any).

        Args:
            results (dict): Result dict from the loading pipeline.

        Returns:
            dict: The same dict with normalized images and the key
            ``'img_norm_cfg'`` describing the normalization used.
        """
        targets = ['img']
        if 'ref_img' in results:
            targets.append('ref_img')
        for name in targets:
            results[name] = mmcv.imnormalize(
                results[name], self.mean, self.std, self.to_rgb)
        results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
        return results

    def __repr__(self):
        cls_name = self.__class__.__name__
        return (f'{cls_name}(mean={self.mean}, std={self.std}, '
                f'to_rgb={self.to_rgb})')


@PIPELINES.register_module()
class RandomCropWithRef(object):
    """Randomly crop the image, the reference image, bboxes and masks with
    one shared crop window.

    Args:
        crop_size (tuple): Expected size after cropping, (h, w).
    """

    def __init__(self, crop_size):
        self.crop_size = crop_size

    def __call__(self, results):
        """Crop ``results`` with a randomly placed window.

        Args:
            results (dict): Result dict from the loading pipeline; ``'img'``
                must be a 3-D array.

        Returns:
            dict | None: The updated dict (``'img_shape'`` and
            ``'crop_coords'`` = ``[y1, y2, x1, x2]`` are written), or None
            when a filtered box array ends up with no valid box.
        """
        crop_h, crop_w = self.crop_size[0], self.crop_size[1]
        image = results['img']

        # Draw the window origin; the vertical offset is sampled first.
        top = np.random.randint(0, max(image.shape[0] - crop_h, 0) + 1)
        left = np.random.randint(0, max(image.shape[1] - crop_w, 0) + 1)
        bottom, right = top + crop_h, left + crop_w
        rows, cols = slice(top, bottom), slice(left, right)

        # Images: the reference image uses exactly the same window.
        cropped = image[rows, cols, :]
        results['img'] = cropped
        if 'ref_img' in results:
            results['ref_img'] = results['ref_img'][rows, cols, :]
        out_shape = cropped.shape
        results['img_shape'] = out_shape
        results['crop_coords'] = [top, bottom, left, right]

        # Boxes: shift into window coordinates, then clip to the window.
        shift = np.array([left, top, left, top], dtype=np.float32)
        max_x, max_y = out_shape[1] - 1, out_shape[0] - 1
        if 'ref_bbox_fields' in results:
            field_groups = ['ref_bbox_fields', 'bbox_fields']
        else:
            field_groups = ['bbox_fields']
        for group in field_groups:
            for key in results.get(group, []):
                shifted = results[key] - shift
                shifted[:, 0::2] = np.clip(shifted[:, 0::2], 0, max_x)
                shifted[:, 1::2] = np.clip(shifted[:, 1::2], 0, max_y)
                results[key] = shifted

        # Drop boxes that collapsed to zero width or height, together with
        # their labels, object ids and masks.
        if 'ref_bboxes' in results:
            box_keys = ['ref_bboxes', 'gt_bboxes']
        else:
            box_keys = ['gt_bboxes']
        for box_key in box_keys:
            if box_key not in results:
                continue
            boxes = results[box_key]
            keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
            # Nothing left after cropping: signal the caller to skip.
            if not np.any(keep):
                return None
            results[box_key] = boxes[keep, :]

            for suffix in ('_labels', '_obj_ids'):
                aligned_key = box_key.replace('_bboxes', suffix)
                if aligned_key in results:
                    results[aligned_key] = results[aligned_key][keep]

            mask_key = box_key.replace('_bboxes', '_masks')
            if mask_key in results:
                results[mask_key] = [
                    results[mask_key][idx][rows, cols]
                    for idx in np.where(keep)[0]
                ]

        return results

    def __repr__(self):
        return f'{self.__class__.__name__}(crop_size={self.crop_size})'


@PIPELINES.register_module()
class PadFutureMMDet:
    """Pad images, masks and semantic segmentation maps.

    Three modes are supported: pad to a fixed ``size``, pad to the smallest
    size divisible by ``size_divisor``, or pad to a square whose side is the
    longer image edge (``pad_to_square``).

    The keys "pad_shape", "pad_fixed_size" and "pad_size_divisor" are added
    to the result dict.

    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_to_square (bool): Whether to pad the image into a square.
            Default: False.
        pad_val (dict, optional): A dict for padding value, the default
            value is `dict(img=0, masks=0, seg=255)`. A plain number is
            still accepted but deprecated; it is used for images and masks
            while the segmentation value stays 255.
    """

    def __init__(self,
                 size=None,
                 size_divisor=None,
                 pad_to_square=False,
                 pad_val=dict(img=0, masks=0, seg=255)):
        if isinstance(pad_val, (float, int)):
            # Legacy scalar form: expand to the dict form and warn.
            warnings.warn(
                'pad_val of float type is deprecated now, '
                f'please use pad_val=dict(img={pad_val}, '
                f'masks={pad_val}, seg=255) instead.', DeprecationWarning)
            pad_val = dict(img=pad_val, masks=pad_val, seg=255)
        assert isinstance(pad_val, dict)

        if pad_to_square:
            assert size is None and size_divisor is None, \
                'The size and size_divisor must be None ' \
                'when pad2square is True'
        else:
            assert size is not None or size_divisor is not None, \
                'only one of size and size_divisor should be valid'
            assert size is None or size_divisor is None

        self.size = size
        self.size_divisor = size_divisor
        self.pad_val = pad_val
        self.pad_to_square = pad_to_square

    def _pad_img(self, results):
        """Pad every image field and record the padding metadata."""
        fill = self.pad_val.get('img', 0)
        for key in results.get('img_fields', ['img']):
            source = results[key]
            if self.pad_to_square:
                # The square side follows the current image; note that this
                # overwrites ``self.size`` on the instance.
                side = max(source.shape[:2])
                self.size = (side, side)
            if self.size is not None:
                padded_img = mmcv.impad(
                    source, shape=self.size, pad_val=fill)
            elif self.size_divisor is not None:
                padded_img = mmcv.impad_to_multiple(
                    source, self.size_divisor, pad_val=fill)
            results[key] = padded_img
        results['pad_shape'] = padded_img.shape
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def _pad_masks(self, results):
        """Pad every mask field to ``results['pad_shape'][:2]`` through the
        mask container's own ``pad`` method."""
        target = results['pad_shape'][:2]
        fill = self.pad_val.get('masks', 0)
        for key in results.get('mask_fields', []):
            results[key] = results[key].pad(target, pad_val=fill)

    def _pad_seg(self, results):
        """Pad every semantic segmentation map to
        ``results['pad_shape'][:2]``."""
        fill = self.pad_val.get('seg', 255)
        for key in results.get('seg_fields', []):
            results[key] = mmcv.impad(
                results[key], shape=results['pad_shape'][:2], pad_val=fill)

    def __call__(self, results):
        """Pad images, masks and semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Updated result dict.
        """
        for step in (self._pad_img, self._pad_masks, self._pad_seg):
            step(results)
        return results

    def __repr__(self):
        parts = [
            f'size={self.size}',
            f'size_divisor={self.size_divisor}',
            f'pad_to_square={self.pad_to_square}',
            f'pad_val={self.pad_val}',
        ]
        return self.__class__.__name__ + '(' + ', '.join(parts) + ')'


@PIPELINES.register_module()
class KNetInsAdapter:
    """Shift ``gt_labels`` down by a fixed number of stuff classes.

    Intended to turn Cityscapes-style instance class ids (which start after
    the stuff classes, i.e. at 11 by default) into 0-based ids.

    Args:
        stuff_nums (int): Offset subtracted from every label. Default 11.
    """

    def __init__(self, stuff_nums=11):
        self.stuff_nums = stuff_nums

    def __call__(self, results):
        """Subtract ``stuff_nums`` from ``results['gt_labels']``.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: The same dict with the labels shifted (in place for
            array-like labels).
        """
        labels = results['gt_labels']
        labels -= self.stuff_nums
        results['gt_labels'] = labels
        return results


@PIPELINES.register_module()
class KNetInsAdapterCherryPick:
    """Pipeline step that remaps a hand-picked set of class ids, then
    shifts all labels down by a fixed offset.

    The i-th id in ``cherry`` is first lowered by ``i``; afterwards
    ``stuff_nums`` is subtracted from every label. With the defaults
    (``cherry=(11, 13)``, ``stuff_nums=11``) label 11 becomes 0 and
    label 13 becomes 1.

    Args:
        stuff_nums (int): Value subtracted from every label. Default: 11.
        cherry (tuple[int]): Class ids that receive the extra per-position
            shift, processed in the given order. Default: (11, 13).
    """

    def __init__(self, stuff_nums=11, cherry=(11, 13)):
        self.cherry = cherry
        self.stuff_nums = stuff_nums

    def __call__(self, results):
        """Remap ``results['gt_labels']`` as described on the class.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Updated result dict.
        """
        labels = results['gt_labels']
        # Position i in ``cherry`` is lowered by i (the first entry by 0).
        for bias, cherry_id in enumerate(self.cherry):
            labels[labels == cherry_id] -= bias
        labels -= self.stuff_nums
        results['gt_labels'] = labels
        return results


================================================
FILE: mmtrack/transform.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
from mmdet.core import bbox2result

def outs2results(bboxes=None,
                 labels=None,
                 masks=None,
                 ids=None,
                 num_classes=None,
                 **kwargs):
    """Convert tracking/detection results to a list of numpy arrays.
    Args:
        bboxes (torch.Tensor | np.ndarray, optional): shape (n, 5)
        labels (torch.Tensor | np.ndarray): shape (n, ). Required.
        masks (torch.Tensor | np.ndarray, optional): shape (n, h, w)
        ids (torch.Tensor | np.ndarray, optional): shape (n, ). Entries
            that are not greater than -1 are treated as invalid and the
            matching bboxes/labels/masks are dropped.
        num_classes (int): class number, not including background class.
            Required.
    Returns:
        dict[str : list(ndarray) | list[list[np.ndarray]]]: tracking/detection
        results of each class. It may contain keys as belows:
        - bbox_results (list[np.ndarray]): Each list denotes bboxes of one
            category. Present only when ``bboxes`` is given. When ``ids`` is
            given, every row is prefixed with its id (6 columns).
        - mask_results (list[list[np.ndarray]]): Each outer list denotes masks
            of one category. Each inner list denotes one mask belonging to
            the category. Each mask has shape (h, w). Present only when
            ``masks`` is given.
    Raises:
        AssertionError: If ``labels`` or ``num_classes`` is None.
    """
    assert labels is not None
    assert num_classes is not None

    results = dict()

    if ids is not None:
        # Keep only the entries that carry a valid (non-negative) id.
        valid_inds = ids > -1
        ids = ids[valid_inds]
        labels = labels[valid_inds]

    if bboxes is not None:
        if ids is not None:
            bboxes = bboxes[valid_inds]
            if bboxes.shape[0] == 0:
                bbox_results = [
                    np.zeros((0, 6), dtype=np.float32)
                    for _ in range(num_classes)
                ]
            else:
                if isinstance(bboxes, torch.Tensor):
                    bboxes = bboxes.cpu().numpy()
                    labels = labels.cpu().numpy()
                    ids = ids.cpu().numpy()
                # Prepend the id as an extra first column of each box row.
                bbox_results = [
                    np.concatenate(
                        (ids[labels == i, None], bboxes[labels == i, :]),
                        axis=1) for i in range(num_classes)
                ]
        else:
            bbox_results = bbox2result(bboxes, labels, num_classes)
        results['bbox_results'] = bbox_results

    if masks is not None:
        if ids is not None:
            masks = masks[valid_inds]
        if isinstance(masks, torch.Tensor):
            masks = masks.detach().cpu().numpy()
        masks_results = [[] for _ in range(num_classes)]
        # Count the masks themselves: ``bboxes`` is optional, so relying on
        # ``bboxes.shape`` here would fail for mask-only inputs.
        for i in range(masks.shape[0]):
            masks_results[labels[i]].append(masks[i])
        results['mask_results'] = masks_results

    return results

================================================
FILE: scripts/kitti_step_prepare.py
================================================
import os
import shutil

# Sequence ids that make up each split; they are formatted as zero-padded
# directory names when the sequences are read.
train_seqs = [0, 1, 3, 4, 5, 9, 11, 12, 15, 17, 19, 20]
val_seqs = [2, 6, 7, 8, 10, 13, 14, 16, 18]
test_seqs = list(range(29))

# Where you downloaded the KITTI STEP dataset (training and testing parts)
# and where the re-organised files are written. Edit these paths to match
# your machine.
data_root = os.path.expanduser('/data/data1/datasets/STEP/kitti/training/')
data_root_test = os.path.expanduser('/data/data1/datasets/STEP/kitti/testing/')
data_out = os.path.expanduser('/data/data1/datasets/STEP/kitti_out')


def build_panoptic(seq_id, input_dir, output_dir):
    """Move the panoptic maps of one sequence into a flat output directory.

    Files are taken from ``<input_dir>/<seq_id as 4 digits>/`` and moved
    (not copied) to ``<output_dir>/<seq_id as 6 digits>_<stem>_panoptic.png``,
    where ``<stem>`` is the part of the file name before the first dot.
    ``output_dir`` is created when it does not exist.

    Args:
        seq_id (int): Sequence id.
        input_dir (str): Directory holding one sub-directory per sequence.
        output_dir (str): Destination directory.
    """
    seq_dir = os.path.join(input_dir, '{:04d}'.format(seq_id))
    print("Preparing seq id : {}".format(seq_id))
    frame_names = sorted(str(entry) for entry in os.listdir(seq_dir))

    print("Dst dir is {}".format(output_dir))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for frame_name in frame_names:
        frame_stem = frame_name.split('.')[0]
        dst_path = os.path.join(
            output_dir, '{:06d}_{}_panoptic.png'.format(seq_id, frame_stem))
        print(dst_path)
        shutil.move(os.path.join(seq_dir, frame_name), dst_path)


def build_img(seq_id, input_dir, output_dir):
    """Move the images of one sequence into a flat output directory.

    Files are taken from ``<input_dir>/<seq_id as 4 digits>/`` and moved
    (not copied) to
    ``<output_dir>/<seq_id as 6 digits>_<stem>_leftImg8bit.png``, where
    ``<stem>`` is the part of the file name before the first dot.
    ``output_dir`` is created when it does not exist.

    Args:
        seq_id (int): Sequence id.
        input_dir (str): Directory holding one sub-directory per sequence.
        output_dir (str): Destination directory.
    """
    seq_dir = os.path.join(input_dir, '{:04d}'.format(seq_id))
    print("Preparing seq id : {}".format(seq_id))
    frame_names = sorted(str(entry) for entry in os.listdir(seq_dir))

    print("Dst dir is {}".format(output_dir))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for frame_name in frame_names:
        frame_stem = frame_name.split('.')[0]
        dst_path = os.path.join(
            output_dir, '{:06d}_{}_leftImg8bit.png'.format(seq_id, frame_stem))
        print(dst_path)
        shutil.move(os.path.join(seq_dir, frame_name), dst_path)


if __name__ == '__main__':
    # Train/val: all panoptic maps are moved first, then all images, each
    # pass going through train before val.
    labelled_splits = (('train', train_seqs), ('val', val_seqs))
    for builder, modality in ((build_panoptic, 'panoptic'),
                              (build_img, 'image_02')):
        for split, seqs in labelled_splits:
            for seq_id in seqs:
                builder(seq_id,
                        os.path.join(data_root, modality),
                        os.path.join(data_out, 'video_sequence', split))

    # Only images are prepared for the test split.
    for seq_id in test_seqs:
        build_img(seq_id,
                  os.path.join(data_root_test, 'image_02'),
                  os.path.join(data_out, 'video_sequence', 'test'))

================================================
FILE: scripts/visualizer.py
================================================
import hashlib
import numpy as np
import cv2

# Class table used for colouring category maps. Each entry is
# ``(name, category id, colour triple)``; the id is compared against the
# category map and the triple is written into the output image.
# NOTE(review): the consumer is named ``cityscapes_cat2rgb``, so the triples
# are presumably in RGB order - confirm before feeding them to OpenCV.
city_labels = [
    ('road', 0, (128, 64, 128)),
    ('sidewalk', 1, (244, 35, 232)),
    ('building', 2, (70, 70, 70)),
    ('wall', 3, (102, 102, 156)),
    ('fence', 4, (190, 153, 153)),
    ('pole', 5, (153, 153, 153)),
    ('traffic light', 6, (250, 170, 30)),
    ('traffic sign', 7, (220, 220, 0)),
    ('vegetation', 8, (107, 142, 35)),
    ('terrain', 9, (152, 251, 152)),
    ('sky', 10, (70, 130, 180)),
    ('person', 11, (220, 20, 60)),
    ('rider', 12, (255, 0, 0)),
    ('car', 13, (0, 0, 142)),
    ('truck', 14, (0, 0, 70)),
    ('bus', 15, (0, 60, 100)),
    ('train', 16, (0, 80, 100)),
    ('motorcycle', 17, (0, 0, 230)),
    ('bicycle', 18, (119, 11, 32)),
    # Both 19 and 255 are drawn as black "void".
    ('void', 19, (0, 0, 0)),
    ('void', 255, (0, 0, 0))
]


def sha256num(num):
    """Map ``num`` to a deterministic integer in ``[0, 16 ** 6)``.

    The result is the last six hexadecimal digits of the SHA-256 digest of
    ``str(num)`` (UTF-8 encoded), parsed as an integer.
    """
    digest = hashlib.sha256(str(num).encode('utf-8')).hexdigest()
    return int(digest[-6:], 16)


def id2rgb(id_map):
    """Split an id (or an array of ids) into three base-256 digits.

    The least significant digit comes first.

    Args:
        id_map (int | np.ndarray): A single id or an array of ids.

    Returns:
        np.ndarray | list: For an array input, a ``uint8`` array with a
        trailing axis of size 3 (the input is left untouched); otherwise a
        list of three digits.
    """
    if isinstance(id_map, np.ndarray):
        remaining = id_map.copy()
        rgb_map = np.zeros((*id_map.shape, 3), dtype=np.uint8)
        for channel in range(3):
            rgb_map[..., channel] = remaining % 256
            remaining //= 256
        return rgb_map

    digits = []
    for _ in range(3):
        id_map, digit = divmod(id_map, 256)
        digits.append(digit)
    return digits


def cityscapes_cat2rgb(cat_map):
    """Colour a category-id map using the ``city_labels`` table.

    Args:
        cat_map (np.ndarray): Array of category ids.

    Returns:
        np.ndarray: ``uint8`` array with a trailing axis of size 3. Pixels
        whose id is not listed in ``city_labels`` stay zero.
    """
    blank = np.zeros_like(cat_map, dtype=np.uint8)
    color_map = np.repeat(blank[..., None], 3, axis=-1)
    for _name, class_id, color in city_labels:
        hits = cat_map == class_id
        if hits.any():
            color_map[hits] = color
    return color_map


def trackmap2rgb(track_map):
    """Colour a track-id map with one hash-derived colour per id.

    Id 0 is left black; every other id present in the map is painted with
    ``id2rgb(sha256num(id))``.

    Args:
        track_map (np.ndarray): Array of track ids.

    Returns:
        np.ndarray: ``uint8`` array with a trailing axis of size 3.
    """
    blank = np.zeros_like(track_map, dtype=np.uint8)
    color_map = np.repeat(blank[..., None], 3, axis=-1)
    for track_id in np.unique(track_map):
        if track_id != 0:
            color_map[track_map == track_id] = id2rgb(sha256num(track_id))
    return color_map


def draw_bbox_on_img(vis_img, bboxes):
    """Draw one 1-pixel rectangle per row of ``bboxes`` onto ``vis_img``.

    The first four values of each row are truncated to ints and used as the
    two opposite corners of the rectangle, drawn with colour ``(0, 0, 255)``.

    Args:
        vis_img: Image passed to ``cv2.rectangle``; it is drawn on in place.
        bboxes: Array-like with a ``shape`` attribute; rows are boxes.

    Returns:
        The same ``vis_img`` object.
    """
    for row in range(bboxes.shape[0]):
        box = bboxes[row]
        corner_a = (int(box[0]), int(box[1]))
        corner_b = (int(box[2]), int(box[3]))
        cv2.rectangle(vis_img, corner_a, corner_b, (0, 0, 255), thickness=1)
    return vis_img


================================================
FILE: swin/DetectRS.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init,
                      kaiming_init)
from mmcv.runner import Sequential, load_checkpoint
from torch.nn.modules.batchnorm import _BatchNorm

from mmdet.utils import get_root_logger
from mmdet.models.builder import BACKBONES
from mmdet.models.backbones.resnet import BasicBlock
from mmdet.models.backbones.resnet import Bottleneck as _Bottleneck
from mmdet.models.backbones.resnet import ResNet


class Bottleneck(_Bottleneck):
    r"""Bottleneck for the ResNet backbone in `DetectoRS
    <https://arxiv.org/pdf/2006.02334.pdf>`_.
    This bottleneck allows the users to specify whether to use
    SAC (Switchable Atrous Convolution) and RFP (Recursive Feature Pyramid).
    Args:
        inplanes (int): The number of input channels.
        planes (int): The number of output channels before expansion.
        rfp_inplanes (int, optional): The number of channels from RFP.
            Default: None. If specified, an additional conv layer will be
            added for ``rfp_feat``. Otherwise, the structure is the same as
            base class.
        sac (dict, optional): Dictionary to construct SAC. Default: None.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: None
        **kwargs: Forwarded unchanged to the base ``Bottleneck``.
    """
    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 rfp_inplanes=None,
                 sac=None,
                 init_cfg=None,
                 **kwargs):
        super(Bottleneck, self).__init__(
            inplanes, planes, init_cfg=init_cfg, **kwargs)

        assert sac is None or isinstance(sac, dict)
        self.sac = sac
        self.with_sac = sac is not None
        if self.with_sac:
            # Replace the ``conv2`` created by the base class with a conv
            # built from the ``sac`` config, keeping the same stride,
            # padding and dilation.
            self.conv2 = build_conv_layer(
                self.sac,
                planes,
                planes,
                kernel_size=3,
                stride=self.conv2_stride,
                padding=self.dilation,
                dilation=self.dilation,
                bias=False)

        self.rfp_inplanes = rfp_inplanes
        if self.rfp_inplanes:
            # 1x1 conv that maps the RFP feature to the block's output width
            # so it can be added to the residual output in ``rfp_forward``.
            self.rfp_conv = build_conv_layer(
                None,
                self.rfp_inplanes,
                planes * self.expansion,
                1,
                stride=1,
                bias=True)
            # TODO : Is this a bug ?
            # (Flagged by the original author.) When the caller passes no
            # ``init_cfg``, the block's init config is replaced by one that
            # constant-initialises ``rfp_conv`` to 0.
            if init_cfg is None:
                self.init_cfg = dict(
                    type='Constant', val=0, override=dict(name='rfp_conv'))

    def rfp_forward(self, x, rfp_feat):
        """The forward function that also takes the RFP features as input.

        Args:
            x: Input of the block.
            rfp_feat: RFP feature; only used when ``rfp_inplanes`` is set.

        Returns:
            Output of the block after the final ReLU.
        """

        def _inner_forward(x):
            # Residual branch: conv1 -> conv2 -> conv3 with optional plugins
            # after each stage, then the skip connection is added.
            identity = x

            out = self.conv1(x)
            out = self.norm1(out)
            out = self.relu(out)

            if self.with_plugins:
                out = self.forward_plugin(out, self.after_conv1_plugin_names)

            out = self.conv2(out)
            out = self.norm2(out)
            out = self.relu(out)

            if self.with_plugins:
                out = self.forward_plugin(out, self.after_conv2_plugin_names)

            out = self.conv3(out)
            out = self.norm3(out)

            if self.with_plugins:
                out = self.forward_plugin(out, self.after_conv3_plugin_names)

            if self.downsample is not None:
                identity = self.downsample(x)

            out += identity

            return out

        # Use gradient checkpointing only when enabled and the input takes
        # part in autograd.
        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)

        # Add the projected RFP feature before the final activation.
        if self.rfp_inplanes:
            rfp_feat = self.rfp_conv(rfp_feat)
            out = out + rfp_feat

        out = self.relu(out)

        return out


class ResLayer(Sequential):
    """ResLayer to build ResNet style backbone for RFP in DetectoRS.
    The difference between this module and base class is that we pass
    ``rfp_inplanes`` to the first block.
    Args:
        block (nn.Module): block used to build ResLayer.
        inplanes (int): inplanes of block.
        planes (int): planes of block.
        num_blocks (int): number of blocks.
        stride (int): stride of the first block. Default: 1
        avg_down (bool): Use AvgPool instead of stride conv when
            downsampling in the bottleneck. Default: False
        conv_cfg (dict): dictionary to construct and config conv layer.
            Default: None
        norm_cfg (dict): dictionary to construct and config norm layer.
            Default: dict(type='BN')
        downsample_first (bool): Downsample at the first block or last block.
            False for Hourglass, True for ResNet. Default: True. Only
            ``True`` is accepted here; anything else fails an assertion.
        rfp_inplanes (int, optional): The number of channels from RFP.
            Default: None. If specified, an additional conv layer will be
            added for ``rfp_feat``. Otherwise, the structure is the same as
            base class.
        **kwargs: Forwarded unchanged to every block.
    """

    def __init__(self,
                 block,
                 inplanes,
                 planes,
                 num_blocks,
                 stride=1,
                 avg_down=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 downsample_first=True,
                 rfp_inplanes=None,
                 **kwargs):
        self.block = block
        assert downsample_first, f'downsample_first={downsample_first} is ' \
                                 'not supported in DetectoRS'

        # A projection on the skip path is needed whenever the first block
        # changes the spatial size or the channel count.
        downsample = None
        if stride != 1 or inplanes != planes * block.expansion:
            downsample = []
            conv_stride = stride
            if avg_down and stride != 1:
                # Downsample with average pooling and keep the 1x1 conv at
                # stride 1.
                conv_stride = 1
                downsample.append(
                    nn.AvgPool2d(
                        kernel_size=stride,
                        stride=stride,
                        ceil_mode=True,
                        count_include_pad=False))
            downsample.extend([
                build_conv_layer(
                    conv_cfg,
                    inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=conv_stride,
                    bias=False),
                build_norm_layer(norm_cfg, planes * block.expansion)[1]
            ])
            downsample = nn.Sequential(*downsample)

        layers = []
        # Only the first block receives the stride, the downsample branch
        # and ``rfp_inplanes``.
        layers.append(
            block(
                inplanes=inplanes,
                planes=planes,
                stride=stride,
                downsample=downsample,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                rfp_inplanes=rfp_inplanes,
                **kwargs))
        inplanes = planes * block.expansion
        # Remaining blocks keep the resolution and take the expanded width.
        for _ in range(1, num_blocks):
            layers.append(
                block(
                    inplanes=inplanes,
                    planes=planes,
                    stride=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    **kwargs))

        super(ResLayer, self).__init__(*layers)


@BACKBONES.register_module()
class DetectoRS_ResNet_Custom(ResNet):
    """ResNet backbone for DetectoRS.
    Args:
        sac (dict, optional): Dictionary to construct SAC (Switchable Atrous
            Convolution). Default: None.
        stage_with_sac (list): Which stage to use sac. Default: (False, False,
            False, False).
        rfp_inplanes (int, optional): The number of channels from RFP.
            Default: None. If specified, an additional conv layer will be
            added for ``rfp_feat``. Otherwise, the structure is the same as
            base class.
        output_img (bool): If ``True``, the input image will be inserted into
            the starting position of output. Default: False.
        pretrained (None): Kept for signature compatibility only; any value
            other than ``None`` fails an assertion.
        init_cfg (dict or list[dict], optional): Initialization config dict,
            forwarded to the base class. Default: None.
        **kwargs: Forwarded unchanged to the base ``ResNet``.
    """

    arch_settings = {
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3))
    }

    def __init__(self,
                 sac=None,
                 stage_with_sac=(False, False, False, False),
                 rfp_inplanes=None,
                 output_img=False,
                 pretrained=None,
                 init_cfg=None,
                 **kwargs):
        assert not (init_cfg and pretrained), \
            'init_cfg and pretrained cannot be specified at the same time'
        assert pretrained is None, "pretrained is not supported anymore"
        # ``init_weights`` reads ``self.pretrained``; store it here so that
        # branch cannot fail with an AttributeError when the base class does
        # not define the attribute itself.
        self.pretrained = pretrained
        self.sac = sac
        self.stage_with_sac = stage_with_sac
        self.rfp_inplanes = rfp_inplanes
        self.output_img = output_img
        super().__init__(init_cfg=init_cfg, **kwargs)

        # Rebuild every residual stage with the DetectoRS ``ResLayer`` so the
        # SAC / RFP options reach the blocks; ``add_module`` below replaces
        # the stages registered by the base constructor under the same names.
        self.inplanes = self.stem_channels
        self.res_layers = []
        for i, num_blocks in enumerate(self.stage_blocks):
            stride = self.strides[i]
            dilation = self.dilations[i]
            dcn = self.dcn if self.stage_with_dcn[i] else None
            sac = self.sac if self.stage_with_sac[i] else None
            if self.plugins is not None:
                stage_plugins = self.make_stage_plugins(self.plugins, i)
            else:
                stage_plugins = None
            planes = self.base_channels * 2 ** i
            res_layer = self.make_res_layer(
                block=self.block,
                inplanes=self.inplanes,
                planes=planes,
                num_blocks=num_blocks,
                stride=stride,
                dilation=dilation,
                style=self.style,
                avg_down=self.avg_down,
                with_cp=self.with_cp,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                dcn=dcn,
                sac=sac,
                # The first stage never receives RFP features.
                rfp_inplanes=rfp_inplanes if i > 0 else None,
                plugins=stage_plugins)
            self.inplanes = planes * self.block.expansion
            layer_name = f'layer{i + 1}'
            self.add_module(layer_name, res_layer)
            self.res_layers.append(layer_name)

        self._freeze_stages()

    # In order to be properly initialized by RFP
    def init_weights(self):
        """Initialise the weights.

        With an ``init_cfg`` the initialisation is delegated past ``ResNet``
        to the next class in the MRO; otherwise convs get Kaiming init, norm
        layers are set to 1, DCN offsets to 0 and, when
        ``zero_init_residual`` is set, the last norm of each block to 0.

        Raises:
            TypeError: If there is no ``init_cfg`` and ``self.pretrained`` is
                not ``None``.
        """
        # Calling this method will cause parameter initialization exception
        # super(DetectoRS_ResNet, self).init_weights()
        if self.init_cfg is not None:
            super(ResNet, self).init_weights()
        elif self.pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
                    constant_init(m, 1)

            if self.dcn is not None:
                for m in self.modules():
                    if isinstance(m, Bottleneck) and hasattr(
                            m.conv2, 'conv_offset'):
                        constant_init(m.conv2.conv_offset, 0)

            if self.zero_init_residual:
                for m in self.modules():
                    if isinstance(m, Bottleneck):
                        constant_init(m.norm3, 0)
                    elif isinstance(m, BasicBlock):
                        constant_init(m.norm2, 0)
        else:
            raise TypeError('pretrained must be a str or None')

    def make_res_layer(self, **kwargs):
        """Pack all blocks in a stage into a ``ResLayer`` for DetectoRS."""
        return ResLayer(**kwargs)

    def forward(self, x):
        """Forward function.

        Returns the base class outputs as a tuple, with the input ``x``
        inserted at position 0 when ``output_img`` is set.
        """
        outs = list(super().forward(x))
        if self.output_img:
            outs.insert(0, x)
        return tuple(outs)

    def rfp_forward(self, x, rfp_feats):
        """Forward function for RFP.

        Args:
            x: Input image tensor.
            rfp_feats: Sequence indexed by stage; entry ``i`` is fed to
                stage ``i`` for every ``i > 0`` (entry 0 is not used).

        Returns:
            tuple: Outputs of the stages listed in ``out_indices``.
        """
        if self.deep_stem:
            x = self.stem(x)
        else:
            x = self.conv1(x)
            x = self.norm1(x)
            x = self.relu(x)
        x = self.maxpool(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            rfp_feat = rfp_feats[i] if i > 0 else None
            for layer in res_layer:
                x = layer.rfp_forward(x, rfp_feat)
            if i in self.out_indices:
                outs.append(x)
        return tuple(outs)


================================================
FILE: swin/ckpt_convert.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.

# This script consists of several conversion functions that adapt
# the weights of a model from its original repo so that they can be
# used as pre-trained weights.

from collections import OrderedDict

import torch


def pvt_convert(ckpt):
    """Rename (and where needed reshape) the entries of a PVT state dict.

    Keys starting with ``head``, ``norm.`` or ``cls_token`` are dropped.
    ``pos_embedN`` / ``patch_embedN`` / ``blockN`` / ``normN`` keys are moved
    under ``layers.{N - 1}``; the ``q`` and ``kv`` attention weights of a
    block are concatenated into a single ``in_proj_`` entry, and ``fc1`` /
    ``fc2`` weights get two trailing singleton dimensions.

    Args:
        ckpt (dict): Mapping from parameter name to tensor.

    Returns:
        OrderedDict: The converted mapping, in the input's iteration order.
    """
    converted = OrderedDict()
    # Process the concat between q linear weights and kv linear weights
    has_abs_pos_embed = any(key.startswith('pos_embed') for key in ckpt)
    has_conv_ffn = any('dwconv' in key for key in ckpt)

    for key, value in ckpt.items():
        if key.startswith(('head', 'norm.', 'cls_token')):
            continue

        new_key, new_value = key, value
        if key.startswith('pos_embed'):
            stage = int(key.replace('pos_embed', ''))
            new_key = key.replace(f'pos_embed{stage}',
                                  f'layers.{stage - 1}.1.0.pos_embed')
            if stage == 4 and value.size(1) == 50:  # 1 (cls token) + 7 * 7
                new_value = value[:, 1:, :]  # remove cls token
        elif key.startswith('patch_embed'):
            stage = int(key.split('.')[0].replace('patch_embed', ''))
            new_key = key.replace(f'patch_embed{stage}',
                                  f'layers.{stage - 1}.0')
            if 'proj.' in new_key:
                new_key = new_key.replace('proj.', 'projection.')
        elif key.startswith('block'):
            parts = key.split('.')
            stage = int(parts[0].replace('block', ''))
            layer = int(parts[1])
            # An absolute position embedding takes slot 0 of the stage, so
            # the blocks move up by one.
            shifted_layer = layer + has_abs_pos_embed
            new_key = key.replace(f'block{stage}.{layer}',
                                  f'layers.{stage - 1}.1.{shifted_layer}')
            if 'attn.q.' in new_key:
                kv_key = key.replace('q.', 'kv.')
                new_key = new_key.replace('q.', 'attn.in_proj_')
                new_value = torch.cat([value, ckpt[kv_key]], dim=0)
            elif 'attn.kv.' in new_key:
                # Already merged into the matching ``q`` entry.
                continue
            elif 'attn.proj.' in new_key:
                new_key = new_key.replace('proj.', 'attn.out_proj.')
            elif 'attn.sr.' in new_key:
                # ``sr`` entries keep their name.
                pass
            elif 'mlp.' in new_key:
                new_key = new_key.replace('mlp.', 'ffn.layers.')
                if 'fc1.weight' in new_key or 'fc2.weight' in new_key:
                    new_value = value.reshape((*value.shape, 1, 1))
                new_key = new_key.replace('fc1.', '0.')
                new_key = new_key.replace('dwconv.dwconv.', '1.')
                new_key = new_key.replace(
                    'fc2.', '4.' if has_conv_ffn else '3.')
        elif key.startswith('norm'):
            stage = int(key[4])
            new_key = key.replace(f'norm{stage}', f'layers.{stage - 1}.2')

        converted[new_key] = new_value

    return converted


def swin_converter(ckpt):
    """Rename (and where needed reorder) the entries of a Swin state dict.

    Keys starting with ``head`` are dropped. Keys starting with ``layers``
    are moved under ``stages`` with their attention / MLP sub-modules
    renamed, and the ``downsample`` reduction and norm parameters have their
    input channels permuted. ``proj`` inside ``patch_embed`` keys becomes
    ``projection``. Everything else is copied unchanged.

    Args:
        ckpt (dict): Mapping from parameter name to tensor.

    Returns:
        OrderedDict: The converted mapping, in the input's iteration order.
    """

    def _reorder_reduction(weight):
        # View the input channels as 4 groups, swap the two middle groups
        # and interleave the groups.
        out_channel, in_channel = weight.shape
        grouped = weight.reshape(out_channel, 4, in_channel // 4)
        swapped = grouped[:, [0, 2, 1, 3], :]
        return swapped.transpose(1, 2).reshape(out_channel, in_channel)

    def _reorder_norm(param):
        # Same permutation as above for a 1-D parameter.
        in_channel = param.shape[0]
        grouped = param.reshape(4, in_channel // 4)
        swapped = grouped[[0, 2, 1, 3], :]
        return swapped.transpose(0, 1).reshape(in_channel)

    converted = OrderedDict()
    for key, value in ckpt.items():
        if key.startswith('head'):
            continue

        new_key, new_value = key, value
        if key.startswith('layers'):
            if 'attn.' in key:
                new_key = key.replace('attn.', 'attn.w_msa.')
            elif 'mlp.' in key:
                if 'mlp.fc1.' in key:
                    new_key = key.replace('mlp.fc1.', 'ffn.layers.0.0.')
                elif 'mlp.fc2.' in key:
                    new_key = key.replace('mlp.fc2.', 'ffn.layers.1.')
                else:
                    new_key = key.replace('mlp.', 'ffn.')
            elif 'downsample' in key:
                if 'reduction.' in key:
                    new_value = _reorder_reduction(value)
                elif 'norm.' in key:
                    new_value = _reorder_norm(value)
            # Only the leading ``layers`` prefix is renamed.
            new_key = new_key.replace('layers', 'stages', 1)
        elif key.startswith('patch_embed') and 'proj' in key:
            new_key = key.replace('proj', 'projection')

        converted[new_key] = new_value

    return converted


================================================
FILE: swin/mix_transformer.py
================================================
# ---------------------------------------------------------------
# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
#
# This work is licensed under the NVIDIA Source Code License
# ---------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial

from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.models.registry import register_model
from timm.models.vision_transformer import _cfg
from mmdet.models.builder import BACKBONES
from mmdet.utils import get_root_logger
from mmdet.models.backbones.resnet import ResNet
from mmcv.runner import load_checkpoint, BaseModule
import math


class Mlp(nn.Module):
    """Feed-forward block: linear -> ``DWConv`` -> activation -> linear,
    with dropout after the activation and after the second linear layer.

    Args:
        in_features (int): Input width.
        hidden_features (int, optional): Hidden width; falls back to
            ``in_features`` when falsy.
        out_features (int, optional): Output width; falls back to
            ``in_features`` when falsy.
        act_layer: Callable that builds the activation module.
            Default: ``nn.GELU``.
        drop (float): Dropout probability. Default: 0.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_dim = hidden_features or in_features
        out_dim = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.dwconv = DWConv(hidden_dim)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.drop = nn.Dropout(drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; meant to be used with ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """Apply the block; ``H`` and ``W`` are forwarded to ``DWConv``."""
        hidden = self.dwconv(self.fc1(x), H, W)
        hidden = self.drop(self.act(hidden))
        return self.drop(self.fc2(hidden))


class Attention(nn.Module):
    """Multi-head attention whose keys/values can be computed from a
    spatially reduced copy of the input.

    Args:
        dim (int): Channel width; must be divisible by ``num_heads``.
        num_heads (int): Number of attention heads. Default: 8.
        qkv_bias (bool): Whether the q and kv projections have a bias.
            Default: False.
        qk_scale (float, optional): Scale applied to the attention logits;
            falls back to ``head_dim ** -0.5`` when falsy.
        attn_drop (float): Dropout on the attention weights. Default: 0.
        proj_drop (float): Dropout after the output projection. Default: 0.
        sr_ratio (int): When greater than 1, the input is reduced by a
            strided conv (kernel and stride ``sr_ratio``) followed by a
            LayerNorm before keys and values are computed. Default: 1.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        # Keys and values share one projection of width 2 * dim.
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.sr_ratio = sr_ratio
        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.norm = nn.LayerNorm(dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; meant to be used with ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """Apply attention to ``x`` of shape (B, N, C).

        ``H`` and ``W`` are only used when ``sr_ratio > 1``, to reshape the
        tokens into a (B, C, H, W) map for the reduction conv.

        Returns:
            Tensor of shape (B, N, C).
        """
        B, N, C = x.shape
        # q: (B, num_heads, N, C // num_heads)
        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)

        if self.sr_ratio > 1:
            # Tokens -> feature map -> strided conv -> tokens again.
            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
            x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
            x_ = self.norm(x_)
            kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        else:
            kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        # kv: (2, B, num_heads, M, C // num_heads); index 0 is k, 1 is v.
        k, v = kv[0], kv[1]

        # attn: (B, num_heads, N, M), softmax over the key axis.
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        # Merge the heads back into the channel axis.
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class Block(nn.Module):
    """Transformer block: pre-norm attention and pre-norm MLP, each wrapped
    in a residual connection with optional stochastic depth.

    Args:
        dim (int): Channel width.
        num_heads (int): Number of attention heads.
        mlp_ratio (float): Hidden width of the MLP as a multiple of ``dim``.
            Default: 4.
        qkv_bias (bool): Forwarded to ``Attention``. Default: False.
        qk_scale (float, optional): Forwarded to ``Attention``.
        drop (float): Dropout used by the attention projection and the MLP.
            Default: 0.
        attn_drop (float): Dropout on the attention weights. Default: 0.
        drop_path (float): Stochastic-depth rate; ``nn.Identity`` is used
            when it is not greater than 0. Default: 0.
        act_layer: Callable that builds the MLP activation.
            Default: ``nn.GELU``.
        norm_layer: Callable that builds a norm layer from ``dim``.
            Default: ``nn.LayerNorm``.
        sr_ratio (int): Forwarded to ``Attention``. Default: 1.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            sr_ratio=sr_ratio)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; meant to be used with ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """Run the attention and MLP sub-layers; ``H`` and ``W`` are
        forwarded to both."""
        attn_out = self.attn(self.norm1(x), H, W)
        x = x + self.drop_path(attn_out)
        mlp_out = self.mlp(self.norm2(x), H, W)
        return x + self.drop_path(mlp_out)


class OverlapPatchEmbed(nn.Module):
    """Image to patch embedding via a strided, padded convolution.

    The projection is ``nn.Conv2d(in_chans, embed_dims, kernel_size=patch_size,
    stride=stride, padding=patch_size // 2)`` followed by ``nn.LayerNorm``.

    Args:
        img_size: Nominal input size (int or pair); only used to fill the
            ``H``, ``W`` and ``num_patches`` attributes.
        patch_size: Convolution kernel size (int or pair).
        stride: Convolution stride.
        in_chans: Number of input channels.
        embed_dims: Number of output channels of the projection.
    """

    def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dims=768):
        super().__init__()
        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)
        kernel_h, kernel_w = self.patch_size[0], self.patch_size[1]

        # Bookkeeping only: derived from img_size // patch_size, not from the
        # real convolution output (forward() reports the actual H and W).
        self.H = self.img_size[0] // kernel_h
        self.W = self.img_size[1] // kernel_w
        self.num_patches = self.H * self.W

        self.proj = nn.Conv2d(
            in_chans,
            embed_dims,
            kernel_size=self.patch_size,
            stride=stride,
            padding=(kernel_h // 2, kernel_w // 2))
        self.norm = nn.LayerNorm(embed_dims)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise ``m`` if it is a Linear, LayerNorm or Conv2d module."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            # Normal init scaled by the per-group fan-out of the kernel.
            fan_out = (m.kernel_size[0] * m.kernel_size[1] * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        """Project ``x`` and return ``(tokens, H, W)``.

        ``tokens`` is the normalised projection flattened over its spatial
        dimensions and transposed to ``(B, H * W, C)``; ``H`` and ``W`` are
        the spatial size of the convolution output.
        """
        projected = self.proj(x)
        H, W = projected.shape[2], projected.shape[3]
        tokens = self.norm(projected.flatten(2).transpose(1, 2))
        return tokens, H, W


class MixVisionTransformer(BaseModule):
    """Four-stage transformer backbone built from ``OverlapPatchEmbed`` and ``Block``.

    Every stage runs a patch embedding, a stack of blocks and a norm layer;
    the stage output is reshaped to ``(B, C, H, W)`` and collected, so
    ``forward`` returns a list with one feature map per stage. No
    classification head is created by the constructor.

    Args:
        img_size (int): Nominal input size forwarded to the patch embeddings
            (divided by 4, 8 and 16 for stages 2-4).
        patch_size (int): Accepted for interface compatibility only; the
            per-stage kernel sizes are fixed to 7, 3, 3, 3 and this value is
            not read.
        in_chans (int): Number of input image channels.
        num_classes (int): Stored on the instance; only used by
            ``reset_classifier``.
        embed_dims (list[int]): Channel width of each of the four stages.
        num_heads (list[int]): Attention heads per stage.
        mlp_ratios (list[float]): MLP expansion ratio per stage.
        qkv_bias (bool): Forwarded to every ``Block``.
        qk_scale (float | None): Forwarded to every ``Block``.
        drop_rate (float): Forwarded to every ``Block`` as ``drop``.
        attn_drop_rate (float): Forwarded to every ``Block`` as ``attn_drop``.
        drop_path_rate (float): Upper end of the linearly increasing
            stochastic-depth schedule spread over all blocks.
        norm_layer: Callable building a norm layer from a channel count.
        depths (list[int]): Number of blocks in each stage.
        sr_ratios (list[int]): Forwarded to the blocks of each stage as
            ``sr_ratio``.
        init_cfg: Forwarded to ``BaseModule``.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        self.depths = depths
        # Kept so reset_classifier() can size the head from the last stage.
        self.embed_dims = embed_dims

        # patch_embed (the body previously referred to an undefined name
        # ``embed_dimss``, which raised NameError on every construction)
        self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans,
                                              embed_dims=embed_dims[0])
        self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0],
                                              embed_dims=embed_dims[1])
        self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1],
                                              embed_dims=embed_dims[2])
        self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2],
                                              embed_dims=embed_dims[3])

        # transformer encoder
        # stochastic depth decay rule: one rate per block, increasing linearly
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        def build_stage(stage, start):
            """Create the block stack of ``stage``; ``start`` indexes into ``dpr``."""
            return nn.ModuleList([
                Block(
                    dim=embed_dims[stage], num_heads=num_heads[stage], mlp_ratio=mlp_ratios[stage],
                    qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate,
                    drop_path=dpr[start + i], norm_layer=norm_layer, sr_ratio=sr_ratios[stage])
                for i in range(depths[stage])])

        # Attribute names and creation order are kept (block1, norm1, block2,
        # ...) so state_dict keys remain compatible with existing checkpoints.
        cur = 0
        self.block1 = build_stage(0, cur)
        self.norm1 = norm_layer(embed_dims[0])

        cur += depths[0]
        self.block2 = build_stage(1, cur)
        self.norm2 = norm_layer(embed_dims[1])

        cur += depths[1]
        self.block3 = build_stage(2, cur)
        self.norm3 = norm_layer(embed_dims[2])

        cur += depths[2]
        self.block4 = build_stage(3, cur)
        self.norm4 = norm_layer(embed_dims[3])

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise ``m`` if it is a Linear, LayerNorm or Conv2d module."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            # Normal init scaled by the per-group fan-out of the kernel.
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def _stages(self):
        """Return ``(patch_embed, blocks, norm)`` triples in stage order."""
        return (
            (self.patch_embed1, self.block1, self.norm1),
            (self.patch_embed2, self.block2, self.norm2),
            (self.patch_embed3, self.block3, self.norm3),
            (self.patch_embed4, self.block4, self.norm4),
        )

    def reset_drop_path(self, drop_path_rate):
        """Re-assign ``drop_prob`` on every block from a new linear schedule.

        NOTE(review): blocks built with a zero drop-path rate hold an
        ``nn.Identity`` as ``drop_path``; setting ``drop_prob`` on it only
        stores an attribute and does not enable stochastic depth.
        """
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
        cur = 0
        for depth, (_, blocks, _) in zip(self.depths, self._stages()):
            for i in range(depth):
                blocks[i].drop_path.drop_prob = dpr[cur + i]
            cur += depth

    def freeze_patch_emb(self):
        """Stop gradient computation for the first patch embedding.

        The previous ``self.patch_embed1.requires_grad = False`` merely set a
        Python attribute on the module and left its parameters trainable;
        ``requires_grad_`` updates the parameters themselves.
        """
        self.patch_embed1.requires_grad_(False)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parameter names that should be excluded from weight decay."""
        return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'}  # has pos_embed may be better

    def get_classifier(self):
        """Return ``self.head``; it only exists after ``reset_classifier``."""
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        """Replace the head with a linear layer on the last stage's channels.

        ``num_classes <= 0`` installs ``nn.Identity``. ``global_pool`` is
        accepted but not used.
        """
        self.num_classes = num_classes
        # The head is sized from the last stage width (a list cannot be passed
        # to nn.Linear, and the attribute was previously never assigned).
        self.head = nn.Linear(self.embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        """Run all four stages and return their outputs as ``(B, C, H, W)`` maps."""
        B = x.shape[0]
        outs = []
        for patch_embed, blocks, norm in self._stages():
            x, H, W = patch_embed(x)
            for blk in blocks:
                x = blk(x, H, W)
            x = norm(x)
            # tokens (B, H*W, C) -> feature map (B, C, H, W) for the next stage
            x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
            outs.append(x)

        return outs

    def forward(self, x):
        """Return the list of per-stage feature maps (no head is applied)."""
        return self.forward_features(x)


class DWConv(nn.Module):
    """3x3 depthwise convolution applied to a token sequence.

    Args:
        dim: Number of channels; also used as ``groups`` so every channel is
            convolved independently.
    """

    def __init__(self, dim=768):
        super().__init__()
        self.dwconv = nn.Conv2d(
            in_channels=dim,
            out_channels=dim,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=dim,
            bias=True)

    def forward(self, x, H, W):
        """Convolve ``x`` of shape ``(B, N, C)`` viewed as a ``(B, C, H, W)`` map.

        The result is flattened back to ``(B, H * W, C)``.
        """
        batch, _, channels = x.shape
        feature_map = x.transpose(1, 2).view(batch, channels, H, W)
        feature_map = self.dwconv(feature_map)
        return feature_map.flatten(2).transpose(1, 2)


@BACKBONES.register_module()
class mit_b0(MixVisionTransformer):
    """``MixVisionTransformer`` preset: widths 32/64/160/256, depths 2/2/2/2.

    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
    """

    def __init__(self, **kwargs):
        super().__init__(
            patch_size=4,
            embed_dims=[32, 64, 160, 256],
            num_heads=[1, 2, 5, 8],
            mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True,
            norm_layer=partial(nn.LayerNorm, eps=1e-6),
            depths=[2, 2, 2, 2],
            sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0,
            drop_path_rate=0.1,
            **kwargs)


@BACKBONES.register_module()
class mit_b1(MixVisionTransformer):
    """``MixVisionTransformer`` preset: widths 64/128/320/512, depths 2/2/2/2.

    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
    """

    def __init__(self, **kwargs):
        super().__init__(
            patch_size=4,
            embed_dims=[64, 128, 320, 512],
            num_heads=[1, 2, 5, 8],
            mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True,
            norm_layer=partial(nn.LayerNorm, eps=1e-6),
            depths=[2, 2, 2, 2],
            sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0,
            drop_path_rate=0.1,
            **kwargs)


@BACKBONES.register_module()
class mit_b2(MixVisionTransformer):
    """``MixVisionTransformer`` preset: widths 64/128/320/512, depths 3/4/6/3.

    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
    """

    def __init__(self, **kwargs):
        super().__init__(
            patch_size=4,
            embed_dims=[64, 128, 320, 512],
            num_heads=[1, 2, 5, 8],
            mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True,
            norm_layer=partial(nn.LayerNorm, eps=1e-6),
            depths=[3, 4, 6, 3],
            sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0,
            drop_path_rate=0.1,
            **kwargs)


@BACKBONES.register_module()
class mit_b3(MixVisionTransformer):
    """``MixVisionTransformer`` preset: widths 64/128/320/512, depths 3/4/18/3.

    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
    """

    def __init__(self, **kwargs):
        super().__init__(
            patch_size=4,
            embed_dims=[64, 128, 320, 512],
            num_heads=[1, 2, 5, 8],
            mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True,
            norm_layer=partial(nn.LayerNorm, eps=1e-6),
            depths=[3, 4, 18, 3],
            sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0,
            drop_path_rate=0.1,
            **kwargs)


@BACKBONES.register_module()
class mit_b4(MixVisionTransformer):
    """``MixVisionTransformer`` preset: widths 64/128/320/512, depths 3/8/27/3.

    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
    """

    def __init__(self, **kwargs):
        super().__init__(
            patch_size=4,
            embed_dims=[64, 128, 320, 512],
            num_heads=[1, 2, 5, 8],
            mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True,
            norm_layer=partial(nn.LayerNorm, eps=1e-6),
            depths=[3, 8, 27, 3],
            sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0,
            drop_path_rate=0.1,
            **kwargs)


@BACKBONES.register_module()
class mit_b5(MixVisionTransformer):
    """``MixVisionTransformer`` preset: widths 64/128/320/512, depths 3/6/40/3.

    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
    """

    def __init__(self, **kwargs):
        super().__init__(
            patch_size=4,
            embed_dims=[64, 128, 320, 512],
            num_heads=[1, 2, 5, 8],
            mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True,
            norm_layer=partial(nn.LayerNorm, eps=1e-6),
            depths=[3, 6, 40, 3],
            sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0,
            drop_path_rate=0.1,
            **kwargs)


@BACKBONES.register_module()
class ResNetV1c(ResNet):
    r"""ResNet preset that fixes ``deep_stem=True`` and ``avg_down=False``.

    The "V1c"/"V1d" names come from `Bag of Tricks
    <https://arxiv.org/pdf/1812.01187.pdf>`_.

    Only the stem option is switched on here; the average-pool downsampling
    associated with the V1d variant is explicitly disabled through
    ``avg_down=False``. All other keyword arguments are forwarded to
    ``ResNet`` unchanged.
    NOTE(review): the exact effect of ``deep_stem`` is defined by the
    ``ResNet`` base class, which is not visible here - presumably it replaces
    the 7x7 stem conv with three 3x3 convs; confirm against the base class.
    """

    def __init__(self, **kwargs):
        super().__init__(deep_stem=True, avg_down=False, **kwargs)

================================================
FILE: swin/swin_checkpoint.py
================================================
# Copyright (c) Open-MMLab. All rights reserved.
import io
import os
import os.path as osp
import pkgutil
import time
import warnings
from collections import OrderedDict
from importlib import import_module
from tempfile import TemporaryDirectory

import mmcv
import torch
import torchvision
from mmcv.fileio import FileClient
from mmcv.fileio import load as load_file
from mmcv.parallel import is_module_wrapper
from mmcv.runner import get_dist_info
from mmcv.utils import mkdir_or_exist
from torch.nn import functional as F
from torch.optim import Optimizer
from torch.utils import model_zoo

# Name of the environment variable that, when set, gives the MMCV home
# directory directly (read by _get_mmcv_home).
ENV_MMCV_HOME = 'MMCV_HOME'
# Name of the environment variable holding the cache root; '<root>/mmcv' is
# used as the MMCV home when MMCV_HOME is not set.
ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
# Cache root used when XDG_CACHE_HOME is not set; '~' is expanded by
# _get_mmcv_home via os.path.expanduser.
DEFAULT_CACHE_DIR = '~/.cache'


def _get_mmcv_home():
    """Resolve the MMCV home directory, create it if needed and return it.

    ``$MMCV_HOME`` wins when set; otherwise the path is ``<cache root>/mmcv``
    where the cache root is ``$XDG_CACHE_HOME`` or ``~/.cache``. A leading
    ``~`` is expanded.
    """
    cache_root = os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR)
    fallback_home = os.path.join(cache_root, 'mmcv')
    home = os.path.expanduser(os.getenv(ENV_MMCV_HOME, fallback_home))

    mkdir_or_exist(home)
    return home


def load_state_dict(module, state_dict, strict=False, logger=None):
    """Load ``state_dict`` into ``module`` and report any mismatch.

    Adapted from :meth:`torch.nn.Module.load_state_dict`. ``strict`` defaults
    to ``False`` and mismatch messages are emitted even in non-strict mode.
    Reporting (and raising in strict mode) only happens on rank 0.

    Args:
        module (Module): Module that receives the state_dict.
        state_dict (OrderedDict): Weights.
        strict (bool): If ``True``, a mismatch raises ``RuntimeError`` instead
            of being logged. Default: ``False``.
        logger (:obj:`logging.Logger`, optional): Receives the mismatch
            message as a warning; ``print`` is used when it is ``None``.

    Raises:
        RuntimeError: On rank 0 when ``strict`` is true and there is any
            mismatch message.
    """
    all_missing_keys = []
    unexpected_keys = []
    err_msg = []

    # Work on a shallow copy but carry the version metadata across.
    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    # use _load_from_state_dict to enable checkpoint version control
    def load(target, prefix=''):
        # unwrap parallel wrappers at every level, e.g.
        # nn.Module(nn.Module(DDP))
        if is_module_wrapper(target):
            target = target.module
        if metadata is None:
            local_metadata = {}
        else:
            local_metadata = metadata.get(prefix[:-1], {})
        target._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     all_missing_keys, unexpected_keys,
                                     err_msg)
        for child_name, child in target._modules.items():
            if child is None:
                continue
            load(child, prefix + child_name + '.')

    load(module)
    load = None  # break load->load reference cycle

    # ignore "num_batches_tracked" of BN layers
    missing_keys = [
        name for name in all_missing_keys if 'num_batches_tracked' not in name
    ]

    if unexpected_keys:
        joined = ", ".join(unexpected_keys)
        err_msg.append(f'unexpected key in source state_dict: {joined}\n')
    if missing_keys:
        joined = ", ".join(missing_keys)
        err_msg.append(f'missing keys in source state_dict: {joined}\n')

    rank, _ = get_dist_info()
    if not err_msg or rank != 0:
        return

    err_msg.insert(
        0, 'The model and loaded state dict do not match exactly\n')
    report = '\n'.join(err_msg)
    if strict:
        raise RuntimeError(report)
    if logger is not None:
        logger.warning(report)
    else:
        print(report)


def load_url_dist(url, model_dir=None):
    """Load a checkpoint from ``url``, letting local rank 0 go first.

    Local rank 0 calls ``model_zoo.load_url`` before the barrier; in a
    distributed run the remaining ranks call it after the barrier
    (presumably hitting the cache populated by rank 0).

    NOTE(review): when ``LOCAL_RANK`` is non-zero but ``world_size`` is 1,
    no branch assigns ``checkpoint`` and the ``return`` raises
    ``UnboundLocalError``.
    """
    global_rank, world_size = get_dist_info()
    local_rank = int(os.environ.get('LOCAL_RANK', global_rank))

    def _fetch():
        return model_zoo.load_url(url, model_dir=model_dir)

    if local_rank == 0:
        checkpoint = _fetch()
    if world_size > 1:
        torch.distributed.barrier()
        if local_rank > 0:
            checkpoint = _fetch()
    return checkpoint


def load_pavimodel_dist(model_path, map_location=None):
    """Load a checkpoint from pavi modelcloud, letting local rank 0 go first.

    Each participating rank downloads the model into a temporary directory
    and reads it with ``torch.load``; ranks other than local rank 0 do so
    only after the distributed barrier.

    Raises:
        ImportError: If the ``pavi`` package cannot be imported.

    NOTE(review): when ``LOCAL_RANK`` is non-zero but ``world_size`` is 1,
    ``checkpoint`` is never assigned and the ``return`` raises
    ``UnboundLocalError``.
    """
    try:
        from pavi import modelcloud
    except ImportError:
        raise ImportError(
            'Please install pavi to load checkpoint from modelcloud.')

    def _download_and_load():
        # The temporary directory is removed once the checkpoint is in memory.
        remote = modelcloud.get(model_path)
        with TemporaryDirectory() as tmp_dir:
            local_file = osp.join(tmp_dir, remote.name)
            remote.download(local_file)
            return torch.load(local_file, map_location=map_location)

    global_rank, world_size = get_dist_info()
    local_rank = int(os.environ.get('LOCAL_RANK', global_rank))
    if local_rank == 0:
        checkpoint = _download_and_load()
    if world_size > 1:
        torch.distributed.barrier()
        if local_rank > 0:
            checkpoint = _download_and_load()
    return checkpoint


def load_fileclient_dist(filename, backend, map_location):
    """Load a checkpoint through ``FileClient``, letting local rank 0 go first.

    Only the ``'ceph'`` backend is accepted. The file content is fetched
    into memory and deserialised with ``torch.load``; ranks other than local
    rank 0 do so after the distributed barrier.

    Raises:
        ValueError: If ``backend`` is not an allowed backend.

    NOTE(review): when ``LOCAL_RANK`` is non-zero but ``world_size`` is 1,
    ``checkpoint`` is never assigned and the ``return`` raises
    ``UnboundLocalError``.
    """
    global_rank, world_size = get_dist_info()
    local_rank = int(os.environ.get('LOCAL_RANK', global_rank))
    allowed_backends = ['ceph']
    if backend not in allowed_backends:
        raise ValueError(f'Load from Backend {backend} is not supported.')

    def _read():
        client = FileClient(backend=backend)
        payload = io.BytesIO(client.get(filename))
        return torch.load(payload, map_location=map_location)

    if local_rank == 0:
        checkpoint = _read()
    if world_size > 1:
        torch.distributed.barrier()
        if local_rank > 0:
            checkpoint = _read()
    return checkpoint


def get_torchvision_models():
    """Collect the ``model_urls`` dicts of all ``torchvision.models`` modules.

    Sub-packages are skipped; modules without a ``model_urls`` attribute
    contribute nothing.

    Returns:
        dict: Union of every ``model_urls`` mapping that was found.
    """
    collected = {}
    for module_info in pkgutil.walk_packages(torchvision.models.__path__):
        if module_info.ispkg:
            continue
        zoo_module = import_module(f'torchvision.models.{module_info.name}')
        if hasattr(zoo_module, 'model_urls'):
            collected.update(zoo_module.model_urls)
    return collected


def get_external_models():
    """Return the open-mmlab model-name -> URL/path mapping.

    The mapping shipped inside the mmcv package is loaded first; entries from
    ``<mmcv home>/open_mmlab.json`` override it when that file exists.
    """
    mmcv_home = _get_mmcv_home()
    packaged_json = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json')
    urls = load_file(packaged_json)
    assert isinstance(urls, dict)

    user_json = osp.join(mmcv_home, 'open_mmlab.json')
    if not osp.exists(user_json):
        return urls

    overrides = load_file(user_json)
    assert isinstance(overrides, dict)
    urls.update(overrides)
    return urls


def get_mmcls_models():
    """Load and return ``model_zoo/mmcls.json`` from the mmcv package directory."""
    json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json')
    return load_file(json_path)


def get_deprecated_model_names():
    """Load ``model_zoo/deprecated.json`` from the mmcv package directory.

    Returns:
        dict: The parsed content; ``_load_checkpoint`` uses it to map a
        deprecated model name to its replacement.
    """
    json_path = osp.join(mmcv.__path__[0], 'model_zoo/deprecated.json')
    deprecated = load_file(json_path)
    assert isinstance(deprecated, dict)
    return deprecated


def _process_mmcls_checkpoint(checkpoint):
    state_dict = checkpoint['state_dict']
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith('backbone.'):
            new_state_dict[k[9:]] = v
    new_checkpoint = dict(state_dict=new_state_dict)

    return new_checkpoint


def _load_checkpoint(filename, map_location=None):
    """Load checkpoint from somewhere (modelzoo, file, url).

    Args:
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str | None): Same as :func:`torch.load`. Default: None.
    Returns:
        dict | OrderedDict: The loaded checkpoint. It can be either an
            OrderedDict storing model weights or a dict containing other
            information, which depends on the checkpoint.
    """
    if filename.startswith('modelzoo://'):
        warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
                      'use "torchvision://" instead')
        model_urls = get_torchvision_models()
        model_name = filename[11:]
        checkpoint = load_url_dist(model_urls[model_name])
    elif filename.startswith('torchvision://'):
        model_urls = get_torchvision_models()
        model_name = filename[14:]
        checkpoint = load_url_dist(model_urls[model_name])
    elif filename.startswith('open-mmlab://'):
        model_urls = get_external_models()
        model_name = filename[13:]
        deprecated_urls = get_deprecated_model_names()
        if model_name in deprecated_urls:
            warnings.warn(f'open-mmlab://{model_name} is deprecated in favor '
                          f'of open-mmlab://{deprecated_urls[model_name]}')
            model_name = deprecated_urls[model_name]
        model_url = model_urls[model_name]
        # check if is url
        if model_url.startswith(('http://', 'https://')):
            checkpoint = load_url_dist(model_url)
        else:
            filename = osp.join(_get_mmcv_home(), model_url)
            if not osp.isfile(filename):
                raise IOError(f'{filename} is not a checkpoint file')
            checkpoint = torch.load(filename, map_location=map_location)
    elif filename.startswith('mmcls://'):
        model_urls = get_mmcls_models()
        model_name = filename[8:]
        checkpoint = load_url_dist(model_urls[model_name])
        checkpoint = _process_mmcls_checkpoint(checkpoint)
    elif filename.startswith(('http://', 'https://')):
        checkpoint = load_url_dist(filename)
    elif filename.startswith('pavi://'):
        model_path = filename[7:]
        checkpoint = load_pavimodel_dist(model_path, map_location=map_location)
    elif filename.startswith('s3://'):
        checkpoint = load_fileclient_dist(
            filename, backend='ceph', map_location=map_location)
    else:
        if not osp.isfile(filename):
            raise IOError(f'{filename} is not a checkpoint file')
        checkpoint = torch.load(filename, map_location=map_location)
    return checkpoint


def load_checkpoint(model,
                    filename,
                    map_location='cpu',
                    strict=False,
                    logger=None):
    """Load checkpoint from a file or URI.

    Besides loading, two adaptations are applied to the weights:

    * ``absolute_pos_embed`` stored as ``(N, L, C)`` is reshaped to the
      model's ``(N, C, H, W)`` layout when the sizes agree.
    * every ``relative_position_bias_table`` whose length differs from the
      model's is resized with bicubic interpolation (the table is treated as
      a square grid per head).

    Entries that cannot be adapted are left untouched and a warning is
    emitted.

    Args:
        model (Module): Module to load checkpoint.
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str): Same as :func:`torch.load`.
        strict (bool): Whether to allow different params for the model and
            checkpoint.
        logger (:mod:`logging.Logger` or None): The logger for error message.
            When ``None``, messages are printed.
    Returns:
        dict or OrderedDict: The loaded checkpoint.
    Raises:
        RuntimeError: If the loaded object is not a dict.
    """

    def _warn(message):
        # ``logger`` defaults to None; calling ``logger.warning`` directly
        # used to raise AttributeError. Fall back to print, which is what
        # load_state_dict does when no logger is given.
        if logger is not None:
            logger.warning(message)
        else:
            print(message)

    checkpoint = _load_checkpoint(filename, map_location)
    # OrderedDict is a subclass of dict
    if not isinstance(checkpoint, dict):
        raise RuntimeError(
            f'No state_dict found in checkpoint file {filename}')
    # get state_dict from checkpoint
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    elif 'model' in checkpoint:
        state_dict = checkpoint['model']
    else:
        state_dict = checkpoint
    # strip prefix of state_dict; the emptiness guard avoids an IndexError on
    # a checkpoint without any weights
    if state_dict and next(iter(state_dict)).startswith('module.'):
        state_dict = {k[7:]: v for k, v in state_dict.items()}

    # reshape absolute position embedding
    if state_dict.get('absolute_pos_embed') is not None:
        absolute_pos_embed = state_dict['absolute_pos_embed']
        N1, L, C1 = absolute_pos_embed.size()
        N2, C2, H, W = model.absolute_pos_embed.size()
        if N1 != N2 or C1 != C2 or L != H * W:
            _warn('Error in loading absolute_pos_embed, pass')
        else:
            state_dict['absolute_pos_embed'] = absolute_pos_embed.view(
                N2, H, W, C2).permute(0, 3, 1, 2)

    # interpolate position bias table if needed
    relative_position_bias_table_keys = [
        k for k in state_dict if 'relative_position_bias_table' in k
    ]
    # Build the model's state dict once instead of once per table.
    model_state = (
        model.state_dict() if relative_position_bias_table_keys else {})
    for table_key in relative_position_bias_table_keys:
        table_pretrained = state_dict[table_key]
        table_current = model_state[table_key]
        L1, nH1 = table_pretrained.size()
        L2, nH2 = table_current.size()
        if nH1 != nH2:
            _warn(f'Error in loading {table_key}, pass')
            continue
        if L1 == L2:
            continue
        # (L, nH) -> (1, nH, S, S) so the table can be resized like an image
        S1 = int(L1**0.5)
        S2 = int(L2**0.5)
        table_pretrained_resized = F.interpolate(
            table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
            size=(S2, S2),
            mode='bicubic')
        state_dict[table_key] = table_pretrained_resized.view(
            nH2, L2).permute(1, 0)

    # load state_dict
    load_state_dict(model, state_dict, strict, logger)
    return checkpoint


def weights_to_cpu(state_dict):
    """Copy a model state_dict to cpu.

    Args:
        state_dict (OrderedDict): Model weights, possibly on GPU.
    Returns:
        OrderedDict: A new mapping with the same keys, in the same order,
        whose values are the result of calling ``.cpu()`` on each weight.
    """
    return OrderedDict(
        (name, tensor.cpu()) for name, tensor in state_dict.items())


def _save_to_state_dict(module, destination, prefix, keep_vars):
    """Saves module state to `destination` dictionary.

    This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.
    Args:
        module (nn.Module): The module to generate state_dict.
        destination (dict): A dict where state will be stored.
        prefix (str): The prefix for parameters and buffers used in this
            module.
    """
    for name, param in module._parameters.items():
        if param is not None:
            destination[prefix + name] = param if keep_vars else param.detach()
    for name, buf in module._buffers.items():
        # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
        if buf is not None:
            destination[prefix + name] = buf if keep_vars else buf.detach()


def get_state_dict(module, destination=None, prefix='', keep_vars=False):
    """Returns a dictionary containing a whole state of the module.

    Parameters and buffers are included, keyed by their (prefixed) names.
    This method is modified from :meth:`torch.nn.Module.state_dict` to
    recursively check parallel module in case that the model has a complicated
    structure, e.g., nn.Module(nn.Module(DDP)).

    Args:
        module (nn.Module): The module to generate state_dict.
        destination (OrderedDict): Returned dict for the state of the
            module. A new one (with a ``_metadata`` attribute) is created
            when ``None``.
        prefix (str): Prefix of the key.
        keep_vars (bool): Whether to keep the variable property of the
            parameters. Default: False.
    Returns:
        dict: A dictionary containing a whole state of the module.
    """
    # unwrap parallel wrappers at every recursion level
    if is_module_wrapper(module):
        module = module.module

    # below is the same as torch.nn.Module.state_dict()
    if destination is None:
        destination = OrderedDict()
        destination._metadata = OrderedDict()

    local_metadata = dict(version=module._version)
    destination._metadata[prefix[:-1]] = local_metadata

    _save_to_state_dict(module, destination, prefix, keep_vars)
    for child_name, child in module._modules.items():
        if child is None:
            continue
        get_state_dict(
            child, destination, prefix + child_name + '.', keep_vars=keep_vars)

    # a hook may return a replacement for the destination dict
    for hook in module._state_dict_hooks.values():
        replacement = hook(module, destination, prefix, local_metadata)
        if replacement is not None:
            destination = replacement
    return destination


def save_checkpoint(model, filename, optimizer=None, meta=None):
    """Save checkpoint to file.

    The checkpoint has the fields ``meta`` and ``state_dict`` plus
    ``optimizer`` when an optimizer (or a dict of optimizers) is given.
    ``meta`` is updated in place with the mmcv version, the current time and,
    when the model defines them, its ``CLASSES``. A ``pavi://`` filename is
    uploaded to pavi modelcloud; any other filename is written locally.

    Args:
        model (Module): Module whose params are to be saved.
        filename (str): Checkpoint filename.
        optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
        meta (dict, optional): Metadata to be saved in checkpoint.

    Raises:
        TypeError: If ``meta`` is neither ``None`` nor a dict.
        ImportError: If ``filename`` starts with ``pavi://`` and ``pavi``
            cannot be imported.
    """
    if meta is not None and not isinstance(meta, dict):
        raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
    if meta is None:
        meta = {}
    meta.update(mmcv_version=mmcv.__version__, time=time.asctime())

    if is_module_wrapper(model):
        model = model.module

    if hasattr(model, 'CLASSES') and model.CLASSES is not None:
        # save class name to the meta
        meta.update(CLASSES=model.CLASSES)

    checkpoint = dict(
        meta=meta,
        state_dict=weights_to_cpu(get_state_dict(model)))

    # save optimizer state dict in the checkpoint
    if isinstance(optimizer, Optimizer):
        checkpoint['optimizer'] = optimizer.state_dict()
    elif isinstance(optimizer, dict):
        checkpoint['optimizer'] = {
            name: optim.state_dict() for name, optim in optimizer.items()
        }

    if not filename.startswith('pavi://'):
        mmcv.mkdir_or_exist(osp.dirname(filename))
        # immediately flush buffer
        with open(filename, 'wb') as f:
            torch.save(checkpoint, f)
            f.flush()
        return

    try:
        from pavi import modelcloud
        from pavi.exception import NodeNotFoundError
    except ImportError:
        raise ImportError(
            'Please install pavi to load checkpoint from modelcloud.')
    model_dir, model_name = osp.split(filename[len('pavi://'):])
    root = modelcloud.Folder()
    try:
        remote_model = modelcloud.get(model_dir)
    except NodeNotFoundError:
        remote_model = root.create_training_model(model_dir)
    # serialise into a temporary file, then upload it
    with TemporaryDirectory() as tmp_dir:
        checkpoint_file = osp.join(tmp_dir, model_name)
        with open(checkpoint_file, 'wb') as f:
            torch.save(checkpoint, f)
            f.flush()
        remote_model.create_file(checkpoint_file, name=model_name)

================================================
FILE: swin/swin_transformer.py
================================================
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu, Yutong Lin, Yixuan Wei
# --------------------------------------------------------

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from mmdet.models.builder import BACKBONES
from mmdet.utils import get_root_logger
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

from .swin_checkpoint import load_checkpoint


class Mlp(nn.Module):
    """Two-layer perceptron: ``fc1 -> act -> drop -> fc2 -> drop``.

    Args:
        in_features: Input width.
        hidden_features: Width after ``fc1``; falls back to ``in_features``
            when falsy.
        out_features: Output width; falls back to ``in_features`` when falsy.
        act_layer: Activation class, instantiated without arguments.
        drop: Dropout probability of the single dropout module that is
            applied after the activation and again after ``fc2``.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the perceptron to the last dimension of ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))


def window_partition(x, window_size):
    """Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    batch, height, width, channels = x.shape
    rows = height // window_size
    cols = width // window_size
    # Expose the window grid as separate axes, then bring the two grid axes
    # next to each other so every window is contiguous.
    grid = x.view(batch, rows, window_size, cols, window_size, channels)
    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return grid.view(-1, window_size, window_size, channels)


def window_reverse(windows, window_size, H, W):
    """Stitch windows back into a feature map (inverse of window_partition).

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    windows_per_image = H * W / window_size / window_size
    batch = int(windows.shape[0] / windows_per_image)
    rows = H // window_size
    cols = W // window_size
    grid = windows.view(batch, rows, cols, window_size, window_size, -1)
    # Interleave grid and in-window axes back into (row, y, col, x) order.
    merged = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return merged.view(batch, H, W, -1)


class WindowAttention(nn.Module):
    """Window based multi-head self attention (W-MSA) module with relative
    position bias.

    The same module serves shifted and non-shifted windows; the only
    difference is whether a mask is passed to ``forward``.
    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self,
                 dim,
                 window_size,
                 num_heads,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        per_head_dim = dim // num_heads
        self.scale = qk_scale or per_head_dim**-0.5

        win_h = self.window_size[0]
        win_w = self.window_size[1]

        # One learnable bias per head for every possible (dy, dx) offset
        # between two tokens of a window: (2*Wh-1) * (2*Ww-1) rows.
        num_offsets = (2 * win_h - 1) * (2 * win_w - 1)
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(num_offsets, num_heads))

        # Build the lookup index mapping each token pair to a table row.
        grid = torch.stack(
            torch.meshgrid([torch.arange(win_h),
                            torch.arange(win_w)]))  # 2, Wh, Ww
        flat = torch.flatten(grid, 1)  # 2, Wh*Ww
        offsets = flat[:, :, None] - flat[:, None, :]  # 2, Wh*Ww, Wh*Ww
        offsets = offsets.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        # Shift both offset components to be non-negative, then fold the
        # (row, col) pair into one flat index.
        offsets[:, :, 0] += win_h - 1
        offsets[:, :, 1] += win_w - 1
        offsets[:, :, 0] *= 2 * win_w - 1
        self.register_buffer('relative_position_index',
                             offsets.sum(-1))  # Wh*Ww, Wh*Ww

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """Forward function.

        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        total_windows, tokens, channels = x.shape
        heads = self.num_heads

        qkv = self.qkv(x).reshape(total_windows, tokens, 3, heads,
                                  channels // heads)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        # Index instead of unpacking: torchscript cannot use a tensor as a
        # tuple.
        query, key, value = qkv[0], qkv[1], qkv[2]

        scores = (query * self.scale) @ key.transpose(-2, -1)

        window_area = self.window_size[0] * self.window_size[1]
        bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)]
        bias = bias.view(window_area, window_area, -1)  # Wh*Ww,Wh*Ww,nH
        bias = bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        scores = scores + bias.unsqueeze(0)

        if mask is not None:
            # The mask is defined per window position, so temporarily split
            # the leading axis into (batch, window) to broadcast it.
            num_windows = mask.shape[0]
            scores = scores.view(total_windows // num_windows, num_windows,
                                 heads, tokens, tokens)
            scores = scores + mask.unsqueeze(1).unsqueeze(0)
            scores = scores.view(-1, heads, tokens, tokens)

        weights = self.attn_drop(self.softmax(scores))

        out = (weights @ value).transpose(1, 2).reshape(
            total_windows, tokens, channels)
        return self.proj_drop(self.proj(out))


class SwinTransformerBlock(nn.Module):
    """Swin Transformer Block.

    Runs (optionally shifted) window attention followed by an MLP, each
    wrapped in a residual connection with stochastic depth. The spatial size
    of the token sequence is not passed to ``forward``; the caller must set
    ``self.H`` and ``self.W`` beforehand.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=7,
                 shift_size=0,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size'

        # Submodules are created in a fixed order (norm1, attn, drop_path,
        # norm2, mlp) so parameter ordering and seeded init are stable.
        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop)

        # Spatial size of the incoming tokens; filled in by the caller
        # before every forward pass.
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature, read from
                ``self.H`` / ``self.W``.
            mask_matrix: Attention mask for cyclic shift.
        """
        batch, seq_len, channels = x.shape
        height, width = self.H, self.W
        assert seq_len == height * width, 'input feature has wrong size'

        residual = x
        feat = self.norm1(x).view(batch, height, width, channels)

        # Pad right/bottom so both spatial dims are multiples of the window.
        win = self.window_size
        pad_right = (win - width % win) % win
        pad_bottom = (win - height % win) % win
        feat = F.pad(feat, (0, 0, 0, pad_right, 0, pad_bottom))
        padded_h, padded_w = feat.shape[1], feat.shape[2]

        # Cyclic shift; the mask is only meaningful for shifted windows.
        use_shift = self.shift_size > 0
        if use_shift:
            feat = torch.roll(
                feat, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        attn_mask = mask_matrix if use_shift else None

        # Partition -> attend -> merge.
        windows = window_partition(feat, win)  # nW*B, win, win, C
        windows = windows.view(-1, win * win, channels)  # nW*B, win*win, C
        windows = self.attn(windows, mask=attn_mask)
        windows = windows.view(-1, win, win, channels)
        feat = window_reverse(windows, win, padded_h, padded_w)  # B H' W' C

        # Undo the cyclic shift.
        if use_shift:
            feat = torch.roll(
                feat, shifts=(self.shift_size, self.shift_size), dims=(1, 2))

        # Drop the padding again.
        if pad_right > 0 or pad_bottom > 0:
            feat = feat[:, :height, :width, :].contiguous()

        feat = feat.view(batch, height * width, channels)

        # Residual around attention, then residual around the FFN.
        x = residual + self.drop_path(feat)
        return x + self.drop_path(self.mlp(self.norm2(x)))


class PatchMerging(nn.Module):
    """ Patch Merging Layer

    Concatenates every 2x2 neighbourhood of tokens along the channel axis
    (C -> 4*C), normalises, and linearly projects to 2*C.
    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        merged_dim = 4 * dim
        self.reduction = nn.Linear(merged_dim, 2 * dim, bias=False)
        self.norm = norm_layer(merged_dim)

    def forward(self, x, H, W):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """
        batch, seq_len, channels = x.shape
        assert seq_len == H * W, 'input feature has wrong size'

        grid = x.view(batch, H, W, channels)

        # Odd sizes get one extra zero row/column so 2x2 groups line up.
        if H % 2 == 1 or W % 2 == 1:
            grid = F.pad(grid, (0, 0, 0, W % 2, 0, H % 2))

        # Order: (row 0, col 0), (row 1, col 0), (row 0, col 1),
        # (row 1, col 1); each slice is B H/2 W/2 C.
        quadrants = [
            grid[:, row::2, col::2, :] for col in (0, 1) for row in (0, 1)
        ]
        merged = torch.cat(quadrants, -1)  # B H/2 W/2 4*C
        merged = merged.view(batch, -1, 4 * channels)  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))


class BasicLayer(nn.Module):
    """A basic Swin Transformer layer for one stage.

    Holds ``depth`` Swin blocks that alternate between regular and shifted
    window attention, plus an optional downsampling module applied after
    them.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic
            depth rate. A sequence supplies one rate per block and is indexed
            by block position. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        with_cp (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 with_cp=False):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.with_cp = with_cp

        # Accept tuples as well as lists for per-block rates. Previously a
        # tuple was forwarded whole to every block and failed there on the
        # ``drop_path > 0.`` comparison.
        if isinstance(drop_path, (list, tuple)):
            block_drop_paths = drop_path
        else:
            block_drop_paths = [drop_path] * depth

        # build blocks: even positions use plain windows, odd positions use
        # windows shifted by half the window size.
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=block_drop_paths[i],
                norm_layer=norm_layer) for i in range(depth)
        ])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            tuple: ``(x, H, W, x_down, Wh, Ww)``: the stage output with its
            resolution, followed by the downsampled output with its
            resolution. Without a downsample module the second triple
            repeats the first.
        """

        # calculate attention mask for SW-MSA on the padded resolution
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
        # Three bands per axis: everything before the last window, the
        # unshifted part of the last window, and the final shift_size rows
        # or columns. Each of the 3x3 regions gets its own label.
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size,
                          -self.shift_size), slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size,
                          -self.shift_size), slice(-self.shift_size, None))
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(
            img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1,
                                         self.window_size * self.window_size)
        # Token pairs with different region labels get a large negative
        # bias so softmax suppresses them; same-region pairs get 0.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0,
                                          float(-100.0)).masked_fill(
                                              attn_mask == 0, float(0.0))
        attn_mask = attn_mask.to(dtype=x.dtype)
        for blk in self.blocks:
            # Blocks read the resolution from attributes, not arguments.
            blk.H, blk.W = H, W
            if self.with_cp:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            # Resolution halves, rounding up.
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding

    Projects non-overlapping patches with a strided convolution and
    optionally normalises the resulting tokens.
    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dims (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self,
                 patch_size=4,
                 in_chans=3,
                 embed_dims=96,
                 norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)

        self.in_chans = in_chans
        self.embed_dims = embed_dims

        self.proj = nn.Conv2d(
            in_chans,
            embed_dims,
            kernel_size=self.patch_size,
            stride=self.patch_size)
        self.norm = norm_layer(embed_dims) if norm_layer is not None else None

    def forward(self, x):
        """Forward function."""
        _, _, H, W = x.size()
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]

        # Pad right, then bottom, up to the next multiple of the patch size.
        leftover_w = W % patch_w
        if leftover_w != 0:
            x = F.pad(x, (0, patch_w - leftover_w))
        leftover_h = H % patch_h
        if leftover_h != 0:
            x = F.pad(x, (0, 0, 0, patch_h - leftover_h))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is None:
            return x

        # The norm works on the channel axis, so flatten to (B, Wh*Ww, C),
        # normalise, and restore the image layout.
        out_h, out_w = x.size(2), x.size(3)
        tokens = self.norm(x.flatten(2).transpose(1, 2))
        return tokens.transpose(1, 2).view(-1, self.embed_dims, out_h, out_w)


# @BACKBONES_Seg.register_module()
@BACKBONES.register_module()
class SwinTransformerDIY(nn.Module):
    """ Swin Transformer backbone.
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030
    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dims (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        use_abs_pos_embed (bool): If True, add absolute position embedding to
            the patch embedding (stored as ``self.ape``). Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        with_cp (bool): Whether to use checkpointing to save memory. Default: False.
        output_img (bool): If True, the unmodified input tensor is inserted
            as the first element of the returned tuple. Default: False.
        pretrained (str | None): Checkpoint path used by ``init_weights``
            when it is called without an explicit path. Default: None.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dims=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 use_abs_pos_embed=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 with_cp=False,
                 output_img=False,
                 pretrained=None):
        super().__init__()
        self.output_img = output_img

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dims = embed_dims
        self.ape = use_abs_pos_embed
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.pretrained = pretrained

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dims=embed_dims,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding, sized for the pretraining resolution
        # and interpolated to the actual feature size in forward()
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [
                pretrain_img_size[0] // patch_size[0],
                pretrain_img_size[1] // patch_size[1]
            ]

            self.absolute_pos_embed = nn.Parameter(
                torch.zeros(1, embed_dims, patches_resolution[0],
                            patches_resolution[1]))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth: rates grow linearly from 0 to drop_path_rate
        # across all blocks of all stages
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]  # stochastic depth decay rule

        # build layers; channel width doubles at every stage and every stage
        # except the last ends with patch merging
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dims * 2**i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging if
                (i_layer < self.num_layers - 1) else None,
                with_cp=with_cp)
            self.layers.append(layer)

        num_features = [int(embed_dims * 2**i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output, registered as ``norm{i}``
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        """Put the frozen parts in eval mode and stop their gradients.

        ``frozen_stages >= 0`` freezes the patch embedding, ``>= 1``
        additionally freezes the absolute position embedding (if used), and
        ``>= 2`` freezes ``pos_drop`` and the first ``frozen_stages - 1``
        layers.
        """
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights. When
                None, ``self.pretrained`` is used if it was set.
                Defaults to None.

        Raises:
            TypeError: If the resolved ``pretrained`` is neither a str nor
                None.
        """
        if pretrained is None and self.pretrained is not None:
            pretrained = self.pretrained

        def _init_weights(m):
            if isinstance(m, nn.Linear):
                trunc_normal_(m.weight, std=.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)

        if isinstance(pretrained, str):
            # Default-initialise first; the checkpoint is loaded with
            # strict=False.
            self.apply(_init_weights)
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            self.apply(_init_weights)
        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x):
        """Forward function.

        Returns:
            tuple[Tensor]: One (B, C_i, H_i, W_i) feature map per index in
            ``out_indices``, preceded by the raw input when ``output_img``
            is set.
        """
        x_idty = x
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(
                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed).flatten(2).transpose(1,
                                                              2)  # B Wh*Ww C
        else:
            x = x.flatten(2).transpose(1, 2)
        x = self.pos_drop(x)

        outs = []
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out/H/W: this stage's output; x/Wh/Ww: input to the next.
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)

                out = x_out.view(-1, H, W,
                                 self.num_features[i]).permute(0, 3, 1,
                                                               2).contiguous()
                outs.append(out)

        if self.output_img:
            outs.insert(0, x_idty)
        return tuple(outs)

    def train(self, mode=True):
        """Convert the model into training mode while keeping frozen layers
        frozen.

        Returns:
            SwinTransformerDIY: ``self``, matching ``nn.Module.train`` so
            that calls can be chained.
        """
        super().train(mode)
        # super().train() flips every submodule, so re-apply the freeze.
        self._freeze_stages()
        return self


================================================
FILE: swin/swin_transformer_rfp.py
================================================
import warnings
from collections import OrderedDict
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from mmcv.cnn import build_norm_layer, constant_init, trunc_normal_init, build_conv_layer
from mmcv.cnn.bricks.transformer import FFN, build_dropout
from mmcv.runner import BaseModule, ModuleList, _load_checkpoint
from mmcv.utils import to_2tuple

from mmdet.utils import get_root_logger
from mmdet.models.builder import BACKBONES
from .ckpt_convert import swin_converter
from .transformer import PatchEmbed, PatchMerging


class WindowMSA(BaseModule):
    """Window based multi-head self-attention (W-MSA) module with relative
    position bias.
    Args:
        embed_dims (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (tuple[int]): The height and width of the window.
        qkv_bias (bool, optional):  If True, add a learnable bias to q, k, v.
            Default: True.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        attn_drop_rate (float, optional): Dropout ratio of attention weight.
            Default: 0.0
        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.
        init_cfg (dict | None, optional): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 window_size,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop_rate=0.,
                 proj_drop_rate=0.,
                 init_cfg=None):
        super().__init__()
        self.embed_dims = embed_dims
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_embed_dims = embed_dims // num_heads
        self.scale = qk_scale or head_embed_dims ** -0.5
        self.init_cfg = init_cfg

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
                        num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # About 2x faster than original impl
        Wh, Ww = self.window_size
        rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww)
        rel_position_index = rel_index_coords + rel_index_coords.T
        rel_position_index = rel_position_index.flip(1).contiguous()
        self.register_buffer('relative_position_index', rel_position_index)

        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop_rate)
        self.proj = nn.Linear(embed_dims, embed_dims)
        self.proj_drop = nn.Dropout(proj_drop_rate)

        self.softmax = nn.Softmax(dim=-1)

    def init_weights(self):
        """Draw the relative position bias table from a truncated normal.

        The tensor is initialised directly. ``trunc_normal_init`` is a
        module-level helper that initialises a module's ``weight``/``bias``;
        given a bare ``nn.Parameter`` it finds neither, so the table used to
        stay at its all-zero construction value.
        """
        nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)

    def forward(self, x, mask=None):
        """
        Args:
            x (tensor): input features with shape of (num_windows*B, N, C)
            mask (tensor | None, Optional): mask with shape of (num_windows,
                Wh*Ww, Wh*Ww), value should be between (-inf, 0].
        """
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        # make torchscript happy (cannot use tensor as tuple)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        # Look up one bias per head for every token pair in the window.
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1],
            self.window_size[0] * self.window_size[1],
            -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # The mask is per window position: split the leading axis into
            # (batch, window) so it broadcasts over batch and heads.
            nW = mask.shape[0]
            attn = attn.view(B // nW, nW, self.num_heads, N,
                             N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
        attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    @staticmethod
    def double_step_seq(step1, len1, step2, len2):
        """Return a (1, len1*len2) row holding every sum ``i*step1 + j*step2``
        for ``i < len1`` and ``j < len2``, with ``j`` varying fastest."""
        seq1 = torch.arange(0, step1 * len1, step1)
        seq2 = torch.arange(0, step2 * len2, step2)
        return (seq1[:, None] + seq2[None, :]).reshape(1, -1)


class ShiftWindowMSA(BaseModule):
    """Window attention with an optional cyclic shift of the feature map.

    The input sequence is reshaped to a 2D map, padded to a multiple of the
    window size, optionally rolled by ``shift_size``, split into windows,
    passed through :class:`WindowMSA`, and finally merged, rolled back and
    cropped to the original size.

    Args:
        embed_dims (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): The height and width of the window.
        shift_size (int, optional): The shift step of each window towards
            right-bottom. If zero, act as regular window-msa. Defaults to 0.
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: True
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Defaults: None.
        attn_drop_rate (float, optional): Dropout ratio of attention weight.
            Defaults: 0.
        proj_drop_rate (float, optional): Dropout ratio of output.
            Defaults: 0.
        dropout_layer (dict, optional): The dropout_layer used before output.
            Defaults: dict(type='DropPath', drop_prob=0.).
        init_cfg (dict, optional): The extra config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 window_size,
                 shift_size=0,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop_rate=0,
                 proj_drop_rate=0,
                 dropout_layer=dict(type='DropPath', drop_prob=0.),
                 init_cfg=None):
        super().__init__(init_cfg)

        self.window_size = window_size
        self.shift_size = shift_size
        assert 0 <= self.shift_size < self.window_size

        msa_kwargs = dict(
            embed_dims=embed_dims,
            num_heads=num_heads,
            window_size=to_2tuple(window_size),
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop_rate=attn_drop_rate,
            proj_drop_rate=proj_drop_rate,
            init_cfg=None)
        self.w_msa = WindowMSA(**msa_kwargs)

        self.drop = build_dropout(dropout_layer)

    def _shifted_window_mask(self, padded_h, padded_w, device):
        """Build the additive attention mask used when windows are shifted.

        The padded map is split into a 3x3 grid of regions; positions that
        end up in the same window but come from different regions receive
        -100, all other pairs receive 0.
        """
        ws, shift = self.window_size, self.shift_size
        region_ids = torch.zeros((1, padded_h, padded_w, 1), device=device)
        bands = (slice(0, -ws), slice(-ws, -shift), slice(-shift, None))
        for row_idx, rows in enumerate(bands):
            for col_idx, cols in enumerate(bands):
                region_ids[:, rows, cols, :] = row_idx * len(bands) + col_idx

        # nW, window_size * window_size
        per_window = self.window_partition(region_ids).view(-1, ws * ws)
        diff = per_window.unsqueeze(1) - per_window.unsqueeze(2)
        return torch.zeros_like(diff).masked_fill(diff != 0, float(-100.0))

    def forward(self, query, hw_shape):
        """Run (shifted) window attention on a flattened feature map.

        Args:
            query (Tensor): Input of shape (B, H*W, C).
            hw_shape (tuple[int]): The (H, W) of the feature map.

        Returns:
            Tensor: Output of shape (B, H*W, C).
        """
        batch, seq_len, channels = query.shape
        height, width = hw_shape
        assert seq_len == height * width, 'input feature has wrong size'
        ws, shift = self.window_size, self.shift_size
        feat = query.view(batch, height, width, channels)

        # pad right/bottom so both sides are multiples of the window size
        pad_right = (ws - width % ws) % ws
        pad_bottom = (ws - height % ws) % ws
        feat = F.pad(feat, (0, 0, 0, pad_right, 0, pad_bottom))
        padded_h, padded_w = feat.shape[1], feat.shape[2]

        if shift > 0:
            # cyclic shift towards top-left, plus the matching SW-MSA mask
            feat = torch.roll(feat, shifts=(-shift, -shift), dims=(1, 2))
            attn_mask = self._shifted_window_mask(padded_h, padded_w,
                                                  feat.device)
        else:
            attn_mask = None

        # nW*B, window_size*window_size, C
        windows = self.window_partition(feat).view(-1, ws ** 2, channels)

        # W-MSA/SW-MSA (nW*B, window_size*window_size, C)
        attended = self.w_msa(windows, mask=attn_mask)

        # merge windows back to B H' W' C
        attended = attended.view(-1, ws, ws, channels)
        merged = self.window_reverse(attended, padded_h, padded_w)

        if shift > 0:
            # undo the cyclic shift
            merged = torch.roll(merged, shifts=(shift, shift), dims=(1, 2))

        if pad_right > 0 or pad_bottom:
            # drop the padded rows/columns
            merged = merged[:, :height, :width, :].contiguous()

        out = merged.view(batch, height * width, channels)
        return self.drop(out)

    def window_reverse(self, windows, H, W):
        """Merge windows back into a feature map.

        Args:
            windows: (num_windows*B, window_size, window_size, C)
            H (int): Height of image
            W (int): Width of image
        Returns:
            x: (B, H, W, C)
        """
        ws = self.window_size
        B = int(windows.shape[0] / (H * W / ws / ws))
        grid = windows.view(B, H // ws, W // ws, ws, ws, -1)
        grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
        return grid.view(B, H, W, -1)

    def window_partition(self, x):
        """Split a feature map into non-overlapping windows.

        Args:
            x: (B, H, W, C)
        Returns:
            windows: (num_windows*B, window_size, window_size, C)
        """
        B, H, W, C = x.shape
        ws = self.window_size
        grid = x.view(B, H // ws, ws, W // ws, ws, C)
        grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
        return grid.view(-1, ws, ws, C)


class SwinBlock(BaseModule):
    """One Swin block: (shifted) window attention followed by an FFN.

    Both sub-layers are pre-normalised and wrapped in a residual connection.

    Args:
        embed_dims (int): The feature dimension.
        num_heads (int): Parallel attention heads.
        feedforward_channels (int): The hidden dimension for FFNs.
        window_size (int, optional): The local window scale. Default: 7.
        shift (bool, optional): whether to shift window or not. Default False.
        qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        drop_rate (float, optional): Dropout rate. Default: 0.
        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
        drop_path_rate (float, optional): Stochastic depth rate. Default: 0.
        act_cfg (dict, optional): The config dict of activation function.
            Default: dict(type='GELU').
        norm_cfg (dict, optional): The config dict of normalization.
            Default: dict(type='LN').
        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
            will save some memory while slowing down the training speed.
            Default: False.
        init_cfg (dict | list | None, optional): The init config.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 window_size=7,
                 shift=False,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='LN'),
                 with_cp=False,
                 init_cfg=None):

        super().__init__()

        self.init_cfg = init_cfg
        self.with_cp = with_cp

        # attention branch
        self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
        shift_size = window_size // 2 if shift else 0
        self.attn = ShiftWindowMSA(
            embed_dims=embed_dims,
            num_heads=num_heads,
            window_size=window_size,
            shift_size=shift_size,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop_rate=attn_drop_rate,
            proj_drop_rate=drop_rate,
            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
            init_cfg=None)

        # feed-forward branch
        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
        self.ffn = FFN(
            embed_dims=embed_dims,
            feedforward_channels=feedforward_channels,
            num_fcs=2,
            ffn_drop=drop_rate,
            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
            act_cfg=act_cfg,
            add_identity=True,
            init_cfg=None)

    def forward(self, x, hw_shape):
        """Apply the block to a (B, H*W, C) sequence with map size hw_shape.

        When ``with_cp`` is set and ``x`` requires grad, the computation is
        run through ``cp.checkpoint``.
        """

        def _inner_forward(tokens):
            # attention sub-layer with residual
            shortcut = tokens
            tokens = self.attn(self.norm1(tokens), hw_shape)
            tokens = tokens + shortcut

            # FFN sub-layer; the residual is added inside the FFN
            return self.ffn(self.norm2(tokens), identity=tokens)

        if self.with_cp and x.requires_grad:
            return cp.checkpoint(_inner_forward, x)
        return _inner_forward(x)


class SwinBlockSequence(BaseModule):
    """A stack of Swin blocks forming one stage, with optional downsampling.

    Blocks at odd positions use shifted windows, blocks at even positions
    use regular windows.

    Args:
        embed_dims (int): The feature dimension.
        num_heads (int): Parallel attention heads.
        feedforward_channels (int): The hidden dimension for FFNs.
        depth (int): The number of blocks in this stage.
        window_size (int, optional): The local window scale. Default: 7.
        qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        drop_rate (float, optional): Dropout rate. Default: 0.
        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
        drop_path_rate (float | list[float], optional): Stochastic depth
            rate. Default: 0.
        downsample (BaseModule | None, optional): The downsample operation
            module. Default: None.
        act_cfg (dict, optional): The config dict of activation function.
            Default: dict(type='GELU').
        norm_cfg (dict, optional): The config dict of normalization.
            Default: dict(type='LN').
        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
            will save some memory while slowing down the training speed.
            Default: False.
        init_cfg (dict | list | None, optional): The init config.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 depth,
                 window_size=7,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 downsample=None,
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='LN'),
                 with_cp=False,
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)

        # one stochastic-depth rate per block
        if not isinstance(drop_path_rate, list):
            drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)]
        else:
            drop_path_rates = drop_path_rate
            assert len(drop_path_rates) == depth

        # settings shared by every block of the stage
        shared_kwargs = dict(
            embed_dims=embed_dims,
            num_heads=num_heads,
            feedforward_channels=feedforward_channels,
            window_size=window_size,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            act_cfg=act_cfg,
            norm_cfg=norm_cfg,
            with_cp=with_cp,
            init_cfg=None)

        self.blocks = ModuleList()
        for index in range(depth):
            self.blocks.append(
                SwinBlock(
                    shift=index % 2 != 0,
                    drop_path_rate=drop_path_rates[index],
                    **shared_kwargs))

        self.downsample = downsample

    def forward(self, x, hw_shape):
        """Run all blocks, then the downsample module if one was given.

        Returns:
            tuple: ``(x_next, hw_next, x_stage, hw_stage)`` where the first
            pair feeds the next stage and the second pair is the output of
            this stage before downsampling. Without a downsample module
            both pairs are identical.
        """
        for block in self.blocks:
            x = block(x, hw_shape)

        if not self.downsample:
            return x, hw_shape, x, hw_shape

        x_down, down_hw_shape = self.downsample(x, hw_shape)
        return x_down, down_hw_shape, x, hw_shape


class SwinTransformer(BaseModule):
    """ Swin Transformer
    A PyTorch implement of : `Swin Transformer:
    Hierarchical Vision Transformer using Shifted Windows`  -
        https://arxiv.org/abs/2103.14030
    Inspiration from
    https://github.com/microsoft/Swin-Transformer
    Args:
        pretrain_img_size (int | tuple[int]): The size of input image when
            pretrain. Defaults: 224.
        in_channels (int): The num of input channels.
            Defaults: 3.
        embed_dims (int): The feature dimension. Default: 96.
        patch_size (int | tuple[int]): Patch size. Default: 4.
        window_size (int): Window size. Default: 7.
        mlp_ratio (int): Ratio of mlp hidden dim to embedding dim.
            Default: 4.
        depths (tuple[int]): Depths of each Swin Transformer stage.
            Default: (2, 2, 6, 2).
        num_heads (tuple[int]): Parallel attention heads of each Swin
            Transformer stage. Default: (3, 6, 12, 24).
        strides (tuple[int]): The patch merging or patch embedding stride of
            each Swin Transformer stage. (In swin, we set kernel size equal to
            stride.) Default: (4, 2, 2, 2).
        out_indices (tuple[int]): Output from which stages.
            Default: (0, 1, 2, 3).
        qkv_bias (bool, optional): If True, add a learnable bias to query, key,
            value. Default: True
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        patch_norm (bool): If add a norm layer for patch embed and patch
            merging. Default: True.
        drop_rate (float): Dropout rate. Defaults: 0.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Defaults: 0.1.
        use_abs_pos_embed (bool): If True, add absolute position embedding to
            the patch embedding. Defaults: False.
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='GELU').
        norm_cfg (dict): Config dict for normalization layer at
            output of backbone. Defaults: dict(type='LN').
        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
            will save some memory while slowing down the training speed.
            Default: False.
        pretrained (str, optional): model pretrained path. Deprecated in
            favour of ``init_cfg``; the two cannot be given together.
            Default: None.
        convert_weights (bool): The flag indicates whether the
            pre-trained model is from the original repo. We may need
            to convert some keys to make it compatible.
            Default: False.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        init_cfg (dict, optional): The Config for initialization.
            Defaults to None.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 in_channels=3,
                 embed_dims=96,
                 patch_size=4,
                 window_size=7,
                 mlp_ratio=4,
                 depths=(2, 2, 6, 2),
                 num_heads=(3, 6, 12, 24),
                 strides=(4, 2, 2, 2),
                 out_indices=(0, 1, 2, 3),
                 qkv_bias=True,
                 qk_scale=None,
                 patch_norm=True,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.1,
                 use_abs_pos_embed=False,
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='LN'),
                 with_cp=False,
                 pretrained=None,
                 convert_weights=False,
                 frozen_stages=-1,
                 init_cfg=None):
        self.convert_weights = convert_weights
        self.frozen_stages = frozen_stages
        if isinstance(pretrain_img_size, int):
            pretrain_img_size = to_2tuple(pretrain_img_size)
        elif isinstance(pretrain_img_size, tuple):
            if len(pretrain_img_size) == 1:
                pretrain_img_size = to_2tuple(pretrain_img_size[0])
            assert len(pretrain_img_size) == 2, \
                f'The size of image should have length 1 or 2, ' \
                f'but got {len(pretrain_img_size)}'

        assert not (init_cfg and pretrained), \
            'init_cfg and pretrained cannot be specified at the same time'
        if isinstance(pretrained, str):
            warnings.warn('DeprecationWarning: pretrained is deprecated, '
                          'please use "init_cfg" instead')
            # Translate the legacy argument into an init_cfg and hand it to
            # the base class. Setting ``self.init_cfg`` here and then calling
            # the base ``__init__`` with the raw ``init_cfg`` would overwrite
            # it again, silently dropping the checkpoint path.
            # NOTE(review): relies on BaseModule.__init__ storing its
            # ``init_cfg`` argument on ``self.init_cfg`` - confirm for the
            # BaseModule imported by this file.
            init_cfg = dict(type='Pretrained', checkpoint=pretrained)
        elif pretrained is not None:
            raise TypeError('pretrained must be a str or None')

        super(SwinTransformer, self).__init__(init_cfg=init_cfg)

        num_layers = len(depths)
        self.out_indices = out_indices
        self.use_abs_pos_embed = use_abs_pos_embed

        assert strides[0] == patch_size, 'Use non-overlapping patch embed.'

        self.patch_embed = PatchEmbed(
            in_channels=in_channels,
            embed_dims=embed_dims,
            conv_type='Conv2d',
            kernel_size=patch_size,
            stride=strides[0],
            norm_cfg=norm_cfg if patch_norm else None,
            init_cfg=None)

        if self.use_abs_pos_embed:
            patch_row = pretrain_img_size[0] // patch_size
            patch_col = pretrain_img_size[1] // patch_size
            num_patches = patch_row * patch_col
            self.absolute_pos_embed = nn.Parameter(
                torch.zeros((1, num_patches, embed_dims)))

        self.drop_after_pos = nn.Dropout(p=drop_rate)

        # set stochastic depth decay rule
        total_depth = sum(depths)
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, total_depth)
        ]

        self.stages = ModuleList()
        in_channels = embed_dims
        for i in range(num_layers):
            # every stage except the last one ends with a patch merging
            if i < num_layers - 1:
                downsample = PatchMerging(
                    in_channels=in_channels,
                    out_channels=2 * in_channels,
                    stride=strides[i + 1],
                    norm_cfg=norm_cfg if patch_norm else None,
                    init_cfg=None)
            else:
                downsample = None

            stage = SwinBlockSequence(
                embed_dims=in_channels,
                num_heads=num_heads[i],
                feedforward_channels=mlp_ratio * in_channels,
                depth=depths[i],
                window_size=window_size,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop_rate=drop_rate,
                attn_drop_rate=attn_drop_rate,
                drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])],
                downsample=downsample,
                act_cfg=act_cfg,
                norm_cfg=norm_cfg,
                with_cp=with_cp,
                init_cfg=None)
            self.stages.append(stage)
            if downsample:
                in_channels = downsample.out_channels

        self.num_features = [int(embed_dims * 2 ** i) for i in range(num_layers)]
        # Add a norm layer for each output
        for i in out_indices:
            layer = build_norm_layer(norm_cfg, self.num_features[i])[1]
            layer_name = f'norm{i}'
            self.add_module(layer_name, layer)

    def train(self, mode=True):
        """Convert the model into training mode while keeping frozen layers
        frozen.

        Returns:
            SwinTransformer: ``self``, matching ``nn.Module.train``.
        """
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()
        return self

    def _freeze_stages(self):
        """Put the first ``frozen_stages`` stages in eval mode and stop
        their gradients.

        With ``frozen_stages >= 0`` the patch embedding, the absolute
        position embedding (if used) and the following dropout are frozen
        as well.
        """
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False
            if self.use_abs_pos_embed:
                self.absolute_pos_embed.requires_grad = False
            self.drop_after_pos.eval()

        for i in range(1, self.frozen_stages + 1):

            # the output norm of a frozen stage is frozen too
            if (i - 1) in self.out_indices:
                norm_layer = getattr(self, f'norm{i - 1}')
                norm_layer.eval()
                for param in norm_layer.parameters():
                    param.requires_grad = False

            m = self.stages[i - 1]
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

    def init_weights(self):
        """Initialise the backbone from scratch or from a checkpoint.

        Without ``init_cfg`` the linear and layer-norm layers are
        initialised in place. Otherwise the checkpoint named by
        ``init_cfg['checkpoint']`` is loaded non-strictly, keeping only the
        keys that start with ``backbone.`` and resizing relative position
        bias tables whose length differs from the current model.
        """
        logger = get_root_logger()
        if self.init_cfg is None:
            # ``Logger.warn`` is a deprecated alias of ``Logger.warning``.
            logger.warning(f'No pre-trained weights for '
                           f'{self.__class__.__name__}, '
                           f'training start from scratch')
            # NOTE(review): the init helpers below receive tensors rather
            # than modules - confirm the imported ``trunc_normal_init`` /
            # ``constant_init`` accept tensors.
            if self.use_abs_pos_embed:
                trunc_normal_init(self.absolute_pos_embed, std=0.02)
            for m in self.modules():
                if isinstance(m, nn.Linear):
                    trunc_normal_init(m.weight, std=.02)
                    if m.bias is not None:
                        constant_init(m.bias, 0)
                elif isinstance(m, nn.LayerNorm):
                    constant_init(m.bias, 0)
                    constant_init(m.weight, 1.0)
        else:
            assert 'checkpoint' in self.init_cfg, f'Only support ' \
                                                  f'specify `Pretrained` in ' \
                                                  f'`init_cfg` in ' \
                                                  f'{self.__class__.__name__} '
            # Item access works for both plain dicts (e.g. the one built
            # from ``pretrained``) and attribute-style config dicts.
            ckpt = _load_checkpoint(
                self.init_cfg['checkpoint'], logger=logger, map_location='cpu')
            if 'state_dict' in ckpt:
                _state_dict = ckpt['state_dict']
            elif 'model' in ckpt:
                _state_dict = ckpt['model']
            else:
                _state_dict = ckpt

            # keep only the backbone weights and drop the prefix
            state_dict = OrderedDict()
            for k, v in _state_dict.items():
                if k.startswith('backbone.'):
                    state_dict[k[9:]] = v

            if self.convert_weights:
                # supported loading weight from original repo,
                state_dict = swin_converter(state_dict)

            # strip prefix of state_dict
            # NOTE(review): this raises IndexError when the checkpoint has
            # no key starting with 'backbone.' - confirm whether such
            # checkpoints need to be supported.
            if list(state_dict.keys())[0].startswith('module.'):
                state_dict = {k[7:]: v for k, v in state_dict.items()}

            # check the absolute position embedding
            if self.use_abs_pos_embed and \
                    state_dict.get('absolute_pos_embed') is not None:
                absolute_pos_embed = state_dict['absolute_pos_embed']
                # The parameter created in ``__init__`` has shape
                # (1, num_patches, embed_dims), so a matching checkpoint
                # tensor can be loaded as is; no layout conversion is
                # needed, only a shape check.
                if absolute_pos_embed.size() != \
                        self.absolute_pos_embed.size():
                    logger.warning('Error in loading absolute_pos_embed, pass')

            # interpolate position bias table if needed
            relative_position_bias_table_keys = [
                k for k in state_dict.keys()
                if 'relative_position_bias_table' in k
            ]
            for table_key in relative_position_bias_table_keys:
                table_pretrained = state_dict[table_key]
                table_current = self.state_dict()[table_key]
                L1, nH1 = table_pretrained.size()
                L2, nH2 = table_current.size()
                if nH1 != nH2:
                    logger.warning(f'Error in loading {table_key}, pass')
                elif L1 != L2:
                    # tables are treated as square (S x S) maps per head
                    S1 = int(L1 ** 0.5)
                    S2 = int(L2 ** 0.5)
                    table_pretrained_resized = F.interpolate(
                        table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1),
                        size=(S2, S2),
                        mode='bicubic')
                    state_dict[table_key] = table_pretrained_resized.view(
                        nH2, L2).permute(1, 0).contiguous()

            # load state_dict
            self.load_state_dict(state_dict, False)

    def forward(self, x):
        """Extract multi-stage features.

        Args:
            x (Tensor): Input passed to the patch embedding.

        Returns:
            list[Tensor]: One normalised (B, C, H, W) map for every stage
            index in ``out_indices``.
        """
        x, hw_shape = self.patch_embed(x)

        if self.use_abs_pos_embed:
            x = x + self.absolute_pos_embed
        x = self.drop_after_pos(x)

        outs = []
        for i, stage in enumerate(self.stages):
            x, hw_shape, out, out_hw_shape = stage(x, hw_shape)
            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                out = norm_layer(out)
                # (B, H*W, C) -> (B, C, H, W)
                out = out.view(-1, *out_hw_shape,
                               self.num_features[i]).permute(0, 3, 1,
                                                             2).contiguous()
                outs.append(out)

        return outs


class SwinRFPLayer(BaseModule):
    """One Swin Transformer stage that can also take a feedback feature.

    ``forward`` runs the blocks and the optional downsample module.
    ``rfp_forward`` additionally adds a projected feedback feature to the
    block output when ``rfp_inplanes`` is set.

    Args:
        embed_dims (int): The feature dimension.
        num_heads (int): Parallel attention heads.
        feedforward_channels (int): The hidden dimension for FFNs.
        depth (int): The number of blocks in this stage.
        window_size (int, optional): The local window scale. Default: 7.
        qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        drop_rate (float, optional): Dropout rate. Default: 0.
        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
        drop_path_rate (float | list[float], optional): Stochastic depth
            rate. Default: 0.
        downsample (BaseModule | None, optional): The downsample operation
            module. Default: None.
        act_cfg (dict, optional): The config dict of activation function.
            Default: dict(type='GELU').
        norm_cfg (dict, optional): The config dict of normalization.
            Default: dict(type='LN').
        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
            will save some memory while slowing down the training speed.
            Default: False.
        rfp_inplanes (int | None, optional): Channels of the feedback
            feature. When set, a kernel-size-1 conv layer maps the feedback
            feature to ``embed_dims`` channels. Default: None.
        init_cfg (dict | list | None, optional): The init config.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 depth,
                 window_size=7,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 downsample=None,
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='LN'),
                 with_cp=False,
                 # Added
                 rfp_inplanes=None,
                 # Added Done
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)

        # one stochastic-depth rate per block
        if not isinstance(drop_path_rate, list):
            drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)]
        else:
            drop_path_rates = drop_path_rate
            assert len(drop_path_rates) == depth

        # settings shared by every block of the stage
        shared_kwargs = dict(
            embed_dims=embed_dims,
            num_heads=num_heads,
            feedforward_channels=feedforward_channels,
            window_size=window_size,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            act_cfg=act_cfg,
            norm_cfg=norm_cfg,
            with_cp=with_cp,
            init_cfg=None)

        self.blocks = ModuleList()
        for index in range(depth):
            self.blocks.append(
                SwinBlock(
                    shift=index % 2 != 0,
                    drop_path_rate=drop_path_rates[index],
                    **shared_kwargs))

        self.downsample = downsample

        self.rfp_inplanes = rfp_inplanes
        if self.rfp_inplanes:
            # projects the feedback feature to this stage's width
            self.rfp_conv = build_conv_layer(
                None,
                self.rfp_inplanes,
                embed_dims,
                1,
                stride=1,
                bias=True)

    def _run_blocks(self, x, hw_shape):
        """Pass ``x`` through every block of the stage in order."""
        for block in self.blocks:
            x = block(x, hw_shape)
        return x

    def _stage_outputs(self, x, hw_shape):
        """Return ``(x_next, hw_next, x_stage, hw_stage)`` for this stage."""
        if not self.downsample:
            return x, hw_shape, x, hw_shape
        x_down, down_hw_shape = self.downsample(x, hw_shape)
        return x_down, down_hw_shape, x, hw_shape

    def forward(self, x, hw_shape):
        """Run the blocks, then the downsample module if one was given."""
        x = self._run_blocks(x, hw_shape)
        return self._stage_outputs(x, hw_shape)

    def rfp_forward(self, x, hw_shape, rfp_feat):
        """Like ``forward`` but adds the projected feedback feature.

        ``rfp_feat`` is only used when ``rfp_inplanes`` is set; it is passed
        through ``rfp_conv``, moved to channel-last and flattened to
        (B, H*W, C) before being added to the block output.
        """
        x = self._run_blocks(x, hw_shape)

        num_tokens = hw_shape[0] * hw_shape[1]
        if self.rfp_inplanes:
            feedback = self.rfp_conv(rfp_feat).permute((0, 2, 3, 1))
            feedback = feedback.view(x.shape[0], num_tokens, x.shape[2])
            x = x + feedback.contiguous()

        return self._stage_outputs(x, hw_shape)


@BACKBONES.register_module()
class SwinTransformerRFP(SwinTransformer):
    """Swin Transformer whose stages are :class:`SwinRFPLayer` modules.

    On top of the :class:`SwinTransformer` arguments it accepts:

    Args:
        rfp_inplanes (int | None): Channels of the feedback features; given
            to every stage except the first one. Default: None.
        output_img (bool): If True, ``forward`` puts the input image in
            front of the returned features. Default: False.
    """

    def __init__(
            self,
            rfp_inplanes=None,
            output_img=False,
            # Old settings
            pretrain_img_size=224,
            in_channels=3,
            embed_dims=96,
            patch_size=4,
            window_size=7,
            mlp_ratio=4,
            depths=(2, 2, 6, 2),
            num_heads=(3, 6, 12, 24),
            strides=(4, 2, 2, 2),
            out_indices=(0, 1, 2, 3),
            qkv_bias=True,
            qk_scale=None,
            patch_norm=True,
            drop_rate=0.,
            attn_drop_rate=0.,
            drop_path_rate=0.1,
            use_abs_pos_embed=False,
            act_cfg=dict(type='GELU'),
            norm_cfg=dict(type='LN'),
            with_cp=False,
            pretrained=None,
            convert_weights=False,
            frozen_stages=-1,
            init_cfg=None):
        self.rfp_inplanes = rfp_inplanes
        self.output_img = output_img
        super().__init__(
            pretrain_img_size=pretrain_img_size,
            in_channels=in_channels,
            embed_dims=embed_dims,
            patch_size=patch_size,
            window_size=window_size,
            mlp_ratio=mlp_ratio,
            depths=depths,
            num_heads=num_heads,
            strides=strides,
            out_indices=out_indices,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            patch_norm=patch_norm,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate,
            use_abs_pos_embed=use_abs_pos_embed,
            act_cfg=act_cfg,
            norm_cfg=norm_cfg,
            with_cp=with_cp,
            pretrained=pretrained,
            convert_weights=convert_weights,
            frozen_stages=frozen_stages,
            init_cfg=init_cfg
        )

        # Replace the stages built by the parent with RFP-aware ones.
        num_layers = len(depths)
        # stochastic depth rates, linearly increasing over all blocks
        dpr = [
            rate.item()
            for rate in torch.linspace(0, drop_path_rate, sum(depths))
        ]

        self.stages = ModuleList()
        stage_channels = embed_dims
        for i in range(num_layers):
            is_last_stage = i >= num_layers - 1
            if is_last_stage:
                downsample = None
            else:
                downsample = PatchMerging(
                    in_channels=stage_channels,
                    out_channels=2 * stage_channels,
                    stride=strides[i + 1],
                    norm_cfg=norm_cfg if patch_norm else None,
                    init_cfg=None)

            first_block = sum(depths[:i])
            stage = SwinRFPLayer(
                embed_dims=stage_channels,
                num_heads=num_heads[i],
                feedforward_channels=mlp_ratio * stage_channels,
                depth=depths[i],
                window_size=window_size,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop_rate=drop_rate,
                attn_drop_rate=attn_drop_rate,
                drop_path_rate=dpr[first_block:first_block + depths[i]],
                downsample=downsample,
                act_cfg=act_cfg,
                norm_cfg=norm_cfg,
                with_cp=with_cp,
                # the first stage never receives a feedback feature
                rfp_inplanes=rfp_inplanes if i > 0 else None,
                init_cfg=None)
            self.stages.append(stage)
            if downsample:
                stage_channels = downsample.out_channels

    def forward(self, x):
        """Forward function.

        Returns:
            tuple[Tensor]: The parent's multi-stage features, preceded by
            the input ``x`` when ``output_img`` is set.
        """
        features = list(super().forward(x))
        if self.output_img:
            features.insert(0, x)
        return tuple(features)

    def rfp_forward(self, x, rfp_feats):
        """Forward pass that feeds ``rfp_feats[i]`` into stage ``i``.

        Stage 0 gets no feedback feature. Returns a list with one
        normalised (B, C, H, W) map per stage index in ``out_indices``.
        """
        x, hw_shape = self.patch_embed(x)

        if self.use_abs_pos_embed:
            x = x + self.absolute_pos_embed
        x = self.drop_after_pos(x)

        outs = []
        for i, stage in enumerate(self.stages):
            feedback = None if i == 0 else rfp_feats[i]
            x, hw_shape, out, out_hw_shape = stage.rfp_forward(
                x, hw_shape, feedback)
            if i not in self.out_indices:
                continue
            out = getattr(self, f'norm{i}')(out)
            # (B, H*W, C) -> (B, C, H, W)
            out = out.view(-1, *out_hw_shape, self.num_features[i])
            outs.append(out.permute(0, 3, 1, 2).contiguous())

        return outs


================================================
FILE: swin/transformer.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import math
import warnings
from typing import Sequence

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (build_activation_layer, build_conv_layer,
                      build_norm_layer, xavier_init)
from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER,
                                      TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
                                         TransformerLayerSequence,
                                         build_transformer_layer_sequence)
from mmcv.runner.base_module import BaseModule
from mmcv.utils import to_2tuple
from torch.nn.init import normal_

from mmdet.models.utils.builder import TRANSFORMER

try:
    from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention

except ImportError:
    warnings.warn(
        '`MultiScaleDeformableAttention` in MMCV has been moved to '
        '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV')
    from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention


def nlc_to_nchw(x, hw_shape):
    """Reshape a sequence tensor [N, L, C] into a feature map [N, C, H, W].

    Args:
        x (Tensor): Input of shape [N, L, C].
        hw_shape (Sequence[int]): Target ``(H, W)``; ``H * W`` must equal L.

    Returns:
        Tensor: Contiguous tensor of shape [N, C, H, W].
    """
    height, width = hw_shape
    assert len(x.shape) == 3
    batch, seq_len, channels = x.shape
    assert seq_len == height * width, 'The seq_len does not match H, W'
    channel_first = x.transpose(1, 2)
    return channel_first.reshape(batch, channels, height, width).contiguous()


def nchw_to_nlc(x):
    """Flatten a feature map [N, C, H, W] into a sequence tensor [N, L, C].

    Args:
        x (Tensor): Input of shape [N, C, H, W].

    Returns:
        Tensor: Contiguous tensor of shape [N, H * W, C].
    """
    assert len(x.shape) == 4
    spatial_flat = x.flatten(2)
    return spatial_flat.transpose(1, 2).contiguous()


class AdaptivePadding(nn.Module):
    """Zero-pad the input so the configured filter covers it completely.

    Two modes are supported. "corner" appends all zeros at the bottom and
    right. "same" splits the zeros around the input (like TensorFlow's
    "SAME" mode), putting the extra element on the bottom/right when the
    amount is odd.

    Args:
        kernel_size (int | tuple): Size of the kernel. Default: 1.
        stride (int | tuple): Stride of the filter. Default: 1.
        dilation (int | tuple): Spacing between kernel elements.
            Default: 1.
        padding (str): Either "same" or "corner". Default: "corner".

    Example:
        >>> kernel_size = 16
        >>> stride = 16
        >>> dilation = 1
        >>> input = torch.rand(1, 1, 15, 17)
        >>> adap_pad = AdaptivePadding(
        >>>     kernel_size=kernel_size,
        >>>     stride=stride,
        >>>     dilation=dilation,
        >>>     padding="corner")
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
        >>> input = torch.rand(1, 1, 16, 17)
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
    """

    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
        super().__init__()

        assert padding in ('same', 'corner')

        self.kernel_size = to_2tuple(kernel_size)
        self.stride = to_2tuple(stride)
        # NOTE(review): `padding` is a string here and forward() compares it
        # with 'corner'/'same', so to_2tuple presumably returns strings
        # unchanged -- confirm against the installed mmcv.
        self.padding = to_2tuple(padding)
        self.dilation = to_2tuple(dilation)

    def get_pad_shape(self, input_shape):
        """Return ``(pad_h, pad_w)`` needed for an input of ``(H, W)``."""
        in_h, in_w = input_shape
        k_h, k_w = self.kernel_size
        s_h, s_w = self.stride
        # Extent reached by the last window: its start offset plus the
        # dilated kernel span.
        needed_h = ((math.ceil(in_h / s_h) - 1) * s_h +
                    (k_h - 1) * self.dilation[0] + 1)
        needed_w = ((math.ceil(in_w / s_w) - 1) * s_w +
                    (k_w - 1) * self.dilation[1] + 1)
        return max(needed_h - in_h, 0), max(needed_w - in_w, 0)

    def forward(self, x):
        """Pad the last two dimensions of ``x`` according to the mode."""
        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
        if pad_h <= 0 and pad_w <= 0:
            return x
        if self.padding == 'corner':
            return F.pad(x, [0, pad_w, 0, pad_h])
        if self.padding == 'same':
            left, top = pad_w // 2, pad_h // 2
            return F.pad(x, [left, pad_w - left, top, pad_h - top])
        return x


class PatchEmbed(BaseModule):
    """Image to Patch Embedding.

    A single conv layer projects the image to patch embeddings, which are
    then flattened into a token sequence.

    Args:
        in_channels (int): The num of input channels. Default: 3
        embed_dims (int): The dimensions of embedding. Default: 768
        conv_type (str): Type name of the embedding conv layer.
            Default: "Conv2d".
        kernel_size (int): The kernel_size of embedding conv. Default: 16.
        stride (int | None): The stride of embedding conv. When None it is
            set to `kernel_size`. Default: 16.
        padding (int | tuple | string): The padding length of
            embedding conv. When it is a string, it selects the mode
            of adaptive padding ("same" or "corner") and the conv itself
            uses no padding. Default: "corner".
        dilation (int): The dilation rate of embedding conv. Default: 1.
        bias (bool): Bias of embed conv. Default: True.
        norm_cfg (dict, optional): Config dict for normalization layer.
            Default: None.
        input_size (int | tuple | None): The size of input, used to
            precompute `init_out_size`. Default: None.
        init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
            Default: None.
    """

    def __init__(
        self,
        in_channels=3,
        embed_dims=768,
        conv_type='Conv2d',
        kernel_size=16,
        stride=16,
        padding='corner',
        dilation=1,
        bias=True,
        norm_cfg=None,
        input_size=None,
        init_cfg=None,
    ):
        super().__init__(init_cfg=init_cfg)

        self.embed_dims = embed_dims

        kernel_size = to_2tuple(kernel_size)
        stride = to_2tuple(kernel_size if stride is None else stride)
        dilation = to_2tuple(dilation)

        # A string selects adaptive padding; the conv then pads nothing.
        adaptive = isinstance(padding, str)
        self.adap_padding = AdaptivePadding(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding) if adaptive else None
        padding = to_2tuple(0 if adaptive else padding)

        self.projection = build_conv_layer(
            dict(type=conv_type),
            in_channels=in_channels,
            out_channels=embed_dims,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        self.norm = (
            build_norm_layer(norm_cfg, embed_dims)[1]
            if norm_cfg is not None else None)

        if not input_size:
            self.init_input_size = None
            self.init_out_size = None
        else:
            input_size = to_2tuple(input_size)
            # `init_out_size` lets callers compute the number of patches,
            # e.g. when an absolute position embedding is used outside.
            self.init_input_size = input_size
            if self.adap_padding:
                pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
                in_h, in_w = input_size
                input_size = (in_h + pad_h, in_w + pad_w)

            # Conv2d output-size formula, see
            # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
            out_h = (input_size[0] + 2 * padding[0] - dilation[0] *
                     (kernel_size[0] - 1) - 1) // stride[0] + 1
            out_w = (input_size[1] + 2 * padding[1] - dilation[1] *
                     (kernel_size[1] - 1) - 1) // stride[1] + 1
            self.init_out_size = (out_h, out_w)

    def forward(self, x):
        """
        Args:
            x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
        Returns:
            tuple: Contains merged results and its spatial shape.
                - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
                - out_size (tuple[int]): Spatial shape of x, arrange as
                    (out_h, out_w).
        """
        if self.adap_padding:
            x = self.adap_padding(x)

        feat = self.projection(x)
        out_size = (feat.shape[2], feat.shape[3])
        # (B, C, out_h, out_w) -> (B, out_h * out_w, C)
        tokens = feat.flatten(2).transpose(1, 2)
        if self.norm is not None:
            tokens = self.norm(tokens)
        return tokens, out_size


class PatchMerging(BaseModule):
    """Merge patch feature map.

    The feature map is grouped by `kernel_size` with `nn.Unfold`; an
    optional norm and a linear layer are then applied to every group.
    The original comments note that this is about 25% faster than the
    original method but requires pretrained models to be modified for
    compatibility.

    Args:
        in_channels (int): The num of input channels.
        out_channels (int): The num of output channels.
        kernel_size (int | tuple, optional): the kernel size in the unfold
            layer. Defaults to 2.
        stride (int | tuple, optional): the stride of the sliding blocks in the
            unfold layer. Default: None. (Would be set as `kernel_size`)
        padding (int | tuple | string ): The padding length of the unfold
            layer. When it is a string, it selects the mode of adaptive
            padding ("same" or "corner") and the unfold layer itself uses
            no padding. Default: "corner".
        dilation (int | tuple, optional): dilation parameter in the unfold
            layer. Default: 1.
        bias (bool, optional): Whether to add bias in linear layer or not.
            Defaults: False.
        norm_cfg (dict, optional): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (dict, optional): The extra config for initialization.
            Default: None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=2,
                 stride=None,
                 padding='corner',
                 dilation=1,
                 bias=False,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        self.in_channels = in_channels
        self.out_channels = out_channels

        # A falsy stride (e.g. None) falls back to the kernel size.
        stride = to_2tuple(stride or kernel_size)
        kernel_size = to_2tuple(kernel_size)
        dilation = to_2tuple(dilation)

        # A string selects adaptive padding; unfold then pads nothing.
        adaptive = isinstance(padding, str)
        self.adap_padding = AdaptivePadding(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding) if adaptive else None
        padding = to_2tuple(0 if adaptive else padding)

        self.sampler = nn.Unfold(
            kernel_size=kernel_size,
            dilation=dilation,
            padding=padding,
            stride=stride)

        # Every unfolded column stacks one kernel window of all channels.
        sample_dim = kernel_size[0] * kernel_size[1] * in_channels

        self.norm = (
            build_norm_layer(norm_cfg, sample_dim)[1]
            if norm_cfg is not None else None)

        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)

    def forward(self, x, input_size):
        """
        Args:
            x (Tensor): Has shape (B, H*W, C_in).
            input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
        Returns:
            tuple: Contains merged results and its spatial shape.
                - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
                - out_size (tuple[int]): Spatial shape of x, arrange as
                    (Merged_H, Merged_W).
        """
        batch, seq_len, channels = x.shape
        assert isinstance(input_size, Sequence), \
            f'Expect input_size is `Sequence` but get {input_size}'

        height, width = input_size
        assert seq_len == height * width, 'input feature has wrong size'

        # B, H*W, C -> B, C, H, W
        feat = x.view(batch, height, width, channels).permute([0, 3, 1, 2])

        if self.adap_padding:
            feat = self.adap_padding(feat)
            height, width = feat.shape[-2:]

        # Output grid of the unfold op (same formula as for a conv).
        unfold = self.sampler
        pad, dil = unfold.padding, unfold.dilation
        ksize, step = unfold.kernel_size, unfold.stride
        out_h = (height + 2 * pad[0] - dil[0] * (ksize[0] - 1) -
                 1) // step[0] + 1
        out_w = (width + 2 * pad[1] - dil[1] * (ksize[1] - 1) -
                 1) // step[1] + 1

        # With kernel_size=2 and stride=2 the unfolded tensor has shape
        # (B, 4*C, H/2*W/2); move the grouped channels last.
        merged = unfold(feat).transpose(1, 2)
        if self.norm:
            merged = self.norm(merged)
        return self.reduction(merged), (out_h, out_w)


def inverse_sigmoid(x, eps=1e-5):
    """Inverse function of sigmoid (the logit), with clamping.

    The input is first clamped to [0, 1]; numerator and denominator are
    then each bounded below by ``eps`` so the log stays finite.

    Args:
        x (Tensor): The tensor to invert.
        eps (float): Lower bound applied to ``x`` and ``1 - x``.
            Defaults 1e-5.

    Returns:
        Tensor: ``log(x / (1 - x))`` of the clamped input, same shape as
            the input.
    """
    clipped = torch.clamp(x, min=0, max=1)
    numerator = torch.clamp(clipped, min=eps)
    denominator = torch.clamp(1 - clipped, min=eps)
    return (numerator / denominator).log()


================================================
FILE: tools/dataset/cityscapes_instance_idmap.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os.path as osp

import mmcv
from cityscapesscripts.preparation.json2instanceImg import json2instanceImg


def convert_json_to_label(json_file):
    """Render one polygon annotation json as an instance trainId png.

    The png path is derived from ``json_file`` by replacing
    ``_polygons.json`` with ``_instanceTrainIds.png``, so the image is
    written next to the annotation file.

    Args:
        json_file (str): Path to a ``*_polygons.json`` annotation file.
    """
    target_png = json_file.replace('_polygons.json', '_instanceTrainIds.png')
    json2instanceImg(json_file, target_png, 'trainIds')


def parse_args():
    """Parse the command line of the Cityscapes trainId conversion tool.

    Returns:
        argparse.Namespace: With attributes ``cityscapes_path``, ``gt_dir``,
            ``out_dir`` and ``nproc``.
    """
    option_specs = (
        (('cityscapes_path', ), dict(help='cityscapes data path')),
        (('--gt-dir', ), dict(default='gtFine', type=str)),
        (('-o', '--out-dir'), dict(help='output path')),
        (('--nproc', ), dict(default=1, type=int, help='number of process')),
    )
    parser = argparse.ArgumentParser(
        description='Convert Cityscapes annotations to TrainIds')
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()


def main():
    """Convert every ``*_polygons.json`` under the ground-truth directory.

    The files are processed with ``convert_json_to_label``, in parallel when
    ``--nproc`` is greater than 1.
    """
    args = parse_args()
    data_root = args.cityscapes_path
    # NOTE(review): the output directory is created here but not otherwise
    # used in this function -- confirm whether that is intended.
    mmcv.mkdir_or_exist(args.out_dir or data_root)

    gt_dir = osp.join(data_root, args.gt_dir)
    poly_files = [
        osp.join(gt_dir, rel_path)
        for rel_path in mmcv.scandir(gt_dir, '_polygons.json', recursive=True)
    ]

    if args.nproc > 1:
        mmcv.track_parallel_progress(convert_json_to_label, poly_files,
                                     args.nproc)
    else:
        mmcv.track_progress(convert_json_to_label, poly_files)


# Requires mmcv and cityscapesscripts to be installed.
# Example: python cityscapes_instance_idmap.py {PATH/TO/CITYSCAPES} --nproc 56
if __name__ == '__main__':
    main()


================================================
FILE: tools/dataset/youtubevis2coco.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy
import os
import os.path as osp
from collections import defaultdict

import mmcv


def parse_args():
    """Parse the command line of the YouTube-VIS to COCO converter.

    Returns:
        argparse.Namespace: With attributes ``input``, ``output`` and
            ``version`` (``'2019'`` or ``'2021'``).
    """
    option_specs = (
        (('-i', '--input'),
         dict(help='root directory of YouTube-VIS annotations')),
        (('-o', '--output'),
         dict(help='directory to save coco formatted label file')),
        (('--version', ),
         dict(
             choices=['2019', '2021'],
             help='The version of YouTube-VIS Dataset')),
    )
    parser = argparse.ArgumentParser(
        description='YouTube-VIS to COCO Video format')
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()


def convert_vis(ann_dir, save_dir, dataset_version, mode='train'):
    """Convert YouTube-VIS dataset in COCO style.

    The official annotation file is loaded, every frame becomes one image
    record and (for the train split only) every non-empty per-frame box of
    a track becomes one annotation record. The result is dumped to
    ``youtube_vis_{dataset_version}_{mode}.json`` inside ``save_dir`` and
    a short summary is printed.

    Args:
        ann_dir (str): The path of YouTube-VIS dataset.
        save_dir (str): The path to save `VIS`. Created when missing.
        dataset_version (str): The version of dataset. Options are '2019',
            '2021'.
        mode (str): Convert train dataset or validation dataset or test
            dataset. Options are 'train', 'valid', 'test'. Default: 'train'.
    """
    assert dataset_version in ['2019', '2021']
    assert mode in ['train', 'valid', 'test']
    VIS = defaultdict(list)
    records = dict(vid_id=1, img_id=1, ann_id=1, global_instance_id=1)
    # Categories without any object read back as 0, so the per-class
    # summary below cannot raise KeyError.
    obj_num_classes = defaultdict(int)

    # The two dataset versions lay out their annotation files differently.
    if dataset_version == '2019':
        official_anns = mmcv.load(osp.join(ann_dir, f'{mode}.json'))
    elif dataset_version == '2021':
        official_anns = mmcv.load(osp.join(ann_dir, mode, 'instances.json'))
    VIS['categories'] = copy.deepcopy(official_anns['categories'])

    # Only the train split is converted with annotations.
    has_annotations = mode == 'train'
    if has_annotations:
        vid_to_anns = defaultdict(list)
        for ann_info in official_anns['annotations']:
            vid_to_anns[ann_info['video_id']].append(ann_info)

    video_infos = official_anns['videos']
    for video_info in video_infos:
        # The first path component of a frame path names the video.
        video_name = video_info['file_names'][0].split('/')[0]
        video = dict(id=video_info['id'], name=video_name)
        VIS['videos'].append(video)

        num_frames = len(video_info['file_names'])
        width = video_info['width']
        height = video_info['height']
        if has_annotations:
            ann_infos_in_video = vid_to_anns[video_info['id']]
            # Maps a track id of this video to a global instance id.
            instance_id_maps = dict()

        for frame_id in range(num_frames):
            image = dict(
                file_name=video_info['file_names'][frame_id],
                height=height,
                width=width,
                id=records['img_id'],
                frame_id=frame_id,
                video_id=video_info['id'])
            VIS['images'].append(image)

            if has_annotations:
                for ann_info in ann_infos_in_video:
                    bbox = ann_info['bboxes'][frame_id]
                    # A missing box means the track has no entry here.
                    if bbox is None:
                        continue

                    category_id = ann_info['category_id']
                    track_id = ann_info['id']
                    segmentation = ann_info['segmentations'][frame_id]
                    area = ann_info['areas'][frame_id]
                    assert isinstance(category_id, int)
                    assert isinstance(track_id, int)
                    assert segmentation is not None
                    assert area is not None

                    if track_id in instance_id_maps:
                        instance_id = instance_id_maps[track_id]
                    else:
                        instance_id = records['global_instance_id']
                        records['global_instance_id'] += 1
                        instance_id_maps[track_id] = instance_id

                    ann = dict(
                        id=records['ann_id'],
                        video_id=video_info['id'],
                        image_id=records['img_id'],
                        category_id=category_id,
                        instance_id=instance_id,
                        bbox=bbox,
                        segmentation=segmentation,
                        area=area,
                        iscrowd=ann_info['iscrowd'])

                    obj_num_classes[category_id] += 1

                    VIS['annotations'].append(ann)
                    records['ann_id'] += 1
            records['img_id'] += 1
        records['vid_id'] += 1

    # exist_ok avoids a race between an existence check and the creation.
    os.makedirs(save_dir, exist_ok=True)
    mmcv.dump(VIS,
              osp.join(save_dir, f'youtube_vis_{dataset_version}_{mode}.json'))
    print(f'-----YouTube VIS {dataset_version} {mode}------')
    print(f'{records["vid_id"]- 1} videos')
    print(f'{records["img_id"]- 1} images')
    if has_annotations:
        print(f'{records["ann_id"] - 1} objects')
        print(f'{records["global_instance_id"] - 1} instances')
    print('-----------------------')
    if has_annotations:
        # The 1-based position in the category list is used as the
        # category id, as in the original summary.
        for i, category in enumerate(VIS['categories'], start=1):
            class_name = category['name']
            print(f'Class {i} {class_name} has {obj_num_classes[i]} objects.')


def main():
    """Convert the train, valid and test splits one after another."""
    args = parse_args()
    for sub_set in ('train', 'valid', 'test'):
        convert_vis(args.input, args.output, args.version, mode=sub_set)


# Run the conversion only when executed as a script, not on import.
if __name__ == '__main__':
    main()


================================================
FILE: tools/dist_step_test.sh
================================================
#!/usr/bin/env bash
# Launch test_step.py with torch.distributed.launch.
# Usage: dist_step_test.sh CONFIG CHECKPOINT GPUS [extra test_step.py args...]
# The master port defaults to 29500 and can be overridden via $PORT.

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}

# Path-like expansions are quoted so locations containing spaces work.
# ${@:4} stays unquoted on purpose to keep the existing splitting of the
# forwarded arguments.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/test_step.py" "$CONFIG" "$CHECKPOINT" --launcher pytorch ${@:4}

================================================
FILE: tools/dist_test.sh
================================================
#!/usr/bin/env bash
# Launch test.py with torch.distributed.launch.
# Usage: dist_test.sh CONFIG CHECKPOINT GPUS [extra test.py args...]
# The master port defaults to 29500 and can be overridden via $PORT.

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}

# Path-like expansions are quoted so locations containing spaces work.
# ${@:4} stays unquoted on purpose to keep the existing splitting of the
# forwarded arguments.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/test.py" "$CONFIG" "$CHECKPOINT" --launcher pytorch ${@:4}


================================================
FILE: tools/dist_train.sh
================================================
#!/usr/bin/env bash
# Launch train.py with torch.distributed.launch.
# Usage: dist_train.sh CONFIG GPUS [extra train.py args...]
# Unless $PORT is set, a port in the range 29500-29528 is picked at random.

CONFIG=$1
GPUS=$2
PORT=${PORT:-$((29500 + $RANDOM % 29))}

# Path-like expansions are quoted so locations containing spaces work.
# ${@:3} stays unquoted on purpose to keep the existing splitting of the
# forwarded arguments.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/train.py" "$CONFIG" --launcher pytorch ${@:3}


================================================
FILE: tools/dist_train_new.sh
================================================
#!/usr/bin/env bash
# Launch train_new.py with torch.distributed.run.
# Usage: dist_train_new.sh CONFIG GPUS [extra train_new.py args...]
# Unless $PORT is set, a port in the range 29500-29528 is picked at random.

CONFIG=$1
GPUS=$2
PORT=${PORT:-$((29500 + $RANDOM % 29))}

# Path-like expansions are quoted so locations containing spaces work.
# ${@:3} stays unquoted on purpose to keep the existing splitting of the
# forwarded arguments.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
python -m torch.distributed.run --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/train_new.py" "$CONFIG" --launcher pytorch ${@:3}


================================================
FILE: tools/dist_vps_test.sh
================================================
#!/usr/bin/env bash
# Launch test_vps.py with torch.distributed.launch.
# Usage: dist_vps_test.sh CONFIG CHECKPOINT GPUS [extra test_vps.py args...]
# The master port defaults to 29500 and can be overridden via $PORT.

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}

# Path-like expansions are quoted so locations containing spaces work.
# ${@:4} stays unquoted on purpose to keep the existing splitting of the
# forwarded arguments.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$(dirname "$0")/test_vps.py" "$CONFIG" "$CHECKPOINT" --launcher pytorch ${@:4}

================================================
FILE: tools/docker.sh
================================================
#!/bin/bash
# Start an interactive container with all GPUs, mounting the current
# directory at /data, the dataset directory at /data/data and the log
# directory at /data/logger.
# DATALOC, LOGLOC and IMG can be overridden from the environment.

DATALOC=${DATALOC:-~/datasets}
LOGLOC=${LOGLOC:-~/logger}
IMG=${IMG:-"harbory/openmmlab:latest"}

# Host paths are quoted so directories containing spaces are mounted intact.
docker run --gpus all -it --rm --ipc=host --net=host \
    -v "$(pwd)":/data -v "$DATALOC":/data/data -v "$LOGLOC":/data/logger "$IMG"


================================================
FILE: tools/eval_dstq.py
================================================
import argparse
import os

import mmcv
import numpy as np
import torch
from mmcv import ProgressBar

import torch.nn.functional as F

from tools.utils.DSTQ import DSTQuality
from tools.utils.STQ import STQuality


def parse_args():
    """Parse the command line of the DSTQ evaluation script.

    Returns:
        argparse.Namespace: With attributes ``result_path``, ``gt_path``,
            ``split``, ``depth`` and ``nproc``.
    """
    option_specs = (
        (('result_path', ), {}),
        (('--gt-path', ), dict(default='data/kitti-dvps')),
        (('--split', ), dict(default='val')),
        (('--depth', ), dict(action='store_true', help='eval depth')),
        (('--nproc', ), dict(default=1, type=int, help='number of process')),
    )
    parser = argparse.ArgumentParser(description='Evaluation of DSTQ')
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()


def updater(pred_ins_name,
            pred_cls_name,
            pred_dep_name,
            gt_cls_seq_name,
            gt_ins_seq_name,
            gt_dep_seq_name,
            updater_obj,
            seq_id):
    """Load one frame of predictions and ground truth and update the metric.

    Args:
        pred_ins_name (str): Path of the predicted instance-id image.
        pred_cls_name (str): Path of the predicted class-id image.
        pred_dep_name (str | None): Path of the predicted depth image, or
            None to skip depth.
        gt_cls_seq_name (str): Path of the ground-truth class-id image.
        gt_ins_seq_name (str): Path of the ground-truth instance-id image.
        gt_dep_seq_name (str | None): Path of the ground-truth depth image.
        updater_obj: Metric object exposing ``update_state``.
        seq_id: Sequence id forwarded to ``update_state``.
    """

    def _load(path, dtype):
        return mmcv.imread(path, flag='unchanged').astype(dtype)

    has_pred_depth = pred_dep_name is not None

    pred_ins = _load(pred_ins_name, np.int32)
    pred_cls = _load(pred_cls_name, np.int32)
    pred_dep = _load(pred_dep_name, np.float32) if has_pred_depth else None

    gt_ins = _load(gt_ins_seq_name, np.int32)
    gt_cls = _load(gt_cls_seq_name, np.int32)
    gt_dep = None
    if gt_dep_seq_name is not None:
        gt_dep = _load(gt_dep_seq_name, np.float32)

    if has_pred_depth:
        # Resize the predicted depth to the ground-truth resolution.
        dep_tensor = torch.from_numpy(pred_dep)[None][None]
        pred_dep = F.interpolate(dep_tensor, size=gt_dep.shape)[0][0].numpy()

    # Pixels whose ground-truth class is 255 are left out.
    keep = gt_cls != 255
    shift = 2 ** 16
    # Panoptic id = class id * 2**16 + instance id.
    pred_masked_ps = pred_cls[keep] * shift + pred_ins[keep]
    gt_masked_ps = gt_cls[keep] * shift + gt_ins[keep]

    if not has_pred_depth:
        updater_obj.update_state(gt_masked_ps, pred_masked_ps, seq_id)
        return

    # Depth is only compared where the ground truth is positive.
    depth_valid = gt_dep > 0.
    pred_masked_depth = pred_dep[depth_valid]
    gt_masked_depth = gt_dep[depth_valid]
    updater_obj.update_state(gt_masked_ps, pred_masked_ps, gt_masked_depth,
                             pred_masked_depth, seq_id)


def eval_dstq(result_dir, gt_dir, seq_ids, with_depth=True):
    """Accumulate the (D)STQ metric over the given sequences and print it.

    Predictions are read from ``<result_dir>/panoptic/<seq_id>`` (files
    containing 'ins' and 'cat') and, with depth, from
    ``<result_dir>/depth/<seq_id>``. Ground-truth files are taken from
    ``gt_dir`` and matched to a sequence by its zero-padded six-digit
    prefix. Files are paired by sorted order.

    Args:
        result_dir (str): Root directory of the predictions.
        gt_dir (str): Directory holding the ground-truth files.
        seq_ids (Iterable[int]): Sequence ids to evaluate.
        with_depth (bool): Use ``DSTQuality`` and evaluate depth when True,
            otherwise ``STQuality``. Default: True.
    """
    metric_kwargs = dict(
        num_classes=19,
        things_list=list(range(8)),
        ignore_label=255,
        label_bit_shift=16,
        offset=2 ** 16 * 256,
    )
    if with_depth:
        dstq_obj = DSTQuality(depth_threshold=(1.25,), **metric_kwargs)
    else:
        dstq_obj = STQuality(**metric_kwargs)

    gt_names = list(mmcv.scandir(gt_dir))
    gt_cls_names = sorted(n for n in gt_names if 'gtFine_class' in n)
    gt_ins_names = sorted(n for n in gt_names if 'gtFine_instance' in n)
    gt_dep_names = None
    if with_depth:
        gt_dep_names = sorted(n for n in gt_names if 'depth' in n)

    for seq_id in seq_ids:
        pan_dir = os.path.join(result_dir, 'panoptic', str(seq_id))
        dep_dir = os.path.join(result_dir, 'depth', str(seq_id))

        def _in_seq(name):
            # Ground-truth files start with the zero-padded sequence id.
            return name.startswith('{:06d}'.format(seq_id))

        pan_files = list(mmcv.scandir(pan_dir))
        pred_ins_names = sorted(n for n in pan_files if 'ins' in n)
        pred_cls_names = sorted(n for n in pan_files if 'cat' in n)
        if with_depth:
            pred_dep_names = sorted(mmcv.scandir(dep_dir))
        else:
            pred_dep_names = [None] * len(pred_ins_names)

        gt_cls_seq_names = [n for n in gt_cls_names if _in_seq(n)]
        gt_ins_seq_names = [n for n in gt_ins_names if _in_seq(n)]
        if with_depth:
            gt_dep_seq_names = [n for n in gt_dep_names if _in_seq(n)]
        else:
            gt_dep_seq_names = [None] * len(gt_cls_seq_names)

        prog_bar = ProgressBar(len(pred_ins_names))
        # NOTE(review): zip stops at the shortest list, so a length mismatch
        # between predictions and ground truth is not reported.
        frames = zip(pred_ins_names, pred_cls_names, pred_dep_names,
                     gt_cls_seq_names, gt_ins_seq_names, gt_dep_seq_names)
        for (pred_ins_name, pred_cls_name, pred_dep_name, gt_cls_seq_name,
             gt_ins_seq_name, gt_dep_seq_name) in frames:
            prog_bar.update()
            pred_dep_path = None
            if pred_dep_name is not None:
                pred_dep_path = os.path.join(dep_dir, pred_dep_name)
            gt_dep_path = None
            if gt_dep_seq_name is not None:
                gt_dep_path = os.path.join(gt_dir, gt_dep_seq_name)
            updater(
                os.path.join(pan_dir, pred_ins_name),
                os.path.join(pan_dir, pred_cls_name),
                pred_dep_path,
                os.path.join(gt_dir, gt_cls_seq_name),
                os.path.join(gt_dir, gt_ins_seq_name),
                gt_dep_path,
                dstq_obj,
                seq_id
            )
    result = dstq_obj.result()
    print(result)


if __name__ == '__main__':
    cli_args = parse_args()
    # Ground truth is read from <gt-path>/video_sequence/<split>.
    gt_root = os.path.join(cli_args.gt_path, 'video_sequence', cli_args.split)
    # Only sequence 8 is evaluated.
    eval_dstq(cli_args.result_path, gt_root, [8], cli_args.depth)


================================================
FILE: tools/eval_dstq_step.py
================================================
import argparse
import os

import mmcv
import numpy as np
import torch
from mmcv import ProgressBar

import torch.nn.functional as F

from tools.utils.DSTQ import DSTQuality
from tools.utils.STQ import STQuality


def parse_args():
    """Parse the command line of the KITTI-STEP DSTQ evaluation script.

    Returns:
        argparse.Namespace: With attributes ``result_path``, ``gt_path``,
            ``split``, ``depth`` and ``nproc``.
    """
    option_specs = (
        (('result_path', ), {}),
        (('--gt-path', ), dict(default='data/kitti-step')),
        (('--split', ), dict(default='val')),
        (('--depth', ), dict(action='store_true', help='eval depth')),
        (('--nproc', ), dict(default=1, type=int, help='number of process')),
    )
    parser = argparse.ArgumentParser(description='Evaluation of DSTQ')
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()


def updater(pred_ins_name,
            pred_cls_name,
            pred_dep_name,
            gt_pan_seq_name,
            gt_dep_seq_name,
            updater_obj,
            seq_id):
    """Load one frame of predictions and ground truth and update the metric.

    The ground-truth panoptic image is read as RGB: the first channel is
    used as the class id, and the second channel times 256 plus the third
    channel as the instance id.

    Args:
        pred_ins_name (str): Path of the predicted instance-id image.
        pred_cls_name (str): Path of the predicted class-id image.
        pred_dep_name (str | None): Path of the predicted depth image, or
            None to skip depth.
        gt_pan_seq_name (str): Path of the ground-truth panoptic image.
        gt_dep_seq_name (str | None): Path of the ground-truth depth image.
        updater_obj: Metric object exposing ``update_state``.
        seq_id: Sequence id forwarded to ``update_state``.
    """

    def _load(path, dtype):
        return mmcv.imread(path, flag='unchanged').astype(dtype)

    has_pred_depth = pred_dep_name is not None

    pred_ins = _load(pred_ins_name, np.int32)
    pred_cls = _load(pred_cls_name, np.int32)
    pred_dep = _load(pred_dep_name, np.float32) if has_pred_depth else None

    gt_pan = mmcv.imread(gt_pan_seq_name, flag='color', channel_order='rgb')
    red, green, blue = (gt_pan[..., ch].astype(np.int32) for ch in range(3))
    gt_cls = red
    gt_ins = green * 256 + blue
    gt_dep = None
    if gt_dep_seq_name is not None:
        gt_dep = _load(gt_dep_seq_name, np.float32)

    if has_pred_depth:
        # Resize the predicted depth to the ground-truth resolution.
        dep_tensor = torch.from_numpy(pred_dep)[None][None]
        pred_dep = F.interpolate(dep_tensor, size=gt_dep.shape)[0][0].numpy()

    # Pixels whose ground-truth class is 255 are left out.
    keep = gt_cls != 255
    shift = 2 ** 16
    # Panoptic id = class id * 2**16 + instance id.
    pred_masked_ps = pred_cls[keep] * shift + pred_ins[keep]
    gt_masked_ps = gt_cls[keep] * shift + gt_ins[keep]

    if not has_pred_depth:
        updater_obj.update_state(gt_masked_ps, pred_masked_ps, seq_id)
        return

    # Depth is only compared where the ground truth is positive.
    depth_valid = gt_dep > 0.
    pred_masked_depth = pred_dep[depth_valid]
    gt_masked_depth = gt_dep[depth_valid]
    updater_obj.update_state(gt_masked_ps, pred_masked_ps, gt_masked_depth,
                             pred_masked_depth, seq_id)


def eval_dstq(result_dir, gt_dir, seq_ids, with_depth=True):
    """Accumulate STQ (or DSTQ when ``with_depth``) over sequences and print it.

    Args:
        result_dir: directory holding ``panoptic/<seq_id>/`` (files whose names
            contain ``ins`` / ``cat``) and, with depth, ``depth/<seq_id>/``.
        gt_dir: directory holding the ground-truth files; names containing
            ``panoptic`` / ``depth`` are used and matched to a sequence by the
            zero-padded six-digit ``seq_id`` prefix.
        seq_ids: iterable of integer sequence ids to evaluate.
        with_depth: use ``DSTQuality`` and depth images when true, otherwise
            ``STQuality``.
    """
    shared_kwargs = dict(
        num_classes=19,
        things_list=list(range(11, 19)),
        ignore_label=255,
        label_bit_shift=16,
        offset=2 ** 16 * 256,
    )
    if with_depth:
        dstq_obj = DSTQuality(depth_threshold=(1.25,), **shared_kwargs)
    else:
        dstq_obj = STQuality(**shared_kwargs)

    gt_names = list(mmcv.scandir(gt_dir))
    gt_pan_names = sorted(name for name in gt_names if 'panoptic' in name)
    gt_dep_names = sorted(name for name in gt_names if 'depth' in name) if with_depth else None

    for seq_id in seq_ids:
        pan_dir = os.path.join(result_dir, 'panoptic', str(seq_id))
        dep_dir = os.path.join(result_dir, 'depth', str(seq_id))
        seq_prefix = '{:06d}'.format(seq_id)

        pan_files = list(mmcv.scandir(pan_dir))
        pred_ins_names = sorted(name for name in pan_files if 'ins' in name)
        pred_cls_names = sorted(name for name in pan_files if 'cat' in name)
        if with_depth:
            pred_dep_names = sorted(list(mmcv.scandir(dep_dir)))
        else:
            pred_dep_names = [None] * len(pred_ins_names)

        gt_pan_seq_names = [name for name in gt_pan_names if name.startswith(seq_prefix)]
        if with_depth:
            gt_dep_seq_names = [name for name in gt_dep_names if name.startswith(seq_prefix)]
        else:
            gt_dep_seq_names = [None] * len(gt_pan_seq_names)

        prog_bar = ProgressBar(len(pred_ins_names))
        # Frames are paired purely by sorted order; zip stops at the shortest list.
        frames = zip(pred_ins_names, pred_cls_names, pred_dep_names,
                     gt_pan_seq_names, gt_dep_seq_names)
        for ins_file, cls_file, dep_file, gt_pan_file, gt_dep_file in frames:
            prog_bar.update()
            pred_dep_path = None if dep_file is None else os.path.join(dep_dir, dep_file)
            gt_dep_path = None if gt_dep_file is None else os.path.join(gt_dir, gt_dep_file)
            updater(
                os.path.join(pan_dir, ins_file),
                os.path.join(pan_dir, cls_file),
                pred_dep_path,
                os.path.join(gt_dir, gt_pan_file),
                gt_dep_path,
                dstq_obj,
                seq_id
            )
    result = dstq_obj.result()
    print(result)


if __name__ == '__main__':
    # Script entry point: evaluate the fixed list of sequence ids below against
    # the ground truth stored under <gt_path>/video_sequence/<split>.
    args = parse_args()
    result_path, gt_path, split = args.result_path, args.gt_path, args.split
    gt_sequence_dir = os.path.join(gt_path, 'video_sequence', split)
    eval_seq_ids = [2, 6, 7, 8, 10, 13, 14, 16, 18]
    eval_dstq(result_path, gt_sequence_dir, eval_seq_ids, args.depth)


================================================
FILE: tools/eval_dstq_vipseg.py
================================================
import argparse
import os

import mmcv
import numpy as np
import torch
from mmcv import ProgressBar

import torch.nn.functional as F

from tools.utils.DSTQ import DSTQuality
from tools.utils.STQ import STQuality

# Full VIPSeg category table (124 entries). ``id`` is the class id, ``isthing``
# is 1 for instance ("thing") classes and 0 for "stuff" classes, and ``color``
# is the palette entry stored with the class.
CLASSES = [
    {"id": 0, "name": "wall", "isthing": 0, "color": [120, 120, 120]},
    {"id": 1, "name": "ceiling", "isthing": 0, "color": [180, 120, 120]},
    {"id": 2, "name": "door", "isthing": 1, "color": [6, 230, 230]},
    {"id": 3, "name": "stair", "isthing": 0, "color": [80, 50, 50]},
    {"id": 4, "name": "ladder", "isthing": 1, "color": [4, 200, 3]},
    {"id": 5, "name": "escalator", "isthing": 0, "color": [120, 120, 80]},
    {"id": 6, "name": "Playground_slide", "isthing": 0, "color": [140, 140, 140]},
    {"id": 7, "name": "handrail_or_fence", "isthing": 0, "color": [204, 5, 255]},
    {"id": 8, "name": "window", "isthing": 1, "color": [230, 230, 230]},
    {"id": 9, "name": "rail", "isthing": 0, "color": [4, 250, 7]},
    {"id": 10, "name": "goal", "isthing": 1, "color": [224, 5, 255]},
    {"id": 11, "name": "pillar", "isthing": 0, "color": [235, 255, 7]},
    {"id": 12, "name": "pole", "isthing": 0, "color": [150, 5, 61]},
    {"id": 13, "name": "floor", "isthing": 0, "color": [120, 120, 70]},
    {"id": 14, "name": "ground", "isthing": 0, "color": [8, 255, 51]},
    {"id": 15, "name": "grass", "isthing": 0, "color": [255, 6, 82]},
    {"id": 16, "name": "sand", "isthing": 0, "color": [143, 255, 140]},
    {"id": 17, "name": "athletic_field", "isthing": 0, "color": [204, 255, 4]},
    {"id": 18, "name": "road", "isthing": 0, "color": [255, 51, 7]},
    {"id": 19, "name": "path", "isthing": 0, "color": [204, 70, 3]},
    {"id": 20, "name": "crosswalk", "isthing": 0, "color": [0, 102, 200]},
    {"id": 21, "name": "building", "isthing": 0, "color": [61, 230, 250]},
    {"id": 22, "name": "house", "isthing": 0, "color": [255, 6, 51]},
    {"id": 23, "name": "bridge", "isthing": 0, "color": [11, 102, 255]},
    {"id": 24, "name": "tower", "isthing": 0, "color": [255, 7, 71]},
    {"id": 25, "name": "windmill", "isthing": 0, "color": [255, 9, 224]},
    {"id": 26, "name": "well_or_well_lid", "isthing": 0, "color": [9, 7, 230]},
    {"id": 27, "name": "other_construction", "isthing": 0, "color": [220, 220, 220]},
    {"id": 28, "name": "sky", "isthing": 0, "color": [255, 9, 92]},
    {"id": 29, "name": "mountain", "isthing": 0, "color": [112, 9, 255]},
    {"id": 30, "name": "stone", "isthing": 0, "color": [8, 255, 214]},
    {"id": 31, "name": "wood", "isthing": 0, "color": [7, 255, 224]},
    {"id": 32, "name": "ice", "isthing": 0, "color": [255, 184, 6]},
    {"id": 33, "name": "snowfield", "isthing": 0, "color": [10, 255, 71]},
    {"id": 34, "name": "grandstand", "isthing": 0, "color": [255, 41, 10]},
    {"id": 35, "name": "sea", "isthing": 0, "color": [7, 255, 255]},
    {"id": 36, "name": "river", "isthing": 0, "color": [224, 255, 8]},
    {"id": 37, "name": "lake", "isthing": 0, "color": [102, 8, 255]},
    {"id": 38, "name": "waterfall", "isthing": 0, "color": [255, 61, 6]},
    {"id": 39, "name": "water", "isthing": 0, "color": [255, 194, 7]},
    {"id": 40, "name": "billboard_or_Bulletin_Board", "isthing": 0, "color": [255, 122, 8]},
    {"id": 41, "name": "sculpture", "isthing": 1, "color": [0, 255, 20]},
    {"id": 42, "name": "pipeline", "isthing": 0, "color": [255, 8, 41]},
    {"id": 43, "name": "flag", "isthing": 1, "color": [255, 5, 153]},
    {"id": 44, "name": "parasol_or_umbrella", "isthing": 1, "color": [6, 51, 255]},
    {"id": 45, "name": "cushion_or_carpet", "isthing": 0, "color": [235, 12, 255]},
    {"id": 46, "name": "tent", "isthing": 1, "color": [160, 150, 20]},
    {"id": 47, "name": "roadblock", "isthing": 1, "color": [0, 163, 255]},
    {"id": 48, "name": "car", "isthing": 1, "color": [140, 140, 140]},
    {"id": 49, "name": "bus", "isthing": 1, "color": [250, 10, 15]},
    {"id": 50, "name": "truck", "isthing": 1, "color": [20, 255, 0]},
    {"id": 51, "name": "bicycle", "isthing": 1, "color": [31, 255, 0]},
    {"id": 52, "name": "motorcycle", "isthing": 1, "color": [255, 31, 0]},
    {"id": 53, "name": "wheeled_machine", "isthing": 0, "color": [255, 224, 0]},
    {"id": 54, "name": "ship_or_boat", "isthing": 1, "color": [153, 255, 0]},
    {"id": 55, "name": "raft", "isthing": 1, "color": [0, 0, 255]},
    {"id": 56, "name": "airplane", "isthing": 1, "color": [255, 71, 0]},
    {"id": 57, "name": "tyre", "isthing": 0, "color": [0, 235, 255]},
    {"id": 58, "name": "traffic_light", "isthing": 0, "color": [0, 173, 255]},
    {"id": 59, "name": "lamp", "isthing": 0, "color": [31, 0, 255]},
    {"id": 60, "name": "person", "isthing": 1, "color": [11, 200, 200]},
    {"id": 61, "name": "cat", "isthing": 1, "color": [255, 82, 0]},
    {"id": 62, "name": "dog", "isthing": 1, "color": [0, 255, 245]},
    {"id": 63, "name": "horse", "isthing": 1, "color": [0, 61, 255]},
    {"id": 64, "name": "cattle", "isthing": 1, "color": [0, 255, 112]},
    {"id": 65, "name": "other_animal", "isthing": 1, "color": [0, 255, 133]},
    {"id": 66, "name": "tree", "isthing": 0, "color": [255, 0, 0]},
    {"id": 67, "name": "flower", "isthing": 0, "color": [255, 163, 0]},
    {"id": 68, "name": "other_plant", "isthing": 0, "color": [255, 102, 0]},
    {"id": 69, "name": "toy", "isthing": 0, "color": [194, 255, 0]},
    {"id": 70, "name": "ball_net", "isthing": 0, "color": [0, 143, 255]},
    {"id": 71, "name": "backboard", "isthing": 0, "color": [51, 255, 0]},
    {"id": 72, "name": "skateboard", "isthing": 1, "color": [0, 82, 255]},
    {"id": 73, "name": "bat", "isthing": 0, "color": [0, 255, 41]},
    {"id": 74, "name": "ball", "isthing": 1, "color": [0, 255, 173]},
    {"id": 75, "name": "cupboard_or_showcase_or_storage_rack", "isthing": 0, "color": [10, 0, 255]},
    {"id": 76, "name": "box", "isthing": 1, "color": [173, 255, 0]},
    {"id": 77, "name": "traveling_case_or_trolley_case", "isthing": 1, "color": [0, 255, 153]},
    {"id": 78, "name": "basket", "isthing": 1, "color": [255, 92, 0]},
    {"id": 79, "name": "bag_or_package", "isthing": 1, "color": [255, 0, 255]},
    {"id": 80, "name": "trash_can", "isthing": 0, "color": [255, 0, 245]},
    {"id": 81, "name": "cage", "isthing": 0, "color": [255, 0, 102]},
    {"id": 82, "name": "plate", "isthing": 1, "color": [255, 173, 0]},
    {"id": 83, "name": "tub_or_bowl_or_pot", "isthing": 1, "color": [255, 0, 20]},
    {"id": 84, "name": "bottle_or_cup", "isthing": 1, "color": [255, 184, 184]},
    {"id": 85, "name": "barrel", "isthing": 1, "color": [0, 31, 255]},
    {"id": 86, "name": "fishbowl", "isthing": 1, "color": [0, 255, 61]},
    {"id": 87, "name": "bed", "isthing": 1, "color": [0, 71, 255]},
    {"id": 88, "name": "pillow", "isthing": 1, "color": [255, 0, 204]},
    {"id": 89, "name": "table_or_desk", "isthing": 1, "color": [0, 255, 194]},
    {"id": 90, "name": "chair_or_seat", "isthing": 1, "color": [0, 255, 82]},
    {"id": 91, "name": "bench", "isthing": 1, "color": [0, 10, 255]},
    {"id": 92, "name": "sofa", "isthing": 1, "color": [0, 112, 255]},
    {"id": 93, "name": "shelf", "isthing": 0, "color": [51, 0, 255]},
    {"id": 94, "name": "bathtub", "isthing": 0, "color": [0, 194, 255]},
    {"id": 95, "name": "gun", "isthing": 1, "color": [0, 122, 255]},
    {"id": 96, "name": "commode", "isthing": 1, "color": [0, 255, 163]},
    {"id": 97, "name": "roaster", "isthing": 1, "color": [255, 153, 0]},
    {"id": 98, "name": "other_machine", "isthing": 0, "color": [0, 255, 10]},
    {"id": 99, "name": "refrigerator", "isthing": 1, "color": [255, 112, 0]},
    {"id": 100, "name": "washing_machine", "isthing": 1, "color": [143, 255, 0]},
    {"id": 101, "name": "Microwave_oven", "isthing": 1, "color": [82, 0, 255]},
    {"id": 102, "name": "fan", "isthing": 1, "color": [163, 255, 0]},
    {"id": 103, "name": "curtain", "isthing": 0, "color": [255, 235, 0]},
    {"id": 104, "name": "textiles", "isthing": 0, "color": [8, 184, 170]},
    {"id": 105, "name": "clothes", "isthing": 0, "color": [133, 0, 255]},
    {"id": 106, "name": "painting_or_poster", "isthing": 1, "color": [0, 255, 92]},
    {"id": 107, "name": "mirror", "isthing": 1, "color": [184, 0, 255]},
    {"id": 108, "name": "flower_pot_or_vase", "isthing": 1, "color": [255, 0, 31]},
    {"id": 109, "name": "clock", "isthing": 1, "color": [0, 184, 255]},
    {"id": 110, "name": "book", "isthing": 0, "color": [0, 214, 255]},
    {"id": 111, "name": "tool", "isthing": 0, "color": [255, 0, 112]},
    {"id": 112, "name": "blackboard", "isthing": 0, "color": [92, 255, 0]},
    {"id": 113, "name": "tissue", "isthing": 0, "color": [0, 224, 255]},
    {"id": 114, "name": "screen_or_television", "isthing": 1, "color": [112, 224, 255]},
    {"id": 115, "name": "computer", "isthing": 1, "color": [70, 184, 160]},
    {"id": 116, "name": "printer", "isthing": 1, "color": [163, 0, 255]},
    {"id": 117, "name": "Mobile_phone", "isthing": 1, "color": [153, 0, 255]},
    {"id": 118, "name": "keyboard", "isthing": 1, "color": [71, 255, 0]},
    {"id": 119, "name": "other_electronic_product", "isthing": 0, "color": [255, 0, 163]},
    {"id": 120, "name": "fruit", "isthing": 0, "color": [255, 204, 0]},
    {"id": 121, "name": "food", "isthing": 0, "color": [255, 0, 143]},
    {"id": 122, "name": "instrument", "isthing": 1, "color": [0, 255, 235]},
    {"id": 123, "name": "train", "isthing": 1, "color": [133, 255, 0]}
]

# The thing / stuff tables are derived from CLASSES (order preserved) instead of
# being hand-copied duplicates, so the three tables cannot drift apart. Each
# entry is copied, including its colour list, so the derived tables remain
# independent objects.
CLASSES_THING = [
    dict(itm, color=list(itm['color'])) for itm in CLASSES if itm['isthing'] == 1
]

CLASSES_STUFF = [
    dict(itm, color=list(itm['color'])) for itm in CLASSES if itm['isthing'] == 0
]

NO_OBJ = 0  # raw panoptic value that vip2hb maps to the ignore label
NO_OBJ_HB = 255  # class id written for ignored pixels in the converted map
DIVISOR_PAN = 100  # raw thing pixels decode as class = v // 100, instance = v % 100
DIVISOR_NEW = 1000  # converted map encodes class * 1000 + instance
NUM_THING = len(CLASSES_THING)  # 58
NUM_STUFF = len(CLASSES_STUFF)  # 66
THING_B_STUFF = False  # vip2hb asserts this is False (stuff classes come first)


def vip2hb(pan_map):
    """Re-encode a raw panoptic map as ``class * DIVISOR_NEW + instance``.

    Raw values equal to ``NO_OBJ`` or 200 become ``NO_OBJ_HB * DIVISOR_NEW``.
    Raw values above 128 are decoded as thing pixels
    (``value // DIVISOR_PAN`` is the class id plus one,
    ``value % DIVISOR_PAN`` the instance); they receive the class index
    ``NUM_STUFF + position in CLASSES_THING`` and an instance id shifted by
    one. Every other value is a stuff class id plus one and is mapped to its
    position in ``CLASSES_STUFF`` with instance 0.

    Args:
        pan_map: integer array holding the raw panoptic ids.

    Returns:
        Array shaped like ``pan_map`` with the re-encoded ids.
    """
    assert not THING_B_STUFF, "VIPSeg only supports stuff -> thing"
    # -1 marks "not converted yet"; checked again before returning.
    converted = - np.ones_like(pan_map)

    thing_position = {}
    for position, entry in enumerate(CLASSES_THING):
        thing_position[entry['id'] + 1] = position
    stuff_position = {}
    for position, entry in enumerate(CLASSES_STUFF):
        stuff_position[entry['id'] + 1] = position

    for raw_value in np.unique(pan_map):
        if raw_value in (NO_OBJ, 200):
            encoded = NO_OBJ_HB * DIVISOR_NEW
        elif raw_value > 128:
            raw_cls = raw_value // DIVISOR_PAN
            inst_id = raw_value % DIVISOR_PAN
            # Stuff classes occupy the first NUM_STUFF indices, things follow.
            new_cls = thing_position[raw_cls] + NUM_STUFF
            encoded = new_cls * DIVISOR_NEW + inst_id + 1
        else:
            encoded = stuff_position[raw_value] * DIVISOR_NEW
        converted[pan_map == raw_value] = encoded

    assert -1. not in np.unique(converted)
    return converted


def parse_args():
    """Parse the command line of the VIPSeg DSTQ evaluation script.

    Returns:
        argparse.Namespace: carries ``result_path`` (positional), ``gt_path``,
        ``split``, ``depth`` (flag) and ``nproc``.
    """
    arg_parser = argparse.ArgumentParser(description='Evaluation of DSTQ')
    register = arg_parser.add_argument
    register('result_path')
    register('--gt-path', default='data/kitti-step')
    register('--split', default='val')
    register('--depth', action='store_true', help='eval depth')
    register('--nproc', default=1, type=int, help='number of process')
    return arg_parser.parse_args()


def updater(pred_ins_name,
            pred_cls_name,
            pred_dep_name,
            gt_pan_seq_name,
            gt_dep_seq_name,
            updater_obj,
            seq_id):
    """Feed one frame of predictions and ground truth into a metric object.

    Args:
        pred_ins_name: path of the predicted instance-id image.
        pred_cls_name: path of the predicted class-id image.
        pred_dep_name: path of the predicted depth image, or ``None`` to skip
            depth.
        gt_pan_seq_name: path of the raw ground-truth panoptic mask; it is
            re-encoded with ``vip2hb`` before class / instance are split out.
        gt_dep_seq_name: path of the ground-truth depth image, or ``None``.
        updater_obj: metric accumulator exposing ``update_state``.
        seq_id: sequence identifier forwarded to ``update_state``.
    """

    def _read_as(path, dtype):
        # Load the image without any colour conversion and cast it.
        return mmcv.imread(path, flag='unchanged').astype(dtype)

    has_pred_depth = pred_dep_name is not None

    pred_ins = _read_as(pred_ins_name, np.int32)
    pred_cls = _read_as(pred_cls_name, np.int32)
    pred_dep = _read_as(pred_dep_name, np.float32) if has_pred_depth else None

    gt_pan = vip2hb(_read_as(gt_pan_seq_name, np.int64))

    # The converted map stores class * DIVISOR_NEW + instance.
    gt_cls = gt_pan // DIVISOR_NEW
    gt_ins = gt_pan % DIVISOR_NEW
    gt_dep = None
    if gt_dep_seq_name is not None:
        gt_dep = _read_as(gt_dep_seq_name, np.float32)
    if pred_dep is not None:
        # Bring the predicted depth to the ground-truth resolution.
        as_batch = torch.from_numpy(pred_dep)[None][None]
        pred_dep = F.interpolate(as_batch, size=gt_dep.shape)[0][0].numpy()

    # Pixels carrying the ignore class are excluded from the metric.
    keep = gt_cls != NO_OBJ_HB
    class_shift = 2 ** 16

    pred_masked_ps = pred_cls[keep] * class_shift + pred_ins[keep]
    gt_masked_ps = gt_cls[keep] * class_shift + gt_ins[keep]

    if not has_pred_depth:
        updater_obj.update_state(gt_masked_ps, pred_masked_ps, seq_id)
        return

    # Depth is only scored where the ground truth holds a positive value.
    depth_known = gt_dep > 0.
    updater_obj.update_state(
        gt_masked_ps, pred_masked_ps, gt_dep[depth_known], pred_dep[depth_known], seq_id)


def eval_dstq(result_dir, gt_dir, with_depth=True, split=None):
    """Accumulate STQ (or DSTQ when ``with_depth``) over VIPSeg sequences.

    Args:
        result_dir: directory holding ``panoptic/<seq_index>/`` (files whose
            names contain ``ins`` / ``cat``) and, with depth,
            ``depth/<seq_index>/``.
        gt_dir: dataset root containing ``<split>.txt`` (one sequence folder
            name per line) and the ``panomasks/`` directory.
        with_depth: use ``DSTQuality`` and depth images when true, otherwise
            ``STQuality``.
        split: name of the split list file. When ``None`` the module-level
            ``split`` assigned by the script entry point is used, and ``'val'``
            if that global does not exist. Previously the function read the
            global unconditionally and raised ``NameError`` when it was called
            from an importing module.
    """
    if split is None:
        # Backward compatible with the script entry point, which only sets a
        # module-level ``split`` and calls this function with three arguments.
        split = globals().get('split', 'val')

    shared_kwargs = dict(
        num_classes=len(CLASSES),
        things_list=list(range(66, 124)),
        ignore_label=NO_OBJ_HB,
        label_bit_shift=16,
        offset=2 ** 16 * 256,
    )
    if with_depth:
        dstq_obj = DSTQuality(depth_threshold=(1.25,), **shared_kwargs)
    else:
        dstq_obj = STQuality(**shared_kwargs)

    ann_folders = mmcv.list_from_file(os.path.join(gt_dir, "{}.txt".format(split)),
                                      prefix=os.path.join(gt_dir, 'panomasks') + '/')

    # The position of a sequence in the split file doubles as its id and as the
    # name of its prediction sub-directory.
    for seq_id, ann_folder in enumerate(ann_folders):
        gt_names = list(mmcv.scandir(ann_folder))
        gt_pan_names = sorted(name for name in gt_names if '.png' in name)
        if with_depth:
            gt_dep_names = sorted(name for name in gt_names if 'depth' in name)
        else:
            gt_dep_names = [None] * len(gt_pan_names)

        pan_dir = os.path.join(result_dir, 'panoptic', str(seq_id))
        dep_dir = os.path.join(result_dir, 'depth', str(seq_id))
        pan_files = list(mmcv.scandir(pan_dir))
        pred_ins_names = sorted(name for name in pan_files if 'ins' in name)
        pred_cls_names = sorted(name for name in pan_files if 'cat' in name)
        if len(gt_pan_names) != len(pred_ins_names):
            # Stop at the first sequence whose frame count does not match and
            # report the metric over the sequences accumulated so far.
            print("Error when seq_id is {}. But cal existing seqs.".format(seq_id))
            break
        if with_depth:
            pred_dep_names = sorted(mmcv.scandir(dep_dir))
        else:
            pred_dep_names = [None] * len(pred_ins_names)

        prog_bar = ProgressBar(len(pred_ins_names))
        # Frames are paired purely by sorted order.
        frames = zip(pred_ins_names, pred_cls_names, pred_dep_names,
                     gt_pan_names, gt_dep_names)
        for ins_file, cls_file, dep_file, gt_pan_file, gt_dep_file in frames:
            prog_bar.update()
            updater(
                os.path.join(pan_dir, ins_file),
                os.path.join(pan_dir, cls_file),
                os.path.join(dep_dir, dep_file) if dep_file is not None else None,
                os.path.join(ann_folder, gt_pan_file),
                os.path.join(ann_folder, gt_dep_file) if gt_dep_file is not None else None,
                dstq_obj,
                seq_id
            )
    result = dstq_obj.result()
    print(result)

# Usage: python eval_dstq_vipseg.py /opt/data/results/test --gt-path /opt/data/VIPSeg
if __name__ == '__main__':
    # Script entry point: parse the command line and run the evaluation.
    args = parse_args()
    result_path = args.result_path
    gt_path = args.gt_path
    # NOTE: this must stay a module-level name; eval_dstq reads the global
    # `split` to locate "<split>.txt" under the ground-truth root.
    split = args.split
    eval_dstq(result_path, gt_path, args.depth)


================================================
FILE: tools/eval_dvpq_step.py
================================================
import numpy as np
from PIL import Image
import six
import os
import multiprocessing as mp
import argparse

# Command-line options are parsed at module level (i.e. on import as well as on
# direct execution); the values derived below are module globals that the
# functions in this file read directly.
parser = argparse.ArgumentParser(description='')
parser.add_argument('result_path')
parser.add_argument('--eval_frames', type=int, default=1)
parser.add_argument('--depth_thres', type=float, default=0)
args = parser.parse_args()

# Window length used by main() when iterating over the prediction frames.
eval_frames = args.eval_frames
# Prediction sub-directories located under the result path.
pred_dir_all = os.path.join(args.result_path, 'panoptic')
depth_dir_all = os.path.join(args.result_path, 'depth')
# Ground-truth location is hard-coded rather than taken from the command line.
gt_dir = 'data/kitti-step/video_sequence/val/'
# Relative depth-error threshold; eval() only touches depth when this is > 0.
depth_thres = args.depth_thres


def vpq_eval(element):
    """Accumulate per-class panoptic matching statistics for one sample.

    Args:
        element: ``(pred_ids, gt_ids)`` — two element-wise aligned integer
            arrays whose values encode ``category * 2**16 + instance``.
            Category 255 in the ground truth is the ignore label.

    Returns:
        tuple: ``(iou_per_class, tp_per_class, fn_per_class, fp_per_class)``,
        four float64 arrays of length 20 indexed by category id. A
        prediction/ground-truth segment pair of the same category counts as a
        true positive when its IoU exceeds 0.5.
    """
    pred_ids, gt_ids = element
    max_ins = 2 ** 16
    ign_id = 255
    offset = 2 ** 30
    num_cat = 20

    iou_per_class = np.zeros(num_cat, dtype=np.float64)
    tp_per_class = np.zeros(num_cat, dtype=np.float64)
    fn_per_class = np.zeros(num_cat, dtype=np.float64)
    fp_per_class = np.zeros(num_cat, dtype=np.float64)

    def _ids_to_counts(id_array):
        # Map every distinct id to its pixel count. Keys are converted to
        # Python ints: the ids are later multiplied by ``offset`` (2**30), and
        # doing that on fixed-width NumPy scalars (e.g. int32) can overflow
        # and silently look up the wrong intersection.
        ids, counts = np.unique(id_array, return_counts=True)
        return dict(zip(ids.tolist(), counts))

    pred_areas = _ids_to_counts(pred_ids)
    gt_areas = _ids_to_counts(gt_ids)

    void_id = ign_id * max_ins
    ign_ids = {gt_id for gt_id in gt_areas if (gt_id // max_ins) == ign_id}

    # Joint id of every pixel: ground-truth id in the high part, prediction id
    # in the low part, so intersections can be counted with a single unique().
    int_ids = gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64)
    int_areas = _ids_to_counts(int_ids)

    def prediction_void_overlap(pred_id):
        # Pixels of the prediction that fall on the void ground-truth segment.
        return int_areas.get(void_id * offset + pred_id, 0)

    def prediction_ignored_overlap(pred_id):
        # Pixels of the prediction that fall on any ignore-category segment.
        return sum(
            int_areas.get(_ign_id * offset + pred_id, 0) for _ign_id in ign_ids)

    gt_matched = set()
    pred_matched = set()

    for int_id, int_area in int_areas.items():
        gt_id = int_id // offset
        pred_id = int_id % offset
        gt_cat = gt_id // max_ins
        pred_cat = pred_id // max_ins
        if gt_cat != pred_cat:
            continue
        # Void pixels covered by the prediction do not count against it.
        union = (
            gt_areas[gt_id] + pred_areas[pred_id] - int_area -
            prediction_void_overlap(pred_id)
        )
        iou = int_area / union
        if iou > 0.5:
            tp_per_class[gt_cat] += 1
            iou_per_class[gt_cat] += iou
            gt_matched.add(gt_id)
            pred_matched.add(pred_id)

    # Unmatched ground-truth segments (outside the ignore label) are misses.
    for gt_id in gt_areas:
        if gt_id in gt_matched:
            continue
        cat_id = gt_id // max_ins
        if cat_id == ign_id:
            continue
        fn_per_class[cat_id] += 1

    # Unmatched predictions are false positives unless they lie mostly on
    # ignored ground truth.
    for pred_id in pred_areas:
        if pred_id in pred_matched:
            continue
        if (prediction_ignored_overlap(pred_id) / pred_areas[pred_id]) > 0.5:
            continue
        cat = pred_id // max_ins
        fp_per_class[cat] += 1

    return (iou_per_class, tp_per_class, fn_per_class, fp_per_class)


def eval(element):
    """Evaluate one window of frames and return its matching statistics.

    NOTE: the name shadows the ``eval`` builtin inside this module; it is kept
    because callers refer to it by this name.

    Args:
        element: ``(pred_cat, pred_ins, gts, depth_preds, depth_gts)`` — lists
            of image paths. The images of each list are concatenated along
            axis 1 so the whole window is scored as one wide image. The two
            depth lists are only read when the module-level ``depth_thres`` is
            greater than zero.

    Returns:
        tuple: the four arrays returned by ``vpq_eval`` followed by the mean
        absolute relative depth error (``0.`` when depth is not evaluated).
    """
    max_ins = 2 ** 16

    pred_cat, pred_ins, gts, depth_preds, depth_gts = element
    pred_cat = [np.array(Image.open(image)) for image in pred_cat]
    pred_ins = [np.array(Image.open(image)) for image in pred_ins]
    pred_cat = np.concatenate(pred_cat, axis=1)
    pred_ins = np.concatenate(pred_ins, axis=1)
    pred = pred_cat.astype(np.int32) * max_ins + pred_ins.astype(np.int32)

    # Ground truth: channel 0 is the category, channels 1 and 2 form the
    # instance id (channel 1 * 256 + channel 2).
    gts_pan = [np.array(Image.open(image)) for image in gts]
    gts = [gt_pan[..., 0].astype(np.int32) * max_ins +
           gt_pan[..., 1].astype(np.int32) * 256 + gt_pan[..., 2].astype(np.int32)
           for gt_pan in gts_pan]

    abs_rel = 0.
    if depth_thres > 0:
        depth_preds = [np.array(Image.open(name)) for name in depth_preds]
        depth_gts = [np.array(Image.open(name)) for name in depth_gts]
        # Cast to float before subtracting: if the images load as unsigned
        # integers, ``pred - gt`` would wrap around whenever pred < gt and
        # corrupt both the error and the thresholding below.
        depth_preds = np.concatenate(depth_preds, axis=1).astype(np.float64)
        depth_gts = np.concatenate(depth_gts, axis=1).astype(np.float64)
        depth_mask = depth_gts > 0
        valid_gt = depth_gts[depth_mask]
        rel_err = np.abs(depth_preds[depth_mask] - valid_gt) / valid_gt
        abs_rel = np.mean(rel_err)

        # Pixels whose relative depth error exceeds the threshold are
        # re-labelled as category 19 (instance 0), so they can no longer match
        # their ground-truth segment. ``pred_in_mask`` is a view of ``pred``.
        pred_in_mask = pred[:, :depth_preds.shape[1]]
        pred_in_depth_mask = pred_in_mask[depth_mask]
        ignored_pred_mask = rel_err > depth_thres
        pred_in_depth_mask[ignored_pred_mask] = 19 * max_ins
        pred_in_mask[depth_mask] = pred_in_depth_mask
        pred[:, :depth_preds.shape[1]] = pred_in_mask

    gt = np.concatenate(gts, axis=1)
    result = vpq_eval([pred, gt])

    return result + (abs_rel, )


def main():
    """Print PQ, thing-PQ and stuff-PQ for every sequence and for all of them.

    Uses the module-level settings ``args``, ``gt_dir``, ``pred_dir_all``,
    ``depth_dir_all`` and ``eval_frames``. For each hard-coded sequence id
    the prediction / ground-truth file lists are cut into sliding windows of
    ``eval_frames`` frames, each window is scored by ``eval`` in a worker
    pool, and the per-class statistics are accumulated. One line of
    ``PQ thing-PQ stuff-PQ`` (in percent) is printed per sequence and one
    more after the ``final`` banner.
    """
    use_depth = args.depth_thres > 0

    gt_names_all = sorted(
        os.path.join(gt_dir, entry.name)
        for entry in os.scandir(gt_dir) if 'panoptic' in entry.name)

    if use_depth:
        depth_gt_names_all = sorted(
            os.path.join(gt_dir, entry.name)
            for entry in os.scandir(gt_dir) if 'depth' in entry.name)

    iou_per_class_all, tp_per_class_all = [], []
    fn_per_class_all, fp_per_class_all = [], []
    collected = (iou_per_class_all, tp_per_class_all,
                 fn_per_class_all, fp_per_class_all)

    # Classes 11 and 13 are the "thing" classes; the other 17 are "stuff".
    things_index = np.zeros((19,)).astype(bool)
    things_index[[11, 13]] = True

    epsilon = 1e-10

    def _print_pq(iou, tp, fn, fp):
        # SQ = mean IoU of matches, RQ = F1-style detection score, PQ = SQ*RQ.
        sq = iou / (tp + epsilon)
        rq = tp / (tp + 0.5 * fn + 0.5 * fp + epsilon)
        pq = sq * rq
        tpq = pq[things_index]
        spq = pq[np.logical_not(things_index)]
        print(
            r'{:.1f} {:.1f} {:.1f}'.format(
                pq.mean() * 100,
                tpq.mean() * 100,
                spq.mean() * 100))

    for seq_id in [2, 6, 7, 8, 10, 13, 14, 16, 18]:
        if use_depth:
            depth_dir = os.path.join(depth_dir_all, str(seq_id))
            depth_pred_names = sorted(
                os.path.join(depth_dir, entry.name)
                for entry in os.scandir(depth_dir))

        pred_dir = os.path.join(pred_dir_all, str(seq_id))
        pred_names = [os.path.join(pred_dir, entry.name)
                      for entry in os.scandir(pred_dir)]
        cat_pred_names = sorted(
            name for name in pred_names if name.endswith('cat.png'))
        ins_pred_names = sorted(
            name for name in pred_names if name.endswith('ins.png'))

        # Ground-truth files of a sequence start with its zero-padded id.
        seq_prefix = '{:06d}'.format(seq_id)
        gt_names = sorted(
            name for name in gt_names_all
            if os.path.basename(name).startswith(seq_prefix))
        if use_depth:
            depth_gt_names = sorted(
                name for name in depth_gt_names_all
                if os.path.basename(name).startswith(seq_prefix))

        # One work item per sliding window of ``eval_frames`` frames.
        all_lst = []
        for start in range(len(cat_pred_names) - eval_frames + 1):
            stop = start + eval_frames
            all_lst.append([
                cat_pred_names[start:stop],
                ins_pred_names[start:stop],
                gt_names[start:stop],
                depth_pred_names[start:stop] if use_depth else None,
                depth_gt_names[start:stop] if use_depth else None,
            ])

        with mp.Pool(processes=mp.cpu_count() // 2) as pool:
            results = pool.map(eval, all_lst)

        # results[w] = (iou, tp, fn, fp, abs_rel); stack the first four
        # across windows, keep them for the final score and report this
        # sequence on its own.
        seq_stats = [np.stack([res[field] for res in results])
                     for field in range(4)]
        for bucket, stat in zip(collected, seq_stats):
            bucket.append(stat)
        _print_pq(*(stat.sum(axis=0)[:19] for stat in seq_stats))

    print("----------------final-----------------")
    _print_pq(*(np.concatenate(bucket, axis=0).sum(axis=0)[:19]
                for bucket in collected))


# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()


================================================
FILE: tools/eval_dvpq_vipseg.py
================================================
import argparse
import os

import mmcv
import numpy as np
import six
import multiprocessing as mp

# Category table: ``id`` is the 0-based category id, ``isthing`` is 1 for
# countable "thing" categories and 0 for "stuff", ``color`` is a 3-int list.
# This is the single source of truth; the thing / stuff tables below are
# derived from it so the three cannot drift apart.
CLASSES = [
    {"id": 0, "name": "wall", "isthing": 0, "color": [120, 120, 120]},
    {"id": 1, "name": "ceiling", "isthing": 0, "color": [180, 120, 120]},
    {"id": 2, "name": "door", "isthing": 1, "color": [6, 230, 230]},
    {"id": 3, "name": "stair", "isthing": 0, "color": [80, 50, 50]},
    {"id": 4, "name": "ladder", "isthing": 1, "color": [4, 200, 3]},
    {"id": 5, "name": "escalator", "isthing": 0, "color": [120, 120, 80]},
    {"id": 6, "name": "Playground_slide", "isthing": 0, "color": [140, 140, 140]},
    {"id": 7, "name": "handrail_or_fence", "isthing": 0, "color": [204, 5, 255]},
    {"id": 8, "name": "window", "isthing": 1, "color": [230, 230, 230]},
    {"id": 9, "name": "rail", "isthing": 0, "color": [4, 250, 7]},
    {"id": 10, "name": "goal", "isthing": 1, "color": [224, 5, 255]},
    {"id": 11, "name": "pillar", "isthing": 0, "color": [235, 255, 7]},
    {"id": 12, "name": "pole", "isthing": 0, "color": [150, 5, 61]},
    {"id": 13, "name": "floor", "isthing": 0, "color": [120, 120, 70]},
    {"id": 14, "name": "ground", "isthing": 0, "color": [8, 255, 51]},
    {"id": 15, "name": "grass", "isthing": 0, "color": [255, 6, 82]},
    {"id": 16, "name": "sand", "isthing": 0, "color": [143, 255, 140]},
    {"id": 17, "name": "athletic_field", "isthing": 0, "color": [204, 255, 4]},
    {"id": 18, "name": "road", "isthing": 0, "color": [255, 51, 7]},
    {"id": 19, "name": "path", "isthing": 0, "color": [204, 70, 3]},
    {"id": 20, "name": "crosswalk", "isthing": 0, "color": [0, 102, 200]},
    {"id": 21, "name": "building", "isthing": 0, "color": [61, 230, 250]},
    {"id": 22, "name": "house", "isthing": 0, "color": [255, 6, 51]},
    {"id": 23, "name": "bridge", "isthing": 0, "color": [11, 102, 255]},
    {"id": 24, "name": "tower", "isthing": 0, "color": [255, 7, 71]},
    {"id": 25, "name": "windmill", "isthing": 0, "color": [255, 9, 224]},
    {"id": 26, "name": "well_or_well_lid", "isthing": 0, "color": [9, 7, 230]},
    {"id": 27, "name": "other_construction", "isthing": 0, "color": [220, 220, 220]},
    {"id": 28, "name": "sky", "isthing": 0, "color": [255, 9, 92]},
    {"id": 29, "name": "mountain", "isthing": 0, "color": [112, 9, 255]},
    {"id": 30, "name": "stone", "isthing": 0, "color": [8, 255, 214]},
    {"id": 31, "name": "wood", "isthing": 0, "color": [7, 255, 224]},
    {"id": 32, "name": "ice", "isthing": 0, "color": [255, 184, 6]},
    {"id": 33, "name": "snowfield", "isthing": 0, "color": [10, 255, 71]},
    {"id": 34, "name": "grandstand", "isthing": 0, "color": [255, 41, 10]},
    {"id": 35, "name": "sea", "isthing": 0, "color": [7, 255, 255]},
    {"id": 36, "name": "river", "isthing": 0, "color": [224, 255, 8]},
    {"id": 37, "name": "lake", "isthing": 0, "color": [102, 8, 255]},
    {"id": 38, "name": "waterfall", "isthing": 0, "color": [255, 61, 6]},
    {"id": 39, "name": "water", "isthing": 0, "color": [255, 194, 7]},
    {"id": 40, "name": "billboard_or_Bulletin_Board", "isthing": 0, "color": [255, 122, 8]},
    {"id": 41, "name": "sculpture", "isthing": 1, "color": [0, 255, 20]},
    {"id": 42, "name": "pipeline", "isthing": 0, "color": [255, 8, 41]},
    {"id": 43, "name": "flag", "isthing": 1, "color": [255, 5, 153]},
    {"id": 44, "name": "parasol_or_umbrella", "isthing": 1, "color": [6, 51, 255]},
    {"id": 45, "name": "cushion_or_carpet", "isthing": 0, "color": [235, 12, 255]},
    {"id": 46, "name": "tent", "isthing": 1, "color": [160, 150, 20]},
    {"id": 47, "name": "roadblock", "isthing": 1, "color": [0, 163, 255]},
    {"id": 48, "name": "car", "isthing": 1, "color": [140, 140, 140]},
    {"id": 49, "name": "bus", "isthing": 1, "color": [250, 10, 15]},
    {"id": 50, "name": "truck", "isthing": 1, "color": [20, 255, 0]},
    {"id": 51, "name": "bicycle", "isthing": 1, "color": [31, 255, 0]},
    {"id": 52, "name": "motorcycle", "isthing": 1, "color": [255, 31, 0]},
    {"id": 53, "name": "wheeled_machine", "isthing": 0, "color": [255, 224, 0]},
    {"id": 54, "name": "ship_or_boat", "isthing": 1, "color": [153, 255, 0]},
    {"id": 55, "name": "raft", "isthing": 1, "color": [0, 0, 255]},
    {"id": 56, "name": "airplane", "isthing": 1, "color": [255, 71, 0]},
    {"id": 57, "name": "tyre", "isthing": 0, "color": [0, 235, 255]},
    {"id": 58, "name": "traffic_light", "isthing": 0, "color": [0, 173, 255]},
    {"id": 59, "name": "lamp", "isthing": 0, "color": [31, 0, 255]},
    {"id": 60, "name": "person", "isthing": 1, "color": [11, 200, 200]},
    {"id": 61, "name": "cat", "isthing": 1, "color": [255, 82, 0]},
    {"id": 62, "name": "dog", "isthing": 1, "color": [0, 255, 245]},
    {"id": 63, "name": "horse", "isthing": 1, "color": [0, 61, 255]},
    {"id": 64, "name": "cattle", "isthing": 1, "color": [0, 255, 112]},
    {"id": 65, "name": "other_animal", "isthing": 1, "color": [0, 255, 133]},
    {"id": 66, "name": "tree", "isthing": 0, "color": [255, 0, 0]},
    {"id": 67, "name": "flower", "isthing": 0, "color": [255, 163, 0]},
    {"id": 68, "name": "other_plant", "isthing": 0, "color": [255, 102, 0]},
    {"id": 69, "name": "toy", "isthing": 0, "color": [194, 255, 0]},
    {"id": 70, "name": "ball_net", "isthing": 0, "color": [0, 143, 255]},
    {"id": 71, "name": "backboard", "isthing": 0, "color": [51, 255, 0]},
    {"id": 72, "name": "skateboard", "isthing": 1, "color": [0, 82, 255]},
    {"id": 73, "name": "bat", "isthing": 0, "color": [0, 255, 41]},
    {"id": 74, "name": "ball", "isthing": 1, "color": [0, 255, 173]},
    {"id": 75, "name": "cupboard_or_showcase_or_storage_rack", "isthing": 0, "color": [10, 0, 255]},
    {"id": 76, "name": "box", "isthing": 1, "color": [173, 255, 0]},
    {"id": 77, "name": "traveling_case_or_trolley_case", "isthing": 1, "color": [0, 255, 153]},
    {"id": 78, "name": "basket", "isthing": 1, "color": [255, 92, 0]},
    {"id": 79, "name": "bag_or_package", "isthing": 1, "color": [255, 0, 255]},
    {"id": 80, "name": "trash_can", "isthing": 0, "color": [255, 0, 245]},
    {"id": 81, "name": "cage", "isthing": 0, "color": [255, 0, 102]},
    {"id": 82, "name": "plate", "isthing": 1, "color": [255, 173, 0]},
    {"id": 83, "name": "tub_or_bowl_or_pot", "isthing": 1, "color": [255, 0, 20]},
    {"id": 84, "name": "bottle_or_cup", "isthing": 1, "color": [255, 184, 184]},
    {"id": 85, "name": "barrel", "isthing": 1, "color": [0, 31, 255]},
    {"id": 86, "name": "fishbowl", "isthing": 1, "color": [0, 255, 61]},
    {"id": 87, "name": "bed", "isthing": 1, "color": [0, 71, 255]},
    {"id": 88, "name": "pillow", "isthing": 1, "color": [255, 0, 204]},
    {"id": 89, "name": "table_or_desk", "isthing": 1, "color": [0, 255, 194]},
    {"id": 90, "name": "chair_or_seat", "isthing": 1, "color": [0, 255, 82]},
    {"id": 91, "name": "bench", "isthing": 1, "color": [0, 10, 255]},
    {"id": 92, "name": "sofa", "isthing": 1, "color": [0, 112, 255]},
    {"id": 93, "name": "shelf", "isthing": 0, "color": [51, 0, 255]},
    {"id": 94, "name": "bathtub", "isthing": 0, "color": [0, 194, 255]},
    {"id": 95, "name": "gun", "isthing": 1, "color": [0, 122, 255]},
    {"id": 96, "name": "commode", "isthing": 1, "color": [0, 255, 163]},
    {"id": 97, "name": "roaster", "isthing": 1, "color": [255, 153, 0]},
    {"id": 98, "name": "other_machine", "isthing": 0, "color": [0, 255, 10]},
    {"id": 99, "name": "refrigerator", "isthing": 1, "color": [255, 112, 0]},
    {"id": 100, "name": "washing_machine", "isthing": 1, "color": [143, 255, 0]},
    {"id": 101, "name": "Microwave_oven", "isthing": 1, "color": [82, 0, 255]},
    {"id": 102, "name": "fan", "isthing": 1, "color": [163, 255, 0]},
    {"id": 103, "name": "curtain", "isthing": 0, "color": [255, 235, 0]},
    {"id": 104, "name": "textiles", "isthing": 0, "color": [8, 184, 170]},
    {"id": 105, "name": "clothes", "isthing": 0, "color": [133, 0, 255]},
    {"id": 106, "name": "painting_or_poster", "isthing": 1, "color": [0, 255, 92]},
    {"id": 107, "name": "mirror", "isthing": 1, "color": [184, 0, 255]},
    {"id": 108, "name": "flower_pot_or_vase", "isthing": 1, "color": [255, 0, 31]},
    {"id": 109, "name": "clock", "isthing": 1, "color": [0, 184, 255]},
    {"id": 110, "name": "book", "isthing": 0, "color": [0, 214, 255]},
    {"id": 111, "name": "tool", "isthing": 0, "color": [255, 0, 112]},
    {"id": 112, "name": "blackboard", "isthing": 0, "color": [92, 255, 0]},
    {"id": 113, "name": "tissue", "isthing": 0, "color": [0, 224, 255]},
    {"id": 114, "name": "screen_or_television", "isthing": 1, "color": [112, 224, 255]},
    {"id": 115, "name": "computer", "isthing": 1, "color": [70, 184, 160]},
    {"id": 116, "name": "printer", "isthing": 1, "color": [163, 0, 255]},
    {"id": 117, "name": "Mobile_phone", "isthing": 1, "color": [153, 0, 255]},
    {"id": 118, "name": "keyboard", "isthing": 1, "color": [71, 255, 0]},
    {"id": 119, "name": "other_electronic_product", "isthing": 0, "color": [255, 0, 163]},
    {"id": 120, "name": "fruit", "isthing": 0, "color": [255, 204, 0]},
    {"id": 121, "name": "food", "isthing": 0, "color": [255, 0, 143]},
    {"id": 122, "name": "instrument", "isthing": 1, "color": [0, 255, 235]},
    {"id": 123, "name": "train", "isthing": 1, "color": [133, 255, 0]}
]

# "Thing" and "stuff" subsets, in the same relative order as in CLASSES.
# Each entry is an independent copy (including its colour list), so editing
# one table never leaks into another.
CLASSES_THING = [
    {**itm, "color": list(itm["color"])}
    for itm in CLASSES if itm["isthing"] == 1
]

CLASSES_STUFF = [
    {**itm, "color": list(itm["color"])}
    for itm in CLASSES if itm["isthing"] == 0
]

# Raw panoptic id that ``vip2hb`` maps to the "no object" label.
NO_OBJ = 0
# Class id that ``vip2hb`` writes for "no object" pixels.
NO_OBJ_HB = 255
# Raw thing ids are decoded by ``vip2hb`` as ``class = id // DIVISOR_PAN``,
# ``instance = id % DIVISOR_PAN``.
DIVISOR_PAN = 100
# Converted ids are encoded as ``class * DIVISOR_NEW + instance``.
DIVISOR_NEW = 1000
# Category counts (58 things, 66 stuff), derived from the tables above.
NUM_THING = len(CLASSES_THING)
NUM_STUFF = len(CLASSES_STUFF)
# ``vip2hb`` asserts this is False: stuff classes are numbered before things.
THING_B_STUFF = False


def vip2hb(pan_map):
    """Re-encode a raw panoptic id map as ``class * DIVISOR_NEW + instance``.

    Every distinct value of ``pan_map`` is translated as follows:

    * ``NO_OBJ`` or ``200`` -> ``NO_OBJ_HB * DIVISOR_NEW`` (no object);
    * values above 128 are thing segments: ``value // DIVISOR_PAN`` is the
      1-based category id and ``value % DIVISOR_PAN`` the instance id. The
      new class is the category's position in ``CLASSES_THING`` shifted by
      ``NUM_STUFF``; the instance id is stored shifted by one;
    * any other value is a 1-based stuff category id and becomes its
      position in ``CLASSES_STUFF`` with instance ``0``.

    Args:
        pan_map: integer array of raw panoptic ids.

    Returns:
        Array shaped and typed like ``pan_map`` holding the converted ids.

    Raises:
        KeyError: if a value does not correspond to a known category.
        AssertionError: if ``THING_B_STUFF`` is set or a pixel was left
            unconverted.
    """
    assert not THING_B_STUFF, "VIPSeg only supports stuff -> thing"

    # Raw category ids are 1-based, the table ids are 0-based.
    thing_index = {}
    for order, entry in enumerate(CLASSES_THING):
        thing_index[entry['id'] + 1] = order
    stuff_index = {}
    for order, entry in enumerate(CLASSES_STUFF):
        stuff_index[entry['id'] + 1] = order

    # -1 marks pixels not converted yet; none may remain at the end.
    converted = -np.ones_like(pan_map)
    for raw_id in np.unique(pan_map):
        region = pan_map == raw_id
        if raw_id == NO_OBJ or raw_id == 200:
            converted[region] = NO_OBJ_HB * DIVISOR_NEW
            continue
        if raw_id > 128:
            category = raw_id // DIVISOR_PAN
            instance = raw_id % DIVISOR_PAN
            # since stuff -> thing: thing classes come after all stuff ones
            new_category = thing_index[category] + NUM_STUFF
            converted[region] = new_category * DIVISOR_NEW + instance + 1
            continue
        converted[region] = stuff_index[raw_id] * DIVISOR_NEW

    assert -1. not in np.unique(converted)
    return converted


def parse_args():
    """Parse the command line of the evaluation script.

    Returns:
        argparse.Namespace: with ``result_path`` (positional), ``gt_path``,
        ``split``, ``depth`` (flag) and ``nproc`` (int).
    """
    parser = argparse.ArgumentParser(description='Evaluation of DSTQ')
    parser.add_argument('result_path')
    optional_arguments = (
        ('--gt-path', dict(default='data/kitti-step')),
        ('--split', dict(default='val')),
        ('--depth', dict(action='store_true', help='eval depth')),
        ('--nproc', dict(default=32, type=int, help='number of process')),
    )
    for flag, options in optional_arguments:
        parser.add_argument(flag, **options)
    return parser.parse_args()


def vpq_eval(element):
    """Accumulate per-class panoptic-quality statistics for one clip.

    Ids are encoded as ``category * 2**16 + instance``. A ground-truth
    segment and a predicted segment of the same category form a true
    positive when their IoU exceeds 0.5; the union excludes the part of the
    prediction that lies on the void id (category 255, instance 0).
    Unmatched ground-truth segments outside the ignored category count as
    false negatives. Unmatched predictions count as false positives unless
    more than half of their area lies on ignored ground truth.

    Args:
        element: pair ``(pred_ids, gt_ids)`` of integer arrays of equal
            shape. Prediction ids must be below ``2**30`` for the joint
            encoding used here to be unambiguous.

    Returns:
        tuple: ``(iou_per_class, tp_per_class, fn_per_class, fp_per_class)``,
        four float64 arrays of length ``NUM_THING + NUM_STUFF + 1`` holding
        the summed IoU of the matches and the TP / FN / FP counts.
    """
    pred_ids, gt_ids = element
    max_ins = 2 ** 16
    ign_id = 255
    offset = 2 ** 30
    num_cat = NUM_THING + NUM_STUFF + 1

    iou_per_class = np.zeros(num_cat, dtype=np.float64)
    tp_per_class = np.zeros(num_cat, dtype=np.float64)
    fn_per_class = np.zeros(num_cat, dtype=np.float64)
    fp_per_class = np.zeros(num_cat, dtype=np.float64)

    def _ids_to_counts(id_array):
        """Map every distinct id of ``id_array`` to its pixel count."""
        ids, counts = np.unique(id_array, return_counts=True)
        # Keys are converted to Python ints so the id arithmetic below
        # (``id * offset + ...``) is done with arbitrary precision instead of
        # the array's dtype, which could overflow for 32-bit ids.
        return dict(zip(ids.tolist(), counts))

    pred_areas = _ids_to_counts(pred_ids)
    gt_areas = _ids_to_counts(gt_ids)

    void_id = ign_id * max_ins
    # Every ground-truth id of the ignored category, whatever its instance.
    ign_ids = {gt_id for gt_id in gt_areas if (gt_id // max_ins) == ign_id}

    # Joint id of each pixel: gt id in the high part, prediction id in the
    # low 30 bits, so one ``unique`` yields all pairwise intersections.
    int_ids = gt_ids.astype(np.int64) * offset + pred_ids.astype(np.int64)
    int_areas = _ids_to_counts(int_ids)

    def prediction_void_overlap(pred_id):
        """Area of ``pred_id`` that lies on the void ground-truth id."""
        void_int_id = void_id * offset + pred_id
        return int_areas.get(void_int_id, 0)

    def prediction_ignored_overlap(pred_id):
        """Area of ``pred_id`` that lies on any ignored ground-truth id."""
        total_ignored_overlap = 0
        for _ign_id in ign_ids:
            int_id = _ign_id * offset + pred_id
            total_ignored_overlap += int_areas.get(int_id, 0)
        return total_ignored_overlap

    gt_matched = set()
    pred_matched = set()

    for int_id, int_area in int_areas.items():
        gt_id, pred_id = divmod(int_id, offset)
        gt_id, pred_id = int(gt_id), int(pred_id)
        gt_cat = int(gt_id // max_ins)
        pred_cat = int(pred_id // max_ins)
        if gt_cat != pred_cat:
            continue
        union = (
            gt_areas[gt_id] + pred_areas[pred_id] - int_area -
            prediction_void_overlap(pred_id)
        )
        iou = int_area / union
        if iou > 0.5:
            tp_per_class[gt_cat] += 1
            iou_per_class[gt_cat] += iou
            gt_matched.add(gt_id)
            pred_matched.add(pred_id)

    for gt_id in gt_areas:
        if gt_id in gt_matched:
            continue
        cat_id = gt_id // max_ins
        if cat_id == ign_id:
            continue
        fn_per_class[cat_id] += 1

    for pred_id, pred_area in pred_areas.items():
        if pred_id in pred_matched:
            continue
        # Predictions that mostly cover ignored regions are not penalised.
        if (prediction_ignored_overlap(pred_id) / pred_area) > 0.5:
            continue
        cat = pred_id // max_ins
        fp_per_class[cat] += 1

    return iou_per_class, tp_per_class, fn_per_class, fp_per_class


def read_to_eval(element):
    """Load one clip from disk and score it with ``vpq_eval``.

    Args:
        element: pair ``(pred_list, gt_list)``. ``pred_list`` holds one
            ``(category_image_path, instance_image_path)`` tuple per frame
            and ``gt_list`` one ground-truth panoptic image path per frame.

    Returns:
        The tuple returned by ``vpq_eval`` for the clip.
    """
    max_ins = 2 ** 16
    pred_list, gt_list = element

    def _load_strip(paths, dtype):
        # Read every frame unchanged and stitch the frames side by side.
        frames = [mmcv.imread(path, flag='unchanged').astype(dtype)
                  for path in paths]
        return np.concatenate(frames, axis=1)

    cat_strip = _load_strip([paths[0] for paths in pred_list], np.int32)
    ins_strip = _load_strip([paths[1] for paths in pred_list], np.int32)
    # Prediction id = category * max_ins + instance.
    pred = cat_strip * max_ins + ins_strip

    # Ground truth is converted to ``class * DIVISOR_NEW + instance`` first
    # and then re-packed with the same ``max_ins`` base as the prediction.
    gt_strip = vip2hb(_load_strip(gt_list, np.int64))
    gt = (gt_strip // DIVISOR_NEW) * max_ins + gt_strip % DIVISOR_NEW

    return vpq_eval([pred, gt])


def eval_dvpq(result_dir, gt_dir, split='val', k=1, with_depth=True):
    """Compute and print video panoptic quality for a window of ``k`` frames.

    Sequences are listed in ``<gt_dir>/<split>.txt`` and their ground truth
    is read from ``<gt_dir>/panomasks/<sequence>/``. Predictions of the
    n-th sequence are read from ``<result_dir>/panoptic/<n>/``, where files
    whose name contains ``cat`` are category maps and files whose name
    contains ``ins`` are instance maps. Every window of ``k`` consecutive
    frames is scored by ``read_to_eval`` in a worker pool. Evaluation stops
    at the first sequence whose predictions are missing or incomplete and
    the score is computed from the sequences processed so far.

    Args:
        result_dir: root directory of the predictions.
        gt_dir: root directory of the ground truth.
        split: name of the split file (without ``.txt``).
        k: number of consecutive frames per window. Sequences shorter than
            ``k`` are evaluated as one window covering the whole sequence.
        with_depth: depth evaluation is not implemented; must be falsy.

    Raises:
        NotImplementedError: if ``with_depth`` is truthy.
    """
    if with_depth:
        raise NotImplementedError
    ann_folders = mmcv.list_from_file(os.path.join(gt_dir, "{}.txt".format(split)),
                                      prefix=os.path.join(gt_dir, 'panomasks') + '/')

    iou_per_class_all = []
    tp_per_class_all = []
    fn_per_class_all = []
    fp_per_class_all = []

    for seq_id, ann_folder in enumerate(ann_folders):
        gt_pan_names = sorted(
            name for name in mmcv.scandir(ann_folder) if '.png' in name)
        pred_dir = os.path.join(result_dir, 'panoptic', str(seq_id))
        if not os.path.exists(pred_dir):
            print("Error when seq_id is {}. But cal existing seqs.".format(seq_id))
            break
        pred_name_panoptic = list(mmcv.scandir(pred_dir))
        pred_ins_names = sorted(
            name for name in pred_name_panoptic if 'ins' in name)
        pred_cls_names = sorted(
            name for name in pred_name_panoptic if 'cat' in name)
        if len(gt_pan_names) != len(pred_ins_names):
            print("Error when seq_id is {}. But cal existing seqs.".format(seq_id))
            break
        assert len(pred_ins_names) == len(pred_cls_names)
        assert len(pred_cls_names) == len(gt_pan_names)
        len_seq = len(pred_ins_names)

        # Clamp the window for THIS sequence only; overwriting ``k`` would
        # shrink the window of every following sequence as well.
        window = min(k, len_seq)

        elements = []
        for idx in range(len_seq):
            if idx + window - 1 >= len_seq:
                break
            pred = []
            gt = []
            for j in range(idx, idx + window):
                # ``read_to_eval`` expects (category map, instance map): the
                # second path must be the ``ins`` file, not the ``cat`` one.
                pred.append((os.path.join(pred_dir, pred_cls_names[j]),
                             os.path.join(pred_dir, pred_ins_names[j])))
                gt.append(os.path.join(ann_folder, gt_pan_names[j]))
            elements.append((pred, gt))

        N = mp.cpu_count()
        with mp.Pool(processes=N) as p:
            results = p.map(read_to_eval, elements)

        # results[w] = (iou, tp, fn, fp) for window w; stack across windows.
        iou_per_class_all.append(np.stack([result[0] for result in results]))
        tp_per_class_all.append(np.stack([result[1] for result in results]))
        fn_per_class_all.append(np.stack([result[2] for result in results]))
        fp_per_class_all.append(np.stack([result[3] for result in results]))

    epsilon = 1e-10
    # Sum over all windows of all sequences and drop the trailing extra slot
    # so that only the real categories enter the average.
    num_classes = NUM_THING + NUM_STUFF
    iou_per_class_all = np.concatenate(iou_per_class_all, axis=0).sum(axis=0)[:num_classes]
    tp_per_class_all = np.concatenate(tp_per_class_all, axis=0).sum(axis=0)[:num_classes]
    fn_per_class_all = np.concatenate(fn_per_class_all, axis=0).sum(axis=0)[:num_classes]
    fp_per_class_all = np.concatenate(fp_per_class_all, axis=0).sum(axis=0)[:num_classes]

    sq = iou_per_class_all / (tp_per_class_all + epsilon)
    rq = tp_per_class_all / (tp_per_class_all + 0.5 * fn_per_class_all + 0.5 * fp_per_class_all + epsilon)
    pq = sq * rq
    # Stuff classes occupy the first NUM_STUFF slots, things the rest.
    spq = pq[:NUM_STUFF]
    tpq = pq[NUM_STUFF:]
    print(
        r'PQ : {:.3f} PQ_thing : {:.3f} PQ_stuff : {:.3f}'.format(
            pq.mean() * 100,
            tpq.mean() * 100,
            spq.mean() * 100)
    )


# Usage: python tools/eval_dvpq_vipseg.py /opt/data/results/test --gt-path /opt/data/VIPSeg
if __name__ == '__main__':
    args = parse_args()
    # Report the metric once for every window length k.
    for k in (1, 2, 4, 6):
        print("k={}".format(k))
        eval_dvpq(args.result_path, args.gt_path,
                  split=args.split, with_depth=args.depth, k=k)


================================================
FILE: tools/flops_counter.py
================================================
'''
Copyright (C) 2019 Sovrasov V. - All Rights Reserved
 * You may use, distribute and modify this code under the
 * terms of the MIT license.
 * You should have received a copy of the MIT license with
 * this file. If not visit https://opensource.org/licenses/MIT
'''

import sys
from functools import partial

import mmcv.cnn.bricks.transformer
import numpy as np
import torch
import torch.nn as nn

import mmcv

def get_model_complexity_info(model, input_res,
                              print_per_layer_stat=True,
                              as_strings=True,
                              input_constructor=None, ost=sys.stdout,
                              verbose=False, ignore_modules=None,
                              custom_modules_hooks=None):
    """Run one forward pass through ``model`` and report its cost.

    Args:
        model (nn.Module): Network to analyse. It is switched to eval mode
            and temporarily instrumented with forward hooks.
        input_res (tuple): Input shape without the batch dimension; a batch
            dimension of 1 is prepended when the dummy input is created.
        print_per_layer_stat (bool): Print the annotated module tree to
            ``ost`` when true.
        as_strings (bool): Return formatted strings instead of raw numbers.
        input_constructor (callable | None): When given, it is called with
            ``input_res`` and its result is passed to the model as keyword
            arguments instead of the default dummy tensor.
        ost: Stream that receives the per-layer report and warnings.
        verbose (bool): Forwarded to ``start_flops_count``.
        ignore_modules (list | None): Module types to exclude from counting.
        custom_modules_hooks (dict | None): Extra ``{module type: hook}``
            entries that take precedence over ``MODULES_MAPPING``.

    Returns:
        tuple: ``(flops, params)`` as strings when ``as_strings`` is true,
        otherwise as numbers.
    """
    assert type(input_res) is tuple
    assert len(input_res) >= 1
    assert isinstance(model, nn.Module)
    # ``None`` defaults avoid sharing one mutable list/dict between calls.
    if ignore_modules is None:
        ignore_modules = []
    if custom_modules_hooks is None:
        custom_modules_hooks = {}

    global CUSTOM_MODULES_MAPPING
    CUSTOM_MODULES_MAPPING = custom_modules_hooks
    try:
        flops_model = add_flops_counting_methods(model)
        flops_model.eval()
        flops_model.start_flops_count(ost=ost, verbose=verbose,
                                      ignore_list=ignore_modules)
        try:
            if input_constructor:
                model_kwargs = input_constructor(input_res)
                _ = flops_model(**model_kwargs)
            else:
                try:
                    # Match dtype/device of the model's first parameter.
                    batch = torch.ones(()).new_empty(
                        (1, *input_res),
                        dtype=next(flops_model.parameters()).dtype,
                        device=next(flops_model.parameters()).device)
                except StopIteration:
                    # Model without parameters: fall back to the defaults.
                    batch = torch.ones(()).new_empty((1, *input_res))

                _ = flops_model(batch)

            flops_count, params_count = \
                flops_model.compute_average_flops_cost()
            if print_per_layer_stat:
                print_model_with_flops(flops_model, flops_count,
                                       params_count, ost=ost)
        finally:
            # Always detach the hooks, even when the forward pass failed.
            # This must happen before the custom mapping is cleared because
            # hook removal consults it through is_supported_instance().
            flops_model.stop_flops_count()
    finally:
        CUSTOM_MODULES_MAPPING = {}

    if as_strings:
        return flops_to_string(flops_count), params_to_string(params_count)

    return flops_count, params_count


def flops_to_string(flops, units='GMac', precision=2):
    """Format a MAC count as a human-readable string.

    Args:
        flops: Number of multiply-accumulate operations.
        units (str | None): ``'GMac'``, ``'MMac'`` or ``'KMac'`` to force a
            unit; ``None`` picks the largest unit that yields a value of at
            least one. Any other value prints the raw count in ``Mac``.
        precision (int): Number of decimals kept after scaling.

    Returns:
        str: The scaled value followed by a space and the unit.
    """
    scales = (('GMac', 9), ('MMac', 6), ('KMac', 3))

    if units is None:
        # Automatic mode: first unit whose threshold is reached wins.
        for label, exponent in scales:
            if flops // 10**exponent > 0:
                return str(round(flops / 10.**exponent, precision)) + ' ' + label
        return str(flops) + ' Mac'

    for label, exponent in scales:
        if units == label:
            return str(round(flops / 10.**exponent, precision)) + ' ' + units
    return str(flops) + ' Mac'


def params_to_string(params_num, units=None, precision=2):
    """Format a parameter count as a human-readable string.

    Args:
        params_num: Number of parameters.
        units (str | None): ``'M'`` or ``'K'`` to force a unit; ``None``
            picks millions (``' M'``) or thousands (``' k'``) automatically.
            Any other value returns the plain number.
        precision (int): Number of decimals kept after scaling. It is now
            honoured in the automatic mode as well; previously that branch
            always rounded to two decimals regardless of this argument.

    Returns:
        str: The formatted count.
    """
    if units is None:
        if params_num // 10 ** 6 > 0:
            return str(round(params_num / 10 ** 6, precision)) + ' M'
        elif params_num // 10 ** 3:
            return str(round(params_num / 10 ** 3, precision)) + ' k'
        else:
            return str(params_num)
    else:
        if units == 'M':
            return str(round(params_num / 10.**6, precision)) + ' ' + units
        elif units == 'K':
            return str(round(params_num / 10.**3, precision)) + ' ' + units
        else:
            return str(params_num)


def accumulate_flops(self):
    """Return the operations counted for this module and its descendants.

    A module with a registered counter hook reports its own ``__flops__``;
    any other module reports the total of its direct children, each of which
    must carry its own bound ``accumulate_flops``.
    """
    if not is_supported_instance(self):
        return sum(child.accumulate_flops() for child in self.children())
    return self.__flops__


def print_model_with_flops(model, total_flops, total_params, units='GMac',
                           precision=3, ost=sys.stdout):
    """Print the module tree with per-module parameter and MAC statistics.

    Every module's ``extra_repr`` is temporarily replaced by one that
    prepends its accumulated parameter count, MAC count and their shares of
    the totals. All temporary attributes are removed again afterwards, even
    when printing fails.

    Args:
        model (nn.Module): Instrumented model; must carry
            ``__batch_counter__`` and the per-module counters.
        total_flops: Total MACs used as the 100% reference.
        total_params: Total parameters used as the 100% reference.
        units (str): Unit passed to ``flops_to_string``.
        precision (int): Decimals used for the formatted numbers.
        ost: Stream the representation is written to.
    """
    # Clamp both denominators so the percentage columns never divide by zero
    # (e.g. a model whose parameters are all frozen reports zero params).
    if total_flops < 1:
        total_flops = 1
    if total_params < 1:
        total_params = 1

    def accumulate_params(self):
        if is_supported_instance(self):
            return self.__params__
        else:
            sum = 0
            for m in self.children():
                sum += m.accumulate_params()
            return sum

    def flops_repr(self):
        accumulated_params_num = self.accumulate_params()
        accumulated_flops_cost = self.accumulate_flops() / model.__batch_counter__
        return ', '.join([params_to_string(accumulated_params_num,
                                           units='M', precision=precision),
                          '{:.3%} Params'.format(accumulated_params_num / total_params),
                          flops_to_string(accumulated_flops_cost,
                                          units=units, precision=precision),
                          '{:.3%} MACs'.format(accumulated_flops_cost / total_flops),
                          self.original_extra_repr()])

    def add_extra_repr(m):
        m.accumulate_flops = accumulate_flops.__get__(m)
        m.accumulate_params = accumulate_params.__get__(m)
        flops_extra_repr = flops_repr.__get__(m)
        if m.extra_repr != flops_extra_repr:
            m.original_extra_repr = m.extra_repr
            m.extra_repr = flops_extra_repr
            assert m.extra_repr != m.original_extra_repr

    def del_extra_repr(m):
        if hasattr(m, 'original_extra_repr'):
            m.extra_repr = m.original_extra_repr
            del m.original_extra_repr
        if hasattr(m, 'accumulate_flops'):
            del m.accumulate_flops
        # Also drop the bound local closure; leaving it behind keeps an
        # unpicklable attribute on every module of the model.
        if hasattr(m, 'accumulate_params'):
            del m.accumulate_params

    model.apply(add_extra_repr)
    try:
        print(repr(model), file=ost)
    finally:
        model.apply(del_extra_repr)


def get_model_parameters_number(model):
    """Count the elements of all parameters of ``model`` that require grad."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total


def add_flops_counting_methods(net_main_module):
    """Attach the counting API to ``net_main_module`` and reset its counters.

    The helper functions are bound to the module object itself so that each
    of them receives the module as ``self`` when called as a method.

    Returns:
        The same module object, now carrying ``start_flops_count``,
        ``stop_flops_count``, ``reset_flops_count`` and
        ``compute_average_flops_cost``.
    """
    counting_api = (start_flops_count, stop_flops_count,
                    reset_flops_count, compute_average_flops_cost)
    for func in counting_api:
        # Each function is stored under its own name as a bound method.
        setattr(net_main_module, func.__name__, func.__get__(net_main_module))

    net_main_module.reset_flops_count()

    return net_main_module


def compute_average_flops_cost(self):
    """
    Available on a net object after add_flops_counting_methods() was called
    on it.
    Returns a tuple: the accumulated operation count divided by the number
    of samples seen by the batch counter, and the number of parameters that
    require gradients.
    """
    all_modules = list(self.modules())

    # Temporarily give every module a bound accumulate_flops so the
    # recursive summation can walk the tree.
    for mod in all_modules:
        mod.accumulate_flops = accumulate_flops.__get__(mod)

    total_flops = self.accumulate_flops()

    for mod in all_modules:
        if hasattr(mod, 'accumulate_flops'):
            del mod.accumulate_flops

    params_sum = get_model_parameters_number(self)
    return total_flops / self.__batch_counter__, params_sum


def start_flops_count(self, **kwargs):
    """
    Available on a net object after add_flops_counting_methods() was called
    on it.
    Registers the batch counter hook on the net and a counting hook on every
    supported sub-module. Call it before running the network.

    Keyword arguments ``ost``, ``verbose`` and ``ignore_list`` are forwarded
    to the per-module registration function.
    """
    add_batch_counter_hook_function(self)

    handled_types = set()
    silent_containers = (nn.Sequential, nn.ModuleList)

    def add_flops_counter_hook_function(module, ost, verbose, ignore_list):
        module_type = type(module)

        # Explicitly ignored types: no hook, and their parameters are not
        # attributed to them either.
        if module_type in ignore_list:
            handled_types.add(module_type)
            if is_supported_instance(module):
                module.__params__ = 0
            return

        if is_supported_instance(module):
            if hasattr(module, '__flops_handle__'):
                # Already instrumented.
                return
            # Custom hooks take precedence over the built-in table.
            if module_type in CUSTOM_MODULES_MAPPING:
                hook = CUSTOM_MODULES_MAPPING[module_type]
            else:
                hook = MODULES_MAPPING[module_type]
            module.__flops_handle__ = module.register_forward_hook(hook)
            handled_types.add(module_type)
            return

        # Unsupported module: warn once per type when asked to.
        warn = (verbose
                and module_type not in silent_containers
                and module_type not in handled_types)
        if warn:
            print('Warning: module ' + type(module).__name__ +
                  ' is treated as a zero-op.', file=ost)
        handled_types.add(module_type)

    self.apply(partial(add_flops_counter_hook_function, **kwargs))


def stop_flops_count(self):
    """
    Available on a net object after add_flops_counting_methods() was called
    on it.
    Detaches the batch counter hook from the net and the counting hooks from
    all of its modules; counters collected so far are left untouched.
    """
    remove_batch_counter_hook_function(self)
    self.apply(remove_flops_counter_hook_function)


def reset_flops_count(self):
    """
    Available on a net object after add_flops_counting_methods() was called
    on it.
    Zeroes the batch counter of the net and (re)initialises the per-module
    counters of every supported module.
    """
    add_batch_counter_variables_or_reset(self)
    self.apply(add_flops_counter_variable_or_reset)


# ---- Internal functions
def empty_flops_counter_hook(module, input, output):
    """Forward hook for modules that should contribute zero operations."""
    module.__flops__ = module.__flops__ + 0


def upsample_flops_counter_hook(module, input, output):
    """Forward hook for upsampling layers.

    Adds the number of elements of ``output[0]`` (the product of all of its
    dimensions) to the module's counter.
    """
    dims = tuple(output[0].shape)
    element_count = dims[0]
    for extent in dims[1:]:
        element_count *= extent
    module.__flops__ += int(element_count)


def relu_flops_counter_hook(module, input, output):
    """Forward hook for activations: one operation per output element."""
    module.__flops__ += int(output.numel())


def linear_flops_counter_hook(module, input, output):
    """Forward hook for fully connected layers.

    Counts (number of input elements) x (size of the output's last
    dimension), plus one extra operation per output feature when the layer
    has a bias.
    """
    features_in = input[0]
    # Shape compatibility is enforced by the layer itself, so only the last
    # output dimension is needed here.
    out_width = output.shape[-1]
    total = np.prod(features_in.shape) * out_width
    if module.bias is not None:
        total = total + out_width
    module.__flops__ += int(total)


def pool_flops_counter_hook(module, input, output):
    """Forward hook for pooling layers: one operation per input element."""
    element_count = np.prod(input[0].shape)
    module.__flops__ += int(element_count)


def bn_flops_counter_hook(module, input, output):
    """Forward hook for normalisation layers exposing ``affine``.

    Counts one operation per input element, doubled when the layer applies
    an affine transform.
    """
    element_count = np.prod(input[0].shape)
    factor = 2 if module.affine else 1
    module.__flops__ += int(element_count * factor)


def conv_flops_counter_hook(conv_module, input, output):
    """Forward hook for (transposed) convolutions.

    Per output position the cost is
    ``prod(kernel_size) * in_channels * (out_channels // groups)``; this is
    multiplied by batch size and the number of output positions, and one
    operation per output value is added when the layer has a bias.
    """
    # Only the first positional input is considered.
    feature_map = input[0]

    n_positions = feature_map.shape[0] * int(np.prod(list(output.shape[2:])))

    kernel_volume = int(np.prod(list(conv_module.kernel_size)))
    filters_per_group = conv_module.out_channels // conv_module.groups
    macs_per_position = \
        kernel_volume * conv_module.in_channels * filters_per_group

    total = macs_per_position * n_positions
    if conv_module.bias is not None:
        total += conv_module.out_channels * n_positions

    conv_module.__flops__ += int(total)


def batch_counter_hook(module, input, output):
    """Forward hook that adds the processed batch size to the module.

    The batch size is the length of the first positional input. When the
    module received no positional inputs a warning is printed and a batch
    size of one is assumed.
    """
    if len(input) > 0:
        # Several inputs are possible; the first one defines the batch size.
        samples = len(input[0])
    else:
        print('Warning! No positional inputs found for a module,'
              ' assuming batch size is 1.')
        samples = 1
    module.__batch_counter__ += samples


def rnn_flops(flops, rnn_module, w_ih, w_hh, input_size):
    """Add the cost of one recurrent layer step to ``flops``.

    The two weight matrices contribute one operation per entry. On top of
    that a multiple of ``hidden_size`` is added depending on the cell type.
    ``input_size`` is accepted for signature compatibility but not used.

    Returns:
        The updated operation count.
    """
    # Input-to-hidden and hidden-to-hidden matrix products.
    total = flops + w_ih.shape[0] * w_ih.shape[1] + w_hh.shape[0] * w_hh.shape[1]

    if isinstance(rnn_module, (nn.RNN, nn.RNNCell)):
        # Adding both products.
        per_hidden = 1
    elif isinstance(rnn_module, (nn.GRU, nn.GRUCell)):
        # Hadamard of r (1), additions from both states (3),
        # final two Hadamard products and add (3).
        per_hidden = 1 + 3 + 3
    elif isinstance(rnn_module, (nn.LSTM, nn.LSTMCell)):
        # Additions from both states (4), two Hadamard products and add for
        # the C state (3), final Hadamard (3).
        per_hidden = 4 + 3 + 3
    else:
        return total

    return total + per_hidden * rnn_module.hidden_size


def rnn_flops_counter_hook(rnn_module, input, output):
    """
    Forward hook for multi-layer recurrent modules.
    The first two dimensions of the input sequence are used as batch size
    and sequence length; since both only enter as factors of a product,
    their order does not influence the result.
    If sigmoid and tanh are hard variants, only a comparison of FLOPS
    should be considered accurate.
    """
    # ``input`` holds the sequence and, optionally, the hidden state.
    sequence = input[0]
    batch_size = sequence.shape[0]
    seq_length = sequence.shape[1]

    total = 0
    for layer_idx in range(rnn_module.num_layers):
        suffix = '_l' + str(layer_idx)
        w_ih = getattr(rnn_module, 'weight_ih' + suffix)
        w_hh = getattr(rnn_module, 'weight_hh' + suffix)
        # Only the first layer consumes the external input.
        input_size = (rnn_module.input_size if layer_idx == 0
                      else rnn_module.hidden_size)
        total = rnn_flops(total, rnn_module, w_ih, w_hh, input_size)
        if rnn_module.bias:
            b_ih = getattr(rnn_module, 'bias_ih' + suffix)
            b_hh = getattr(rnn_module, 'bias_hh' + suffix)
            total += b_ih.shape[0] + b_hh.shape[0]

    total *= batch_size
    total *= seq_length
    if rnn_module.bidirectional:
        total *= 2
    rnn_module.__flops__ += int(total)


def rnn_cell_flops_counter_hook(rnn_cell_module, input, output):
    """Forward hook for single recurrent cells.

    Computes the cost of one cell step via ``rnn_flops``, adds one operation
    per bias entry when the cell has biases, and scales by the batch size
    (first dimension of the first input).
    """
    step_input = input[0]
    batch_size = step_input.shape[0]

    w_ih = getattr(rnn_cell_module, 'weight_ih')
    w_hh = getattr(rnn_cell_module, 'weight_hh')
    total = rnn_flops(0, rnn_cell_module, w_ih, w_hh, step_input.shape[1])

    if rnn_cell_module.bias:
        b_ih = getattr(rnn_cell_module, 'bias_ih')
        b_hh = getattr(rnn_cell_module, 'bias_hh')
        total += b_ih.shape[0] + b_hh.shape[0]

    rnn_cell_module.__flops__ += int(total * batch_size)

def ffn_hook(module, input, output):
    """Forward hook counting the linear layers of a feed-forward block.

    Iterates over ``module.layers``; an ``nn.Sequential`` entry is
    represented by its first element. Every ``nn.Linear`` found contributes
    ``positions * in_features * out_features`` operations plus
    ``out_features`` when it has a bias, where ``positions`` is the number
    of feature vectors in the input (product of all dimensions except the
    last). Entries that are not linear layers are skipped.

    The position count used to be hard-coded as ``shape[0] * shape[1]``,
    which is only right for 3-D inputs; results for 3-D inputs are
    unchanged.
    """
    features = input[0]
    # Linear layers act on the last dimension only, so every leading
    # dimension multiplies the number of times a layer is applied.
    positions = 1
    for extent in features.shape[:-1]:
        positions *= int(extent)

    for layer in module.layers:
        layer_cur = layer[0] if isinstance(layer, nn.Sequential) else layer
        if not isinstance(layer_cur, nn.Linear):
            continue
        # pytorch checks dimensions, so here we don't care much
        output_last_dim = layer_cur.out_features
        bias_flops = output_last_dim if layer_cur.bias is not None else 0
        module.__flops__ += int(
            positions * layer_cur.in_features * output_last_dim + bias_flops)

def multihead_attention_counter_hook(multihead_attention_module, input, output):
    """Forward hook estimating the cost of a multi-head attention layer.

    The estimate covers the input projections (plus their bias when
    ``in_proj_bias`` is set), the per-head scale / matmul / softmax / matmul
    sequence and the final output projection with bias, all multiplied by
    the batch size.

    Changes compared with the previous version:
      * the embedding size is read from ``embed_dims`` when present and from
        ``embed_dim`` otherwise, so the hook works on
        ``torch.nn.MultiheadAttention`` (which only has ``embed_dim``);
      * ``batch_first`` modules are handled (defaults to sequence-first when
        the attribute is missing);
      * extra positional arguments after query/key/value are tolerated;
      * leftover debug printing was removed.
    """
    if len(input) == 0:
        # The module was called with keyword arguments only, so the hook
        # receives no positional inputs. Use the attention output as a
        # stand-in for query, key and value; this is only exact when all
        # three share the output's shape (self-attention).
        q = k = v = output[0]
    else:
        q, k, v = input[:3]

    if getattr(multihead_attention_module, 'batch_first', False):
        batch_idx, len_idx = 0, 1
    else:
        batch_idx, len_idx = 1, 0
    batch_size = q.shape[batch_idx]
    q_len = q.shape[len_idx]
    k_len = k.shape[len_idx]
    v_len = v.shape[len_idx]

    num_heads = multihead_attention_module.num_heads
    embed_dims = getattr(multihead_attention_module, 'embed_dims', None)
    if embed_dims is None:
        embed_dims = multihead_attention_module.embed_dim
    kdim = multihead_attention_module.kdim
    vdim = multihead_attention_module.vdim
    if kdim is None:
        kdim = embed_dims
    if vdim is None:
        vdim = embed_dims

    # initial projections
    flops = q_len * q.shape[2] * embed_dims + \
        k_len * k.shape[2] * kdim + \
        v_len * v.shape[2] * vdim
    if multihead_attention_module.in_proj_bias is not None:
        flops += (q_len + k_len + v_len) * embed_dims

    # attention heads: scale, matmul, softmax, matmul
    head_dim = embed_dims // num_heads
    head_flops = q_len * head_dim + \
        head_dim * q_len * k_len + \
        q_len * k_len + \
        q_len * k_len * head_dim

    flops += num_heads * head_flops

    # final projection, bias is always enabled
    flops += q_len * embed_dims * (embed_dims + 1)

    flops *= batch_size
    multihead_attention_module.__flops__ += int(flops)


def add_batch_counter_variables_or_reset(module):
    """Create the module's batch counter or set it back to zero."""
    setattr(module, '__batch_counter__', 0)


def add_batch_counter_hook_function(module):
    """Register the batch counter hook on ``module`` unless already present.

    The hook handle is kept on the module as ``__batch_counter_handle__`` so
    it can be removed later.
    """
    if not hasattr(module, '__batch_counter_handle__'):
        module.__batch_counter_handle__ = \
            module.register_forward_hook(batch_counter_hook)


def remove_batch_counter_hook_function(module):
    """Detach the batch counter hook from ``module`` if one is registered."""
    if not hasattr(module, '__batch_counter_handle__'):
        return
    handle = module.__batch_counter_handle__
    handle.remove()
    del module.__batch_counter_handle__


def add_flops_counter_variable_or_reset(module):
    """Initialise ``__flops__`` and ``__params__`` on a supported module.

    Unsupported modules are left untouched. A warning is printed when either
    attribute already exists before it is overwritten.
    """
    if not is_supported_instance(module):
        return

    already_tagged = hasattr(module, '__flops__') or hasattr(module, '__params__')
    if already_tagged:
        print('Warning: variables __flops__ or __params__ are already '
              'defined for the module' + type(module).__name__ +
              ' ptflops can affect your code!')

    module.__flops__ = 0
    module.__params__ = get_model_parameters_number(module)


# User-supplied {module type: forward hook} table. get_model_complexity_info()
# replaces it with its ``custom_modules_hooks`` argument for the duration of a
# run and resets it to an empty dict afterwards; entries here are checked
# before MODULES_MAPPING when hooks are registered.
CUSTOM_MODULES_MAPPING = {}

def norm_flops_counter_hook(module, input, output):
    """Forward hook for normalisation layers.

    Counts one operation per input element, doubled when the module has a
    truthy ``affine`` or ``elementwise_affine`` attribute.
    """
    element_count = np.prod(input[0].shape)
    applies_affine = (getattr(module, 'affine', False)
                      or getattr(module, 'elementwise_affine', False))
    if applies_affine:
        element_count = element_count * 2
    module.__flops__ += int(element_count)

# Built-in {module type: forward hook} table. Lookups use the exact
# ``type(module)``, so subclasses of the listed types are not matched unless
# they are listed themselves (or supplied through CUSTOM_MODULES_MAPPING).
MODULES_MAPPING = {
    # convolutions
    nn.Conv1d: conv_flops_counter_hook,
    nn.Conv2d: conv_flops_counter_hook,
    nn.Conv3d: conv_flops_counter_hook,
    # activations
    nn.ReLU: relu_flops_counter_hook,
    nn.PReLU: relu_flops_counter_hook,
    nn.ELU: relu_flops_counter_hook,
    nn.LeakyReLU: relu_flops_counter_hook,
    nn.ReLU6: relu_flops_counter_hook,
    # poolings
    nn.MaxPool1d: pool_flops_counter_hook,
    nn.AvgPool1d: pool_flops_counter_hook,
    nn.AvgPool2d: pool_flops_counter_hook,
    nn.MaxPool2d: pool_flops_counter_hook,
    nn.MaxPool3d: pool_flops_counter_hook,
    nn.AvgPool3d: pool_flops_counter_hook,
    nn.AdaptiveMaxPool1d: pool_flops_counter_hook,
    nn.AdaptiveAvgPool1d: pool_flops_counter_hook,
    nn.AdaptiveMaxPool2d: pool_flops_counter_hook,
    nn.AdaptiveAvgPool2d: pool_flops_counter_hook,
    nn.AdaptiveMaxPool3d: pool_flops_counter_hook,
    nn.AdaptiveAvgPool3d: pool_flops_counter_hook,
    # BNs
    nn.BatchNorm1d: bn_flops_counter_hook,
    nn.BatchNorm2d: bn_flops_counter_hook,
    nn.BatchNorm3d: bn_flops_counter_hook,

    # instance / group normalisation reuse the batch-norm hook
    nn.InstanceNorm1d: bn_flops_counter_hook,
    nn.InstanceNorm2d: bn_flops_counter_hook,
    nn.InstanceNorm3d: bn_flops_counter_hook,
    nn.GroupNorm: bn_flops_counter_hook,

    # normalizations
    # (disabled alternative entries that would route the layers above through
    # norm_flops_counter_hook instead; only LayerNorm uses it at present)
    # nn.BatchNorm1d: norm_flops_counter_hook,
    # nn.BatchNorm2d: norm_flops_counter_hook,
    # nn.BatchNorm3d: norm_flops_counter_hook,
    # nn.GroupNorm: norm_flops_counter_hook,
    # nn.InstanceNorm1d: norm_flops_counter_hook,
    # nn.InstanceNorm2d: norm_flops_counter_hook,
    # nn.InstanceNorm3d: norm_flops_counter_hook,
    nn.LayerNorm: norm_flops_counter_hook,

    # FC
    nn.Linear: linear_flops_counter_hook,
    # Upscale
    nn.Upsample: upsample_flops_counter_hook,
    # Deconvolution
    nn.ConvTranspose1d: conv_flops_counter_hook,
    nn.ConvTranspose2d: conv_flops_counter_hook,
    nn.ConvTranspose3d: conv_flops_counter_hook,
    # RNN
    nn.RNN: rnn_flops_counter_hook,
    nn.GRU: rnn_flops_counter_hook,
    nn.LSTM: rnn_flops_counter_hook,
    nn.RNNCell: rnn_cell_flops_counter_hook,
    nn.LSTMCell: rnn_cell_flops_counter_hook,
    nn.GRUCell: rnn_cell_flops_counter_hook,
    # attention
    nn.MultiheadAttention: multihead_attention_counter_hook,

    # mmcv feed-forward block
    mmcv.cnn.bricks.transformer.FFN:ffn_hook
}


def is_supported_instance(module):
    """Tell whether a counting hook exists for the exact type of ``module``.

    Both the built-in table and the currently installed custom table are
    consulted.
    """
    module_type = type(module)
    return (module_type in MODULES_MAPPING
            or module_type in CUSTOM_MODULES_MAPPING)


def remove_flops_counter_hook_function(module):
    """Detach the counting hook from a supported module, if it has one."""
    if not is_supported_instance(module):
        return
    if not hasattr(module, '__flops_handle__'):
        return
    module.__flops_handle__.remove()
    del module.__flops_handle__


================================================
FILE: tools/get_flops.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import argparse

import numpy as np
import torch
from mmcv import Config, DictAction

from mmdet.models import build_detector

try:
    from mmcv.cnn import get_model_complexity_info
    # from tools.flops_counter import get_model_complexity_info
except ImportError:
    raise ImportError('Please upgrade mmcv to >0.6.2')


def parse_args():
    """Parse the command-line options of the FLOPs script.

    Returns:
        argparse.Namespace: With ``config``, ``shape`` (one or two ints),
        ``cfg_options`` and ``size_divisor``.
    """
    cfg_options_help = (
        'override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    size_divisor_help = (
        'Pad the input image, the minimum size that is divisible '
        'by size_divisor, -1 means do not pad the image.')

    parser = argparse.ArgumentParser(description='Train a detector')
    parser.add_argument('config', help='train config file path')
    parser.add_argument(
        '--shape', type=int, nargs='+', default=[1280, 800],
        help='input image size')
    parser.add_argument(
        '--cfg-options', nargs='+', action=DictAction,
        help=cfg_options_help)
    parser.add_argument(
        '--size-divisor', type=int, default=32, help=size_divisor_help)
    return parser.parse_args()


def main():
    """Build the detector described by the config and print its complexity.

    The requested input shape is optionally rounded up to a multiple of
    ``--size-divisor``. The model's ``forward`` is replaced by its
    ``forward_dummy`` before the complexity counter is run.

    Raises:
        ValueError: If ``--shape`` has more than two values.
        NotImplementedError: If the model has no ``forward_dummy`` method.
    """

    args = parse_args()

    if len(args.shape) == 1:
        h = w = args.shape[0]
    elif len(args.shape) == 2:
        h, w = args.shape
    else:
        raise ValueError('invalid input shape')
    orig_shape = (3, h, w)
    divisor = args.size_divisor
    if divisor > 0:
        # Round both sides up to the next multiple of the divisor.
        h = int(np.ceil(h / divisor)) * divisor
        w = int(np.ceil(w / divisor)) * divisor

    input_shape = (3, h, w)

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # import modules from string list.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])

    model = build_detector(
        cfg.model,
        train_cfg=cfg.get('train_cfg'),
        test_cfg=cfg.get('test_cfg'))
    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    if hasattr(model, 'forward_dummy'):
        model.forward = model.forward_dummy
    else:
        # Message fixed: it used to read "currently not currently supported".
        raise NotImplementedError(
            'FLOPs counter is currently not supported with {}'.
            format(model.__class__.__name__))

    flops, params = get_model_complexity_info(model, input_shape)
    split_line = '=' * 30

    if divisor > 0 and \
            input_shape != orig_shape:
        print(f'{split_line}\nUse size divisor set input shape '
              f'from {orig_shape} to {input_shape}\n')
    print(f'{split_line}\nInput shape: {input_shape}\n'
          f'Flops: {flops}\nParams: {params}\n{split_line}')
    print('!!!Please be cautious if you use the results in papers. '
          'You may need to check if all ops are supported and verify that the '
          'flops computation is correct.')


if __name__ == '__main__':
    main()


================================================
FILE: tools/inference_kitti_step.sh
================================================
#!/usr/bin/env bash
# Run test_dvps.py on a config/checkpoint pair, writing results to LOG, then
# run eval_dstq_step.py on that directory.
#
# Usage: inference_kitti_step.sh CONFIG CHECKPOINT LOG [extra test_dvps.py args...]
#
# All expansions are quoted so that paths containing spaces are passed through
# as single arguments.

CONFIG=$1
CHECKPOINT=$2
LOG=$3

# Example arguments:
# configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2.py logger/models/video_knet_vis/video_knet_step_quansi_r50.pth logger/results/kitti_step_merge_joint_semantic_filter
# configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2.py logger/models/video_knet_vis/video_knet_step_quansi_r50.pth logger/results/kitti_step_semantic_filter

# Example extra arguments:
# --cfg-options data.test.split=val model.roi_head.merge_joint=True model.semantic_filter=True
# --cfg-options data.test.split=val model.roi_head.merge_joint=False model.semantic_filter=True

# Directory containing this script; the repository root is its parent.
TOOLS_DIR="$(dirname "$0")"

PYTHONPATH="${TOOLS_DIR}/..":$PYTHONPATH \
python "${TOOLS_DIR}/test_dvps.py" "$CONFIG" "$CHECKPOINT" --eval dummy --show-dir "$LOG" "${@:4}"

PYTHONPATH="${TOOLS_DIR}/..":$PYTHONPATH \
python "${TOOLS_DIR}/eval_dstq_step.py" "$LOG"


================================================
FILE: tools/slurm_test.sh
================================================
#!/usr/bin/env bash
# Launch tools/test.py through srun.
#
# Usage: [GPUS=n] [GPUS_PER_NODE=n] [CPUS_PER_TASK=n] [SRUN_ARGS="..."] \
#        slurm_test.sh PARTITION JOB_NAME CONFIG CHECKPOINT [test.py args...]
#
# tools/test.py is referenced relative to the current working directory.

# Echo every command before it is executed.
set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
# Resource settings, overridable through the environment.
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
# Everything after the fourth argument is forwarded to the Python script.
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}

# The parent of this script's directory is prepended to PYTHONPATH.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}


================================================
FILE: tools/slurm_test_dvps.sh
================================================
#!/usr/bin/env bash
# Launch tools/test_dvps.py through srun.
#
# Usage: [GPUS=n] [GPUS_PER_NODE=n] [CPUS_PER_TASK=n] [SRUN_ARGS="..."] \
#        slurm_test_dvps.sh PARTITION JOB_NAME CONFIG CHECKPOINT [test_dvps.py args...]
#
# tools/test_dvps.py is referenced relative to the current working directory.

# Echo every command before it is executed.
set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
# Resource settings, overridable through the environment.
GPUS=${GPUS:-1}
GPUS_PER_NODE=${GPUS_PER_NODE:-1}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
# Everything after the fourth argument is forwarded to the Python script.
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}

# The parent of this script's directory is prepended to PYTHONPATH.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/test_dvps.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}

================================================
FILE: tools/slurm_test_step.sh
================================================
#!/usr/bin/env bash
# Launch tools/test_step.py through srun.
#
# Usage: [GPUS=n] [GPUS_PER_NODE=n] [CPUS_PER_TASK=n] [SRUN_ARGS="..."] \
#        slurm_test_step.sh PARTITION JOB_NAME CONFIG CHECKPOINT [test_step.py args...]
#
# tools/test_step.py is referenced relative to the current working directory.

# Echo every command before it is executed.
set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
# Resource settings, overridable through the environment.
GPUS=${GPUS:-1}
GPUS_PER_NODE=${GPUS_PER_NODE:-1}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
# Everything after the fourth argument is forwarded to the Python script.
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}

# The parent of this script's directory is prepended to PYTHONPATH.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/test_step.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}

================================================
FILE: tools/slurm_test_vis.sh
================================================
#!/usr/bin/env bash
# Launch tools/test_vis.py through srun.
#
# Usage: [GPUS=n] [GPUS_PER_NODE=n] [CPUS_PER_TASK=n] [SRUN_ARGS="..."] \
#        slurm_test_vis.sh PARTITION JOB_NAME CONFIG CHECKPOINT [test_vis.py args...]
#
# tools/test_vis.py is referenced relative to the current working directory.

# Echo every command before it is executed.
set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
# Resource settings, overridable through the environment.
GPUS=${GPUS:-1}
GPUS_PER_NODE=${GPUS_PER_NODE:-1}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
# Everything after the fourth argument is forwarded to the Python script.
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}

# The parent of this script's directory is prepended to PYTHONPATH.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/test_vis.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}

================================================
FILE: tools/slurm_test_vps.sh
================================================
#!/usr/bin/env bash
# Launch tools/test_vps_two_frames.py through srun.
#
# Usage: [GPUS=n] [GPUS_PER_NODE=n] [CPUS_PER_TASK=n] [SRUN_ARGS="..."] \
#        slurm_test_vps.sh PARTITION JOB_NAME CONFIG CHECKPOINT [script args...]
#
# tools/test_vps_two_frames.py is referenced relative to the current working
# directory.

# Echo every command before it is executed.
set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
# Resource settings, overridable through the environment.
# NOTE(review): with these defaults a single task requests 8 GPUs per node
# (GPUS=1, GPUS_PER_NODE=8) -- confirm this combination is intended.
GPUS=${GPUS:-1}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
# Everything after the fourth argument is forwarded to the Python script.
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}

# The parent of this script's directory is prepended to PYTHONPATH.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/test_vps_two_frames.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}

================================================
FILE: tools/slurm_train.sh
================================================
#!/usr/bin/env bash
# Launch tools/train.py through srun.
#
# Usage: [GPUS=n] [GPUS_PER_NODE=n] [CPUS_PER_TASK=n] [SRUN_ARGS="..."] \
#        slurm_train.sh PARTITION JOB_NAME CONFIG WORK_DIR [train.py args...]
#
# tools/train.py is referenced relative to the current working directory.

# Echo every command before it is executed.
set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
WORK_DIR=$4
# Resource settings, overridable through the environment.
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}
# Everything after the fourth argument is forwarded to the Python script.
PY_ARGS=${@:5}

# The parent of this script's directory is prepended to PYTHONPATH.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}


================================================
FILE: tools/test.py
================================================
import argparse
import os
import warnings

import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                         wrap_fp16_model)
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector

from external.test import multi_gpu_test, single_gpu_test


def parse_args():
    """Parse the command-line options of the test script.

    Besides parsing, this function
      * exports ``LOCAL_RANK`` to the environment (from ``--local_rank``)
        when it is not set yet, and
      * maps the deprecated ``--options`` onto ``--eval-options`` with a
        warning.

    Two help texts were missing a space where adjacent string literals are
    joined ("increasethe", "isuseful"); this has been corrected.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('--out', help='output result file in pickle format')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--format-only',
        action='store_true',
        help='Format the output results without perform evaluation. It is '
        'useful when you want to format the result to a specific format and '
        'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
        'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    # Only fill in LOCAL_RANK when the launcher has not provided it.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


def main():
    """Test (and optionally evaluate) a detector described by a config file.

    The function parses the command line, builds the test dataset, the
    dataloader and the model, loads the checkpoint, runs single- or
    multi-GPU inference and finally, on rank 0 only, dumps / formats /
    evaluates the collected outputs.

    Raises:
        AssertionError: if none of ``--out``, ``--eval``, ``--format-only``,
            ``--show`` or ``--show-dir`` was given.
        ValueError: if ``--eval`` and ``--format-only`` are combined, or if
            ``--out`` does not end with ``.pkl`` / ``.pickle``.
    """
    args = parse_args()

    any_operation = (args.out or args.eval or args.format_only or args.show
                     or args.show_dir)
    assert any_operation, \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # Import the extra modules listed in the config, if any.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    # Enable the cudnn auto-tuner when the config asks for it.
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # cfg.model.pretrained = None
    # Clear the ``pretrained`` entry of every RFP backbone found in the neck
    # config (the neck may be a single config or a list of configs).
    if cfg.model.get('neck'):
        neck = cfg.model.neck
        neck_cfgs = neck if isinstance(neck, list) else [neck]
        for neck_cfg in neck_cfgs:
            if neck_cfg.get('rfp_backbone') and \
                    neck_cfg.rfp_backbone.get('pretrained'):
                neck_cfg.rfp_backbone.pretrained = None

    # The test dataset config is either one dict or a list of dicts
    # (concatenated datasets).
    samples_per_gpu = 1
    test_cfg = cfg.data.test
    if isinstance(test_cfg, dict):
        test_cfg.test_mode = True
        samples_per_gpu = test_cfg.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            test_cfg.pipeline = replace_ImageToTensor(test_cfg.pipeline)
    elif isinstance(test_cfg, list):
        for ds_cfg in test_cfg:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in test_cfg])
        if samples_per_gpu > 1:
            for ds_cfg in test_cfg:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # init distributed env first, since logger depends on the dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    # Dataset and dataloader.
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # Model and checkpoint.
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    if cfg.get('fp16', None) is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # Checkpoints without class info in their meta fall back to the class
    # names of the dataset (backward compatibility).
    ckpt_meta = checkpoint.get('meta', {})
    if 'CLASSES' in ckpt_meta:
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    if distributed:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)
    else:
        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
                                  args.show_score_thr)

    # Only rank 0 post-processes the gathered outputs.
    rank, _ = get_dist_info()
    if rank != 0:
        return

    if args.out:
        print(f'\nwriting results to {args.out}')
        mmcv.dump(outputs, args.out)
    kwargs = args.eval_options if args.eval_options is not None else {}
    if args.format_only:
        dataset.format_results(outputs, **kwargs)
    if args.eval:
        eval_kwargs = cfg.get('evaluation', {}).copy()
        # Drop the keys that configure the EvalHook rather than
        # dataset.evaluate().
        hook_only_keys = ('interval', 'tmpdir', 'start', 'gpu_collect',
                          'save_best', 'rule', 'by_epoch')
        for key in hook_only_keys:
            eval_kwargs.pop(key, None)
        eval_kwargs.update(dict(metric=args.eval, **kwargs))
        print(dataset.evaluate(outputs, **eval_kwargs))


# Run the test pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()


================================================
FILE: tools/test_dvps.py
================================================
import argparse
import os
import os.path as osp
import warnings
import numpy as np
import pickle
import json
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                         wrap_fp16_model)
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector

from external.test import encode_mask_results, tensor2imgs


def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3,
                    with_semantic_input=False,
                    rescale_depth=False,
                    with_seq=False,
                    ):
    """Run inference frame by frame and dump the predictions as PNG files.

    For every batch the model output is unpacked into five items (semantic
    map, tracking/instance map, depth map, two visualisation images) which
    are written below ``out_dir``:

    * ``panoptic/[seq]/{seq:06d}_{img:06d}_cat.png`` - semantic map (uint16)
    * ``panoptic/[seq]/{seq:06d}_{img:06d}_ins.png`` - tracking map (uint16)
    * ``depth/[seq]/{seq:06d}_{img:06d}.png`` - depth * 256 (uint16), only
      written when the model returned a depth map
    * ``vis/[seq]/{seq:06d}_{img:06d}.png`` - both visualisations stacked
      along axis 0

    Args:
        model: Callable model; it is switched to eval mode and invoked with
            ``return_loss=False, rescale=True, semantic_input=...`` plus the
            remaining batch entries.
        data_loader: Iterable of batch dicts that contain at least
            ``'seq_id'`` and ``'img_id'`` (and ``'img'`` when
            ``with_semantic_input`` is set). Only the first element of the
            id entries is read, i.e. a batch size of 1 is assumed.
        show: Unused, kept for interface compatibility.
        out_dir (str | None): Output root, ``'./out'`` when ``None``.
        show_score_thr: Unused, kept for interface compatibility.
        with_semantic_input (bool): Load a pre-computed semantic map from
            ``data/kitti-dvps/semantic/`` and pass it to the model.
        rescale_depth (bool): Resize the depth map with
            ``mmcv.imresize(..., (300, 100))`` before writing it.
        with_seq (bool): Put the files of each sequence into a sub-folder
            named after the sequence id.

    Returns:
        tuple[list, list]: ``(results, pano_seg_2ch_list)``; both lists are
        never filled here and are therefore always empty.
    """
    if out_dir is None:
        out_dir = './out'
    print("The output dir is {}".format(out_dir))
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))

    pano_seg_2ch_list = []

    for data in data_loader:
        seq_id = data['seq_id'][0].item()
        img_id = data['img_id'][0].item()
        # 'seq_id' is removed before the batch is forwarded to the model;
        # 'img_id' stays in the batch.
        data.pop('seq_id')
        if with_semantic_input:
            semantic_input = mmcv.imread(
                os.path.join('data/kitti-dvps/semantic/',
                             "{:06d}_{:06d}_semantic.png".format(seq_id, img_id)), flag='unchanged')
            semantic_input = torch.tensor(semantic_input, device=data['img'][0].device)
        else:
            semantic_input = None

        with torch.no_grad():
            segm_results = model(return_loss=False, rescale=True, semantic_input=semantic_input, **data)

        sseg_results, track_maps, depth_final, vis_sem, vis_tracker = segm_results

        # dump results
        seq_folder = str(seq_id) if with_seq else ""
        cat_path = os.path.join(out_dir, 'panoptic', seq_folder, '{:06d}_{:06d}_cat.png'.format(seq_id, img_id))
        ins_path = os.path.join(out_dir, 'panoptic', seq_folder, '{:06d}_{:06d}_ins.png'.format(seq_id, img_id))
        dep_path = os.path.join(out_dir, 'depth', seq_folder, '{:06d}_{:06d}.png'.format(seq_id, img_id))
        vis_path = os.path.join(out_dir, 'vis', seq_folder, '{:06d}_{:06d}.png'.format(seq_id, img_id))

        mmcv.imwrite(sseg_results.astype(np.uint16), cat_path)
        mmcv.imwrite(track_maps.astype(np.uint16), ins_path)
        if depth_final is not None:
            depth_out = depth_final
            if rescale_depth:
                # Resize only on demand; previously the resize ran for every
                # frame and its result was discarded unless this flag was set.
                depth_out = mmcv.imresize(depth_final, (300, 100), interpolation='bilinear')
            # Depth is stored as a 16-bit PNG scaled by 256.
            mmcv.imwrite((depth_out * 256.).astype(np.uint16), dep_path)
        mmcv.imwrite(np.concatenate((vis_sem, vis_tracker), axis=0), vis_path)

        # One frame is processed per iteration.
        prog_bar.update()

    return results, pano_seg_2ch_list


def parse_args():
    """Parse the command line of the Depth-VPS test script.

    Besides building the argument namespace this function

    * exports ``--local_rank`` as the ``LOCAL_RANK`` environment variable
      when that variable is not set yet, and
    * maps the deprecated ``--options`` onto ``--eval-options`` (emitting a
      warning).

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('--out', help='output result file in pickle format')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        # Trailing space added: adjacent literals used to render as
        # "increasethe".
        help='Whether to fuse conv and bn, this will slightly increase '
             'the inference speed')
    parser.add_argument(
        '--format-only',
        action='store_true',
        # Trailing space added: adjacent literals used to render as
        # "It isuseful".
        help='Format the output results without perform evaluation. It is '
             'useful when you want to format the result to a specific format and '
             'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
             ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
             'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
             'in xxx=yyy format will be merged into config file. If the value to '
             'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
             'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
             'Note that the quotation marks are necessary and that no white space '
             'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
             'format will be kwargs for dataset.evaluate() function (deprecate), '
             'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
             'format will be kwargs for dataset.evaluate() function')

    parser.add_argument(
        '--semantic',
        action='store_true',
        help="semantic input"
    )
    parser.add_argument(
        '--rescale-depth',
        action='store_true',
        help="resize the predicted depth map to (300, 100) before saving it"
    )
    parser.add_argument(
        '--with-seq',
        action='store_true',
        help="save the outputs of every sequence in a sub-folder named "
             "after the sequence id"
    )
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


def main():
    """Run Depth-VPS inference and write the per-frame predictions to disk.

    The command line is validated, the config is loaded and patched for
    testing, dataset / dataloader / model are built, the checkpoint is
    loaded and ``single_gpu_test`` is run on GPU 0.

    Note that ``--out`` and ``--eval`` are only validated here; this
    function neither writes a pickle file nor runs an evaluation.

    Raises:
        AssertionError: if none of ``--out``, ``--eval``, ``--format-only``,
            ``--show`` or ``--show-dir`` was given.
        ValueError: if ``--eval`` and ``--format-only`` are combined, or if
            ``--out`` does not end with ``.pkl`` / ``.pickle``.
    """
    args = parse_args()

    any_operation = (args.out or args.eval or args.format_only or args.show
                     or args.show_dir)
    assert any_operation, \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')
    print(args)

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # Import the extra modules listed in the config, if any.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    # Enable the cudnn auto-tuner when the config asks for it.
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # The weights come from the checkpoint, so the ``pretrained`` entries of
    # the model and of any RFP backbone in the neck are cleared.
    cfg.model.pretrained = None
    if cfg.model.get('neck'):
        neck = cfg.model.neck
        neck_cfgs = neck if isinstance(neck, list) else [neck]
        for neck_cfg in neck_cfgs:
            if neck_cfg.get('rfp_backbone') and \
                    neck_cfg.rfp_backbone.get('pretrained'):
                neck_cfg.rfp_backbone.pretrained = None

    # The test dataset config is either one dict or a list of dicts
    # (concatenated datasets).
    samples_per_gpu = 1
    test_cfg = cfg.data.test
    if isinstance(test_cfg, dict):
        test_cfg.test_mode = True
        samples_per_gpu = test_cfg.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            test_cfg.pipeline = replace_ImageToTensor(test_cfg.pipeline)
    elif isinstance(test_cfg, list):
        for ds_cfg in test_cfg:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in test_cfg])
        if samples_per_gpu > 1:
            for ds_cfg in test_cfg:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # init distributed env first, since logger depends on the dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    # Dataset and dataloader.
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # Model and checkpoint.
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    if cfg.get('fp16', None) is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # Checkpoints without class info in their meta fall back to the class
    # names of the dataset (backward compatibility).
    ckpt_meta = checkpoint.get('meta', {})
    if 'CLASSES' in ckpt_meta:
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    # Inference always runs on GPU 0, whatever launcher was selected.
    model = MMDataParallel(model, device_ids=[0])
    outputs, pred_pans_2ch = single_gpu_test(
        model,
        data_loader,
        args.show,
        args.show_dir,
        args.show_score_thr,
        with_semantic_input=args.semantic,
        rescale_depth=args.rescale_depth,
        with_seq=args.with_seq)
    print("==>Inference Depth VPS Done!")

    # Evaluation Part (placeholder: no evaluation is performed here)


# Run the Depth-VPS inference only when this file is executed as a script.
if __name__ == '__main__':
    main()


================================================
FILE: tools/test_step.py
================================================
import argparse
import os
import os.path as osp
import warnings
import numpy as np
import pickle
import json
import cv2
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                         wrap_fp16_model)
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector
try:
    from mmcv.cnn import get_model_complexity_info
except ImportError:
    raise ImportError('Please upgrade mmcv to >0.6.2')

def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3,
                    with_semantic_input=False,):
    """Run inference frame by frame and dump STEP-style predictions.

    For each batch the first two items of the 5-tuple returned by the model
    (semantic map and tracking map) are written below ``out_dir``:

    * ``panoptic/<seq>/{seq:06d}_{img:06d}_cat.png`` - semantic map (uint16)
    * ``panoptic/<seq>/{seq:06d}_{img:06d}_ins.png`` - tracking map (uint16)
    * ``final/{seq:04d}/{img:06d}.png`` - 3-channel uint8 image built from
      the semantic map, ``track // 256`` and ``track % 256``

    Args:
        model: Callable model; switched to eval mode and invoked with
            ``return_loss=False, rescale=True`` plus the batch entries.
        data_loader: Iterable of batch dicts containing ``'seq_id'`` and
            ``'img_id'``; only the first element of each is read.
        show: Unused, kept for interface compatibility.
        out_dir (str | None): Output root, ``'./out'`` when ``None``.
        show_score_thr: Unused, kept for interface compatibility.
        with_semantic_input: Unused in this function.

    Returns:
        tuple[list, list]: ``(results, pano_seg_2ch_list)``; both lists are
        never filled here and are therefore always empty.
    """
    if out_dir is None:
        out_dir = './out'
    print("The output dir is {}".format(out_dir))
    model.eval()
    results = []
    progress = mmcv.ProgressBar(len(data_loader.dataset))
    pano_seg_2ch_list = []

    for batch in data_loader:
        seq_id = batch['seq_id'][0].item()
        img_id = batch['img_id'][0].item()
        # 'seq_id' is stripped before the batch is forwarded to the model.
        batch.pop('seq_id')

        with torch.no_grad():
            prediction = model(return_loss=False, rescale=True, **batch)

        # Only the semantic and tracking maps are used; the other three
        # outputs are ignored.
        sem_map, track_map, _, _, _ = prediction

        # Output locations.
        panoptic_dir = os.path.join(out_dir, 'panoptic', str(seq_id))
        cat_path = os.path.join(panoptic_dir, '{:06d}_{:06d}_cat.png'.format(seq_id, img_id))
        ins_path = os.path.join(panoptic_dir, '{:06d}_{:06d}_ins.png'.format(seq_id, img_id))
        final_dir = os.path.join(out_dir, 'final', '{:04d}'.format(seq_id))
        final_path = os.path.join(final_dir, '{:06d}.png'.format(img_id))

        # Pack (semantic, id high byte, id low byte) into one 3-channel image
        # and swap the first and last channel in place before writing.
        channels = [
            sem_map.astype(np.uint8),
            (track_map // 256).astype(np.uint8),
            (track_map % 256).astype(np.uint8),
        ]
        final_map = np.stack(channels, axis=-1)
        cv2.cvtColor(final_map, cv2.COLOR_RGB2BGR, final_map)

        mmcv.imwrite(sem_map.astype(np.uint16), cat_path)
        mmcv.imwrite(track_map.astype(np.uint16), ins_path)
        # final map for evaluation
        mmcv.imwrite(final_map, final_path)

        # One frame is processed per iteration.
        progress.update()

    return results, pano_seg_2ch_list


def parse_args():
    """Parse the command line of the STEP test script.

    Besides building the argument namespace this function

    * exports ``--local_rank`` as the ``LOCAL_RANK`` environment variable
      when that variable is not set yet, and
    * maps the deprecated ``--options`` onto ``--eval-options`` (emitting a
      warning).

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('--out', help='output result file in pickle format')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        # Trailing space added: adjacent literals used to render as
        # "increasethe".
        help='Whether to fuse conv and bn, this will slightly increase '
             'the inference speed')
    parser.add_argument(
        '--format-only',
        action='store_true',
        # Trailing space added: adjacent literals used to render as
        # "It isuseful".
        help='Format the output results without perform evaluation. It is '
             'useful when you want to format the result to a specific format and '
             'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
             ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
             'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
             'in xxx=yyy format will be merged into config file. If the value to '
             'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
             'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
             'Note that the quotation marks are necessary and that no white space '
             'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
             'format will be kwargs for dataset.evaluate() function (deprecate), '
             'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
             'format will be kwargs for dataset.evaluate() function')

    parser.add_argument(
        '--semantic',
        action='store_true',
        help="semantic input"
    )
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--output_dir', default="./work_dirs/vps/vps_output",
                        help='output result file in pickle format to load')
    parser.add_argument('--n_video', type=int, default=50, help="number of video clips")
    parser.add_argument('--pan_im_json_file', type=str, default='data/cityscapes_vps/panoptic_im_val_city_vps.json')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


def main():
    """Run STEP inference and write the per-frame predictions to disk.

    The command line is validated, the config is loaded and patched for
    testing, dataset / dataloader / model are built, the checkpoint is
    loaded and ``single_gpu_test`` is run on GPU 0.

    Note that ``--out`` and ``--eval`` are only validated here; this
    function neither writes a pickle file nor runs an evaluation.

    Raises:
        AssertionError: if none of ``--out``, ``--eval``, ``--format-only``,
            ``--show`` or ``--show-dir`` was given.
        ValueError: if ``--eval`` and ``--format-only`` are combined, or if
            ``--out`` does not end with ``.pkl`` / ``.pickle``.
    """
    args = parse_args()

    any_operation = (args.out or args.eval or args.format_only or args.show
                     or args.show_dir)
    assert any_operation, \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')
    print(args)

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # Import the extra modules listed in the config, if any.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    # Enable the cudnn auto-tuner when the config asks for it.
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # The weights come from the checkpoint, so the ``pretrained`` entries of
    # the model and of any RFP backbone in the neck are cleared.
    cfg.model.pretrained = None
    if cfg.model.get('neck'):
        neck = cfg.model.neck
        neck_cfgs = neck if isinstance(neck, list) else [neck]
        for neck_cfg in neck_cfgs:
            if neck_cfg.get('rfp_backbone') and \
                    neck_cfg.rfp_backbone.get('pretrained'):
                neck_cfg.rfp_backbone.pretrained = None

    # The test dataset config is either one dict or a list of dicts
    # (concatenated datasets).
    samples_per_gpu = 1
    test_cfg = cfg.data.test
    if isinstance(test_cfg, dict):
        test_cfg.test_mode = True
        samples_per_gpu = test_cfg.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            test_cfg.pipeline = replace_ImageToTensor(test_cfg.pipeline)
    elif isinstance(test_cfg, list):
        for ds_cfg in test_cfg:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in test_cfg])
        if samples_per_gpu > 1:
            for ds_cfg in test_cfg:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # init distributed env first, since logger depends on the dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    # Dataset and dataloader.
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # Model and checkpoint.
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    if cfg.get('fp16', None) is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # Checkpoints without class info in their meta fall back to the class
    # names of the dataset (backward compatibility).
    ckpt_meta = checkpoint.get('meta', {})
    if 'CLASSES' in ckpt_meta:
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    # Inference always runs on GPU 0, whatever launcher was selected.
    model = MMDataParallel(model, device_ids=[0])
    outputs, pred_pans_2ch = single_gpu_test(
        model,
        data_loader,
        args.show,
        args.show_dir,
        args.show_score_thr,
        with_semantic_input=args.semantic)
    print("==>Inference STEP Done!")

    # Evaluation Part (placeholder: no evaluation is performed here)


# Run the STEP inference only when this file is executed as a script.
if __name__ == '__main__':
    main()


================================================
FILE: tools/test_vps.py
================================================
import argparse
import os
import os.path as osp
import warnings
import numpy as np
import pickle
import json
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                         wrap_fp16_model)
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector

from external.test import encode_mask_results, tensor2imgs


def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3,
                    with_semantic_input=False,
                    rescale_depth=False,
                    with_seq=False,
                    ):
    """Run inference frame by frame and dump the panoptic predictions.

    For each batch the first two items of the 5-tuple returned by the model
    (semantic map and tracking map) are written below ``out_dir``:

    * ``panoptic/[seq]/{seq:06d}_{img:06d}_cat.png`` - semantic map (uint16)
    * ``panoptic/[seq]/{seq:06d}_{img:06d}_ins.png`` - tracking map (uint16)

    Args:
        model: Callable model; switched to eval mode and invoked with
            ``return_loss=False, rescale=True`` plus the batch entries.
        data_loader: Iterable of batch dicts containing ``'seq_id'`` and
            ``'img_id'``; only the first element of each is read.
        show: Unused, kept for interface compatibility.
        out_dir (str | None): Output root, ``'./out'`` when ``None``.
        show_score_thr: Unused, kept for interface compatibility.
        with_semantic_input: Unused in this function.
        rescale_depth: Unused in this function.
        with_seq (bool): Put the files of each sequence into a sub-folder
            named after the sequence id.

    Returns:
        tuple[list, list]: ``(results, pano_seg_2ch_list)``; both lists are
        never filled here and are therefore always empty.
    """
    if out_dir is None:
        out_dir = './out'
    print("The output dir is {}".format(out_dir))
    model.eval()
    results = []
    progress = mmcv.ProgressBar(len(data_loader.dataset))
    pano_seg_2ch_list = []

    for batch in data_loader:
        seq_id = batch['seq_id'][0].item()
        img_id = batch['img_id'][0].item()
        # 'seq_id' is stripped before the batch is forwarded to the model.
        batch.pop('seq_id')
        with torch.no_grad():
            prediction = model(return_loss=False, rescale=True, **batch)

        # Only the semantic and tracking maps are used; the other three
        # outputs are ignored.
        sem_map, track_map, _, _, _ = prediction

        # Output locations; an empty folder name keeps all sequences in the
        # same directory.
        seq_folder = str(seq_id) if with_seq else ""
        panoptic_dir = os.path.join(out_dir, 'panoptic', seq_folder)
        cat_path = os.path.join(panoptic_dir, '{:06d}_{:06d}_cat.png'.format(seq_id, img_id))
        ins_path = os.path.join(panoptic_dir, '{:06d}_{:06d}_ins.png'.format(seq_id, img_id))

        mmcv.imwrite(sem_map.astype(np.uint16), cat_path)
        mmcv.imwrite(track_map.astype(np.uint16), ins_path)

        # One frame is processed per iteration.
        progress.update()

    return results, pano_seg_2ch_list


def parse_args():
    """Parse the command line of the VPS test script.

    Besides building the argument namespace this function

    * exports ``--local_rank`` as the ``LOCAL_RANK`` environment variable
      when that variable is not set yet, and
    * maps the deprecated ``--options`` onto ``--eval-options`` (emitting a
      warning).

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('--out', help='output result file in pickle format')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        # Trailing space added: adjacent literals used to render as
        # "increasethe".
        help='Whether to fuse conv and bn, this will slightly increase '
             'the inference speed')
    parser.add_argument(
        '--format-only',
        action='store_true',
        # Trailing space added: adjacent literals used to render as
        # "It isuseful".
        help='Format the output results without perform evaluation. It is '
             'useful when you want to format the result to a specific format and '
             'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
             ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
             'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
             'in xxx=yyy format will be merged into config file. If the value to '
             'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
             'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
             'Note that the quotation marks are necessary and that no white space '
             'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
             'format will be kwargs for dataset.evaluate() function (deprecate), '
             'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
             'format will be kwargs for dataset.evaluate() function')

    parser.add_argument(
        '--semantic',
        action='store_true',
        help="semantic input"
    )
    # NOTE(review): the two flags below have no help text; presumably they
    # are forwarded to single_gpu_test(rescale_depth=..., with_seq=...) -
    # confirm against main() before documenting them.
    parser.add_argument(
        '--rescale-depth',
        action='store_true',
        help=""
    )
    parser.add_argument(
        '--with-seq',
        action='store_true',
        help=""
    )
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


def main():
    """Run single-GPU inference on the test split described by the config.

    Steps: validate the CLI arguments, load and patch the config for
    testing, build the dataloader and the detector, load the checkpoint
    and call ``single_gpu_test``.

    NOTE(review): the values returned by ``single_gpu_test`` are not used
    afterwards and nothing is written to ``args.out``; the evaluation part
    is left as a stub in this script.
    """
    args = parse_args()

    requested_ops = (args.out, args.eval, args.format_only, args.show,
                     args.show_dir)
    assert any(requested_ops), \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')
    print(args)

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # Import the extra modules listed in the config, if any.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    # Optionally enable cudnn benchmark mode.
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # Weights come from the checkpoint, so drop every `pretrained` entry
    # (model level and any RFP backbone inside the neck(s)).
    cfg.model.pretrained = None
    neck = cfg.model.get('neck')
    if neck:
        neck_cfgs = neck if isinstance(neck, list) else [neck]
        for neck_cfg in neck_cfgs:
            rfp_backbone = neck_cfg.get('rfp_backbone')
            if rfp_backbone and rfp_backbone.get('pretrained'):
                rfp_backbone.pretrained = None

    # The test dataset may be a single config or a list of concatenated ones.
    test_data_cfg = cfg.data.test
    samples_per_gpu = 1
    if isinstance(test_data_cfg, dict):
        test_data_cfg.test_mode = True
        samples_per_gpu = test_data_cfg.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            test_data_cfg.pipeline = replace_ImageToTensor(
                test_data_cfg.pipeline)
    elif isinstance(test_data_cfg, list):
        for ds_cfg in test_data_cfg:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in test_data_cfg)
        if samples_per_gpu > 1:
            for ds_cfg in test_data_cfg:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # Init the distributed env first, since the logger depends on dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    # Build the dataloader.
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # Build the model and load the checkpoint.
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    if cfg.get('fp16', None) is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # Old checkpoints did not store class info; fall back to the dataset's
    # classes for backward compatibility.
    checkpoint_meta = checkpoint.get('meta', {})
    if 'CLASSES' in checkpoint_meta:
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    # The model is always wrapped for a single device (id 0), whatever the
    # launcher is; `distributed` only affects the dataloader above.
    model = MMDataParallel(model, device_ids=[0])
    # Run inference over the sequence.
    results, panoptic_2ch = single_gpu_test(
        model, data_loader, args.show, args.show_dir, args.show_score_thr,
        with_semantic_input=args.semantic,
        rescale_depth=args.rescale_depth,
        with_seq=args.with_seq)
    print("==>Inference Depth VPS Done!")

    # Evaluation Part (not implemented in this script).


# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()


================================================
FILE: tools/train.py
================================================
import argparse
import copy
import os
import os.path as osp
import time
import warnings

import mmcv
import torch
import torch.distributed as dist
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist
from mmcv.utils import get_git_hash
from mmdet import __version__
from mmdet.apis import set_random_seed
from mmdet.datasets import build_dataset
from mmdet.models import build_detector
from mmdet.utils import collect_env, get_root_logger

from external.train import train_detector


def parse_args():
    """Parse the command-line arguments of the training script.

    Besides parsing, this function
      * exports ``LOCAL_RANK`` to the environment (from ``--local_rank``)
        when the variable is not already set, and
      * maps the deprecated ``--options`` onto ``--cfg-options``.

    Returns:
        argparse.Namespace: the parsed arguments.

    Raises:
        ValueError: if both ``--options`` and ``--cfg-options`` are given.
    """
    parser = argparse.ArgumentParser(description='Train a detector')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    # The original help text was a copy of --resume-from; this flag only
    # sets cfg.load_from.
    parser.add_argument(
        '--load-from', help='the checkpoint file to load weights from')
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')
    # --gpus and --gpu-ids are two ways to state the same thing, so they
    # are mutually exclusive.
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument(
        '--gpus',
        type=int,
        help='number of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed training)')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--detect-anomaly',
        action='store_true',
        help='detect anomaly')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file (deprecate), '
        'change to --cfg-options instead.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    # Do not override a LOCAL_RANK that the launcher already exported.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.cfg_options:
        raise ValueError(
            '--options and --cfg-options cannot be both '
            'specified, --options is deprecated in favor of --cfg-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --cfg-options')
        args.cfg_options = args.options

    return args


def main():
    """Train a detector as described by the config file and CLI arguments.

    Loads the config, applies the CLI overrides, sets up the (optionally
    distributed) environment, the work directory and the logger, builds
    the model and dataset(s) and hands everything to ``train_detector``.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # Import the extra modules listed in the config, if any.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    # Optionally enable cudnn benchmark mode.
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    config_basename = osp.basename(args.config)

    # work_dir priority: CLI > value in the config file > config file name.
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(config_basename)[0])
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.load_from is not None:
        cfg.load_from = args.load_from
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    elif args.gpus is None:
        cfg.gpu_ids = range(1)
    else:
        cfg.gpu_ids = range(args.gpus)

    # Init the distributed env first, since the logger depends on dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)
        dist.barrier()
        # In distributed mode gpu_ids is derived from the world size.
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    # Create the work dir and keep a copy of the config in it.
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    cfg.dump(osp.join(cfg.work_dir, config_basename))
    # Init the logger before any other step.
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    logger = get_root_logger(
        log_file=osp.join(cfg.work_dir, f'{timestamp}.log'),
        log_level=cfg.log_level)

    # Added in PansegMM
    # Log the git hash info to track the experiments.
    git_hash = get_git_hash()
    logger.info('The repo is : https://github.com/lxtGH/PanopticSegMM/tree/{}/'.format(git_hash))
    logger.info('The config is : https://github.com/lxtGH/PanopticSegMM/tree/{}/{}'.format(git_hash, args.config))

    # `meta` records environment info, config and seed; it is passed on to
    # train_detector.
    meta = dict()
    env_info = '\n'.join(
        f'{key}: {value}' for key, value in collect_env().items())
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    config_text = cfg.pretty_text
    meta['env_info'] = env_info
    meta['config'] = config_text
    # Log some basic info.
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{config_text}')

    # Seed the RNGs. Without --seed the RNGs are still seeded with 0.
    # NOTE(review): in that case cfg.seed / meta['seed'] record None even
    # though 0 is used -- confirm this is intended.
    if args.seed is not None:
        logger.info(f'Set random seed to {args.seed}, '
                    f'deterministic: {args.deterministic}')
    effective_seed = 0 if args.seed is None else args.seed
    set_random_seed(effective_seed, deterministic=args.deterministic)
    cfg.seed = args.seed
    meta['seed'] = args.seed
    meta['exp_name'] = config_basename

    model = build_detector(
        cfg.model,
        train_cfg=cfg.get('train_cfg'),
        test_cfg=cfg.get('test_cfg'))
    model.init_weights()

    logger.info(f'Model:\n{model}')
    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        # A two-stage workflow also runs the val set, using the train
        # pipeline.
        val_cfg = copy.deepcopy(cfg.data.val)
        val_cfg.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_cfg))
    train_classes = datasets[0].CLASSES
    if cfg.checkpoint_config is not None:
        # Save the mmdet version and class names in checkpoints as meta data.
        cfg.checkpoint_config.meta = dict(
            mmdet_version=__version__ + get_git_hash()[:7],
            CLASSES=train_classes)
    # Add an attribute for visualization convenience.
    model.CLASSES = train_classes

    def run_training():
        train_detector(
            model,
            datasets,
            cfg,
            distributed=distributed,
            validate=(not args.no_validate),
            timestamp=timestamp,
            meta=meta)

    if args.detect_anomaly:
        with torch.autograd.detect_anomaly():
            run_training()
    else:
        run_training()


# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()


================================================
FILE: tools/utils/DSTQ.py
================================================
from typing import Sequence, Tuple
import collections

import numpy as np

from .STQ import STQuality


class DSTQuality(STQuality):
    """Depth-aware Segmentation and Tracking Quality (DSTQ) metric.

    Extends ``STQuality`` with per-sequence depth statistics: for every
    configured threshold it counts the pixels whose depth ratio
    ``max(pred / gt, gt / pred)`` is within the threshold, and combines the
    resulting depth quality (DQ) with STQ.
    """

    def __init__(
            self,
            num_classes: int,
            things_list: Sequence[int],
            ignore_label: int,
            label_bit_shift: int,
            offset: int,
            depth_threshold: Tuple[float] = (1.25, 1.1),
            name: str = 'dstq'
    ):
        """Initializes the DSTQ metric.

        Args:
          num_classes, things_list, ignore_label, label_bit_shift, offset:
            Forwarded unchanged to ``STQuality.__init__``.
          depth_threshold: Non-empty tuple or list of inlier thresholds for the
            depth ratio.
          name: Not used by this class.

        Raises:
          TypeError: If depth_threshold is neither a tuple nor a list.
          ValueError: If depth_threshold is empty.
        """
        super().__init__(
            num_classes=num_classes,
            things_list=things_list,
            ignore_label=ignore_label,
            label_bit_shift=label_bit_shift,
            offset=offset
        )
        if not isinstance(depth_threshold, (tuple, list)):
            raise TypeError('The type of depth_threshold must be tuple or list.')
        if not depth_threshold:
            raise ValueError('depth_threshold must be non-empty.')
        self._depth_threshold = tuple(depth_threshold)
        # sequence_id -> number of pixels with a valid depth label.
        self._depth_total_counts = collections.OrderedDict()
        # One `sequence_id -> inlier count` mapping per threshold, in the same
        # order as self._depth_threshold.
        self._depth_inlier_counts = [
            collections.OrderedDict() for _ in self._depth_threshold
        ]

    def update_state(
            self,
            y_true: np.ndarray,
            y_pred: np.ndarray,
            d_true: np.ndarray,
            d_pred: np.ndarray,
            sequence_id: int = 0
    ):
        """Accumulates the depth-aware segmentation and tracking statistics.

        Args:
          y_true: The ground-truth panoptic label map of a video frame.
          y_pred: The predicted panoptic label map of a video frame.
          d_true: The ground-truth depth map of this video frame.
          d_pred: The predicted depth map of this video frame.
          sequence_id: The optional ID of the sequence the frame belongs to.
            When omitted, all frames are treated as one sequence (default: 0).
        """
        super().update_state(y_true, y_pred, sequence_id)
        # Positive ground-truth values mark valid depth labels; all of them
        # count towards the total.
        gt_is_valid = d_true > 0
        num_gt_valid = np.sum(gt_is_valid.astype(int))
        # The ratio is only evaluated where the prediction is positive too, so
        # pixels with a non-positive prediction can never become inliers.
        # TODO : very wrong implementation because it is hackable
        both_valid = np.logical_and(gt_is_valid, d_pred > 0)
        gt_depth = d_true[both_valid]
        pred_depth = d_pred[both_valid]
        depth_ratio = np.maximum(pred_depth / gt_depth, gt_depth / pred_depth)
        # Count the inliers for every threshold.
        for per_seq_inliers, threshold in zip(self._depth_inlier_counts,
                                              self._depth_threshold):
            hits = np.sum((depth_ratio <= threshold).astype(int))
            per_seq_inliers[sequence_id] = (
                    per_seq_inliers.get(sequence_id, 0) + int(hits))
        # Update the number of valid depth labels of this sequence.
        seen_so_far = self._depth_total_counts.get(sequence_id, 0)
        self._depth_total_counts[sequence_id] = seen_so_far + int(num_gt_valid)

    def result(self):
        """Computes the depth-aware segmentation and tracking quality.

        Returns:
          A dictionary containing every entry of ``STQuality.result()``
          ('STQ', 'AQ', 'IoU', 'STQ_per_seq', 'AQ_per_seq', 'IoU_per_seq',
          'ID_per_seq', 'Length_per_seq') plus:
            - 'DSTQ': The total DSTQ score.
            - 'DSTQ@thres': The total DSTQ score for threshold thres.
            - 'DSTQ_per_seq@thres': A list of DSTQ scores per sequence for thres.
            - 'DQ': The total DQ score (geometric mean over the thresholds).
            - 'DQ@thres': The total DQ score for threshold thres.
            - 'DQ_per_seq@thres': A list of DQ scores per sequence for thres.
        """
        # Gather the STQ results first.
        stq_results = super().result()
        stq_total = stq_results['STQ']
        one_third = 1 / 3

        # Depth quality per threshold, overall and per sequence. Sequences are
        # enumerated through _ground_truth to match the order used for STQ.
        dq_per_seq_at_threshold = {}
        dq_at_threshold = {}
        for inliers_by_seq, threshold in zip(self._depth_inlier_counts,
                                             self._depth_threshold):
            per_seq = [0] * len(self._ground_truth)
            total_count = 0
            inlier_count = 0
            for index, sequence_id in enumerate(self._ground_truth):
                sequence_inlier = inliers_by_seq[sequence_id]
                sequence_total = self._depth_total_counts[sequence_id]
                if sequence_total > 0:
                    per_seq[index] = sequence_inlier / sequence_total
                total_count += sequence_total
                inlier_count += sequence_inlier
            dq_per_seq_at_threshold[threshold] = per_seq
            if total_count == 0:
                dq_at_threshold[threshold] = 0
            else:
                dq_at_threshold[threshold] = inlier_count / total_count

        # DQ is the geometric mean of the per-threshold DQ values.
        dq = 1
        for threshold in self._depth_threshold:
            dq *= dq_at_threshold[threshold]
        dq = dq ** (1 / len(self._depth_threshold))

        dq_results = {'DQ': dq}
        for threshold in self._depth_threshold:
            dq_results['DQ@{}'.format(threshold)] = dq_at_threshold[threshold]
            dq_results['DQ_per_seq@{}'.format(
                threshold)] = dq_per_seq_at_threshold[threshold]

        # DSTQ = (STQ^2 * DQ)^(1/3).
        dstq_results = {'DSTQ': (stq_total ** 2 * dq) ** one_third}
        for threshold in self._depth_threshold:
            dstq_results['DSTQ@{}'.format(threshold)] = (
                    stq_total ** 2 * dq_at_threshold[threshold]) ** one_third
            seq_pairs = zip(stq_results['STQ_per_seq'],
                            dq_per_seq_at_threshold[threshold])
            dstq_results['DSTQ_per_seq@{}'.format(threshold)] = [
                (stq_result ** 2 * dq_result) ** one_third
                for stq_result, dq_result in seq_pairs
            ]

        # Merge everything into one dictionary.
        dstq_results.update(stq_results)
        dstq_results.update(dq_results)
        return dstq_results

    def reset_states(self):
        """Resets all states that accumulated data."""
        super().reset_states()
        self._depth_total_counts = collections.OrderedDict()
        self._depth_inlier_counts = [
            collections.OrderedDict() for _ in self._depth_threshold
        ]


================================================
FILE: tools/utils/STQ.py
================================================
# This file is copied from deeplab2, please refer to https://github.com/google-research/deeplab2/
# for details. Please cite their papers if this file is helpful.

# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Numpy Implementation of the Segmentation and Tracking Quality (STQ) metric.
This implementation is designed to work stand-alone. Please feel free to copy
this file and the corresponding unit-test to your project.
"""

import collections
from typing import Mapping, MutableMapping, Sequence, Text, Any
import numpy as np

# Small positive floor applied to denominators to avoid division by zero.
_EPSILON = 1e-15


def _update_dict_stats(stat_dict: MutableMapping[int, np.ndarray],
                       id_array: np.ndarray):
    """Updates a given dict with corresponding counts."""
    ids, counts = np.unique(id_array, return_counts=True)
    for idx, count in zip(ids, counts):
        if idx in stat_dict:
            stat_dict[idx] += count
        else:
            stat_dict[idx] = count


class STQuality(object):
    """Metric class for the Segmentation and Tracking Quality (STQ).
    Please see the following paper for more details about the metric:
    "STEP: Segmenting and Tracking Every Pixel", Weber et al., arXiv:2102.11859,
    2021.
    The metric computes the geometric mean of two terms.
    - Association Quality: This term measures the quality of the video_knet_vis ID
        assignment for `thing` classes. It is formulated as a weighted IoU
        measure.
    - Segmentation Quality: This term measures the semantic segmentation quality.
        The standard class IoU measure is used for this.
    Example usage:
    stq_obj = segmentation_tracking_quality.STQuality(num_classes, things_list,
      ignore_label, label_bit_shift, offset)
    stq_obj.update_state(y_true_1, y_pred_1)
    stq_obj.update_state(y_true_2, y_pred_2)
    ...
    result = stq_obj.result()
    """

    def __init__(self, num_classes: int, things_list: Sequence[int],
                 ignore_label: int, label_bit_shift: int, offset: int):
        """Initialization of the STQ metric.
        Args:
          num_classes: Number of classes in the dataset as an integer.
          things_list: A sequence of class ids that belong to `things`.
          ignore_label: The class id to be ignored in evaluation as an integer or
            integer tensor.
          label_bit_shift: The number of bits the class label is shifted as an
            integer -> (class_label << bits) + trackingID
          offset: The maximum number of unique labels as an integer or integer
            tensor.
        """
        self._num_classes = num_classes
        self._ignore_label = ignore_label
        self._things_list = things_list
        self._label_bit_shift = label_bit_shift
        self._bit_mask = (2 ** label_bit_shift) - 1

        if ignore_label >= num_classes:
            self._confusion_matrix_size = num_classes + 1
            self._include_indices = np.arange(self._num_classes)
        else:
            self._confusion_matrix_size = num_classes
            self._include_indices = np.array(
                [i for i in range(num_classes) if i != self._ignore_label])

        self._iou_confusion_matrix_per_sequence = collections.OrderedDict()
        self._predictions = collections.OrderedDict()
        self._ground_truth = collections.OrderedDict()
        self._intersections = collections.OrderedDict()
        self._sequence_length = collections.OrderedDict()
        self._offset = offset
        lower_bound = num_classes << self._label_bit_shift
        if offset < lower_bound:
            raise ValueError('The provided offset %d is too small. No guarantess '
                             'about the correctness of the results can be made. '
                             'Please choose an offset that is higher than num_classes'
                             ' * max_instances_per_category = %d' % lower_bound)

    def get_semantic(self, y: np.ndarray) -> np.ndarray:
        """Returns the semantic class from a panoptic label map."""
        return y >> self._label_bit_shift

    def update_state(self, y_true: np.ndarray, y_pred: np.ndarray, sequence_id=0):
        """Accumulates the segmentation and tracking quality statistics.
        IMPORTANT: When encoding the parameters y_true and y_pred, please be aware
        that the `+` operator binds higher than the label shift `<<` operator.
        Args:
          y_true: The ground-truth panoptic label map for a particular video frame
            (defined as (semantic_map << label_bit_shift) + instance_map).
          y_pred: The predicted panoptic label map for a particular video frame
            (defined as (semantic_map << label_bit_shift) + instance_map).
          sequence_id: The optional ID of the sequence the frames belong to. When no
            sequence is given, all frames are considered to belong to the same
            sequence (default: 0).
        """
        y_true = y_true.astype(np.int64)
        y_pred = y_pred.astype(np.int64)

        semantic_label = self.get_semantic(y_true)
        semantic_prediction = self.get_semantic(y_pred)
        # Check if the ignore value is outside the range [0, num_classes]. If yes,
        # map `_ignore_label` to `_num_classes`, so it can be used to create the
        # confusion matrix.
        if self._ignore_label > self._num_classes:
            semantic_label = np.where(semantic_label != self._ignore_label,
                                      semantic_label, self._num_classes)
            semantic_prediction = np.where(semantic_prediction != self._ignore_label,
                                           semantic_prediction, self._num_classes)
        if sequence_id in self._iou_confusion_matrix_per_sequence:
            idxs = (np.reshape(semantic_label, [-1]) <<
                    self._label_bit_shift) + np.reshape(semantic_prediction, [-1])
            unique_idxs, counts = np.unique(idxs, return_counts=True)
            self._iou_confusion_matrix_per_sequence[sequence_id][
                unique_idxs >> self._label_bit_shift,
                unique_idxs & self._bit_mask] += counts
            self._sequence_length[sequence_id] += 1
        else:
            self._iou_confusion_matrix_per_sequence[sequence_id] = np.zeros(
                (self._confusion_matrix_size, self._confusion_matrix_size),
                dtype=np.int64)
            idxs = np.stack([
                np.reshape(semantic_label, [-1]),
                np.reshape(semantic_prediction, [-1])
            ],
                axis=0)
            np.add.at(self._iou_confusion_matrix_per_sequence[sequence_id],
                      tuple(idxs), 1)

            self._predictions[sequence_id] = {}
            self._ground_truth[sequence_id] = {}
            self._intersections[sequence_id] = {}
            self._sequence_length[sequence_id] = 1

        instance_label = y_true & self._bit_mask  # 0xFFFF == 2 ^ 16 - 1

        label_mask = np.zeros_like(semantic_label, dtype=np.bool)
        prediction_mask = np.zeros_like(semantic_prediction, dtype=np.bool)
        for things_class_id in self._things_list:
            label_mask = np.logical_or(label_mask, semantic_label == things_class_id)
            prediction_mask = np.logical_or(prediction_mask,
                                            semantic_prediction == things_class_id)

        # Select the `crowd` region of the current class. This region is encoded
        # instance id `0`.
        is_crowd = np.logical_and(instance_label == 0, label_mask)
        # Select the non-crowd region of the corresponding class as the `crowd`
        # region is ignored for the tracking term.
        label_mask = np.logical_and(label_mask, np.logical_not(is_crowd))
        # Do not punish id assignment for regions that are annotated as `crowd` in
        # the ground-truth.
        prediction_mask = np.logical_and(prediction_mask, np.logical_not(is_crowd))

        seq_preds = self._predictions[sequence_id]
        seq_gts = self._ground_truth[sequence_id]
        seq_intersects = self._intersections[sequence_id]

        # Compute and update areas of ground-truth, predictions and intersections.
        _update_dict_stats(seq_preds, y_pred[prediction_mask])
        _update_dict_stats(seq_gts, y_true[label_mask])

        non_crowd_intersection = np.logical_and(label_mask, prediction_mask)
        intersection_ids = (
                y_true[non_crowd_intersection] * self._offset +
                y_pred[non_crowd_intersection])
        _update_dict_stats(seq_intersects, intersection_ids)

    def result(self) -> Mapping[Text, Any]:
        """Computes the segmentation and tracking quality.
        Returns:
          A dictionary containing:
            - 'STQ': The total STQ score.
            - 'AQ': The total association quality (AQ) score.
            - 'IoU': The total mean IoU.
            - 'STQ_per_seq': A list of the STQ score per sequence.
            - 'AQ_per_seq': A list of the AQ score per sequence.
            - 'IoU_per_seq': A list of mean IoU per sequence.
            - 'Id_per_seq': A list of string-type sequence Ids to map list index to
                sequence.
            - 'Length_per_seq': A list of the length of each sequence.
        """
        # Compute association quality (AQ)
        num_tubes_per_seq = [0] * len(self._ground_truth)
        aq_per_seq = [0] * len(self._ground_truth)
        iou_per_seq = [0] * len(self._ground_truth)
        id_per_seq = [''] * len(self._ground_truth)

        for index, sequence_id in enumerate(self._ground_truth):
            outer_sum = 0.0
            predictions = self._predictions[sequence_id]
            ground_truth = self._ground_truth[sequence_id]
            intersections = self._intersections[sequence_id]
            num_tubes_per_seq[index] = len(ground_truth)
            id_per_seq[index] = sequence_id

            for gt_id, gt_size in ground_truth.items():
                inner_sum = 0.0
                for pr_id, pr_size in predictions.items():
                    tpa_key = self._offset * gt_id + pr_id
                    if tpa_key in intersections:
                        tpa = intersections[tpa_key]
                        fpa = pr_size - tpa
                        fna = gt_size - tpa
                        inner_sum += tpa * (tpa / (tpa + fpa + fna))

                outer_sum += 1.0 / gt_size * inner_sum
            aq_per_seq[index] = outer_sum

        aq_mean = np.sum(aq_per_seq) / np.maximum(
            np.sum(num_tubes_per_seq), _EPSILON)
        aq_per_seq = aq_per_seq / np.maximum(num_tubes_per_seq, _EPSILON)

        # Compute IoU scores.
        # The rows correspond to ground-truth and the columns to predictions.
        # Remove fp from confusion matrix for the void/ignore class.
        total_confusion = np.zeros(
            (self._confusion_matrix_size, self._confusion_matrix_size),
            dtype=np.int64)
        for index, confusion in enumerate(
                self._iou_confusion_matrix_per_sequence.values()):
            removal_matrix = np.zeros_like(confusion)
            removal_matrix[self._include_indices, :] = 1.0
            confusion *= removal_matrix
            total_confusion += confusion

            # `intersections` corresponds to true positives.
            intersections = confusion.diagonal()
            fps = confusion.sum(axis=0) - intersections
            fns = confusion.sum(axis=1) - intersections
            unions = intersections + fps + fns

            num_classes = np.count_nonzero(unions)
            ious = (
                    intersections.astype(np.double) /
                    np.maximum(unions, 1e-15).astype(np.double))
            iou_per_seq[index] = np.sum(ious) / num_classes

        # `intersections` corresponds to true positives.
        intersections = total_confusion.diagonal()
        fps = total_confusion.sum(axis=0) - intersections
        fns = total_confusion.sum(axis=1) - intersections
        unions = intersections + fps + fns

        num_classes = np.count_nonzero(unions)
        ious = (
                intersections.astype(np.double) /
                np.maximum(unions, _EPSILON).astype(np.double))
        iou_mean = np.sum(ious) / num_classes

        st_quality = np.sqrt(aq_mean * iou_mean)
        st_quality_per_seq = np.sqrt(aq_per_seq * iou_per_seq)
        return {
            'STQ': st_quality,
            'AQ': aq_mean,
            'IoU': float(iou_mean),
            'STQ_per_seq': st_quality_per_seq,
            'AQ_per_seq': aq_per_seq,
            'IoU_per_seq': iou_per_seq,
            'ID_per_seq': id_per_seq,
            'Length_per_seq': list(self._sequence_length.values()),
        }

    def reset_states(self):
        """Resets all states that accumulated data."""
        self._iou_confusion_matrix_per_sequence = collections.OrderedDict()
        self._predictions = collections.OrderedDict()
        self._ground_truth = collections.OrderedDict()
        self._intersections = collections.OrderedDict()
        self._sequence_length = collections.OrderedDict()


================================================
FILE: tools/utils/cityscapesvps_eval.py
================================================
from __future__ import print_function

import argparse
import os
import os.path as osp
import torch.multiprocessing as multiprocessing
import numpy as np
import json
from PIL import Image
import pickle
from torch.utils.data import Dataset


class CityscapesVps(Dataset):
    """Converts two-channel video panoptic predictions into color-coded
    panoptic images plus a `segments_info` JSON, and saves them to disk.

    A two-channel prediction stores the semantic label in channel 0 and the
    instance id in channel 1.
    """

    def __init__(self):

        super(CityscapesVps, self).__init__()

        # Number of consecutive frames forming one clip. All frames of a clip
        # are converted by the same worker so that an instance keeps one color
        # (and therefore one segment id) across the clip.
        self.nframes_per_video = 6
        # When more predictions than output names are given, predictions are
        # subsampled with stride `lambda_`, starting at index
        # `labeled_fid // lambda_` (see `inference_panoptic_video`).
        self.lambda_ = 5
        self.labeled_fid = 20

    def _save_image_single_core(self, proc_id, images_set, names_set, colors=None):
        """Saves each image of `images_set` to the matching path in `names_set`.

        Args:
            proc_id: Index of the worker running this call (unused).
            images_set: Iterable of numpy arrays to save.
            names_set: Iterable of destination file paths, aligned with
                `images_set`. Missing parent directories are created.
            colors: Optional flat palette. When given, every image is cast to
                uint8, converted to a paletted ('P') image and the palette is
                attached; otherwise the array is saved as is.
        """

        def colorize(gray, palette):
            # gray: numpy array of the label and 1*3N size list palette
            color = Image.fromarray(gray.astype(np.uint8)).convert('P')
            color.putpalette(palette)
            return color

        for image, name in zip(images_set, names_set):
            if colors is not None:
                image = colorize(image, colors)
            else:
                image = Image.fromarray(image)
            os.makedirs(os.path.dirname(name), exist_ok=True)
            image.save(name)

    def inference_panoptic_video(self, pred_pans_2ch, output_dir,
                                 categories,
                                 names,
                                 n_video=0):
        """Converts two-channel predictions and writes all outputs to disk.

        Written outputs: `<output_dir>/pan_2ch/*` (the raw two-channel maps),
        `<output_dir>/pan_pred/*` (the color-coded panoptic maps) and
        `<output_dir>/pred.json` (the per-frame `segments_info`).

        Args:
            pred_pans_2ch: Sequence of per-frame two-channel predictions.
                If its length differs from `len(names)` it is subsampled to
                every `lambda_`-th frame starting at `labeled_fid // lambda_`.
            output_dir: Directory receiving the outputs (created if missing).
            categories: Iterable of category dicts carrying an 'id' key; it is
                handed to `panopticapi.utils.IdGenerator`.
            names: Output file names, one per saved frame.
            n_video: Unused; kept for backward compatibility. The number of
                clips is derived from the number of frames.

        Returns:
            tuple: `(pred_pans, pred_json)` where `pred_pans` is the list of
            color-coded panoptic maps and `pred_json` is
            `{'annotations': [...]}` with one entry per frame.
        """
        from panopticapi.utils import IdGenerator

        # Sample only frames with GT annotations.
        if len(pred_pans_2ch) != len(names):
            pred_pans_2ch = pred_pans_2ch[(self.labeled_fid // self.lambda_)::self.lambda_]
        categories = {el['id']: el for el in categories}
        color_generator = IdGenerator(categories)
        # Use half of the cores, but never fewer than one worker: a pool of
        # zero processes (single-core host) cannot be created.
        cpu_num = max(1, multiprocessing.cpu_count() // 2)

        def get_pred_large(pan_2ch_all, nframes_per_video):
            # Frames are processed in chunks of at most `cpu_num` clips.
            max_nframes = cpu_num * nframes_per_video
            nsplits = (len(pan_2ch_all) - 1) // max_nframes + 1
            annotations, pan_all = [], []
            for split_idx, start in enumerate(range(0, len(pan_2ch_all), max_nframes)):
                print('==> Read and convert VPS output - split %d/%d' % (split_idx + 1, nsplits))
                pan_2ch_part = pan_2ch_all[start:start + max_nframes]
                # One worker per clip contained in *this* chunk. Sizing the
                # pool from the total clip count would cut the clips of a
                # short final chunk into pieces handled by different workers,
                # and the per-worker instance->color map would then assign
                # different colors to the same instance.
                nprocs = min(cpu_num, max(1, len(pan_2ch_part) // nframes_per_video))
                pan_2ch_split = np.array_split(pan_2ch_part, nprocs)
                with multiprocessing.Pool(processes=nprocs) as workers:
                    pending = [
                        workers.apply_async(
                            self.converter_2ch_track_core,
                            (proc_id, pan_2ch_set, color_generator))
                        for proc_id, pan_2ch_set in enumerate(pan_2ch_split)
                    ]
                    workers.close()
                    workers.join()

                for job in pending:
                    # `get` re-raises any exception raised in the worker.
                    part_annotations, part_pans = job.get()
                    annotations.extend(part_annotations)
                    pan_all.extend(part_pans)

            pan_json = {'annotations': annotations}
            return pan_all, pan_json

        def save_image(images, save_folder, names, colors=None):
            os.makedirs(save_folder, exist_ok=True)

            names = [osp.join(save_folder,
                              name.replace('_leftImg8bit', '').replace('_newImg8bit', '').replace('jpg', 'png').replace(
                                  'jpeg', 'png')) for name in names]
            images_split = np.array_split(images, cpu_num)
            names_split = np.array_split(names, cpu_num)
            with multiprocessing.Pool(processes=cpu_num) as workers:
                pending = [
                    workers.apply_async(
                        self._save_image_single_core,
                        (proc_id, images_set, names_set, colors))
                    for proc_id, (images_set, names_set) in enumerate(
                        zip(images_split, names_split))
                ]
                workers.close()
                workers.join()
            # Saving stays best effort (the converted results are still
            # returned to the caller), but failures are reported instead of
            # vanishing together with the never-inspected async results.
            for proc_id, job in enumerate(pending):
                try:
                    job.get()
                except Exception as err:
                    print('==> Warning: worker %d failed while saving images to %s: %r'
                          % (proc_id, save_folder, err))

        # inference_panoptic_video
        pred_pans, pred_json = get_pred_large(pred_pans_2ch,
                                              self.nframes_per_video)
        print('--------------------------------------')
        print('==> Saving VPS output png files')
        os.makedirs(output_dir, exist_ok=True)
        save_image(pred_pans_2ch, osp.join(output_dir, 'pan_2ch'), names)
        save_image(pred_pans, osp.join(output_dir, 'pan_pred'), names)
        print('==> Saving pred.jsons file')
        with open(osp.join(output_dir, 'pred.json'), 'w') as json_file:
            json.dump(pred_json, json_file)
        print('--------------------------------------')

        return pred_pans, pred_json

    def converter_2ch_track_core(self, proc_id, pan_2ch_set, color_generator):
        """Converts a run of two-channel frames into color-coded panoptic maps.

        Instances (instance id > 0) reuse the color first assigned to them
        within this call, so ids stay consistent across the frames of
        `pan_2ch_set`.

        Args:
            proc_id: Index of the worker running this call (unused).
            pan_2ch_set: Sequence of (H, W, C) arrays with the semantic label
                in channel 0 and the instance id in channel 1.
            color_generator: Object whose `get_color(category_id)` returns the
                color to paint a segment with.

        Returns:
            tuple: `(annotations, pan_all)`; `annotations` holds one
            `{"segments_info": [...]}` dict per frame and `pan_all` the
            matching (H, W, 3) uint8 color maps.

        Raises:
            KeyError: If a non-zero id found in a color map has no entry in
                that frame's segment info.
        """
        from panopticapi.utils import rgb2id

        OFFSET = 1000
        VOID = 255
        annotations, pan_all = [], []
        # reference dict to used color
        inst2color = {}
        for frame in pan_2ch_set:
            pan_2ch = np.uint32(frame)
            # pan_2ch: ss-seg maps[:,:,0], id-seg maps[:,:,1]
            pan = OFFSET * pan_2ch[:, :, 0] + pan_2ch[:, :, 1]

            pan_format = np.zeros((pan_2ch.shape[0], pan_2ch.shape[1], 3), dtype=np.uint8)

            segm_info = {}
            for el in np.unique(pan):
                sem = el // OFFSET

                if sem == VOID:
                    continue
                mask = pan == el
                #### handling used color for inst id
                if el % OFFSET > 0:
                    # things class
                    if el in inst2color:
                        color = inst2color[el]
                    else:
                        color = color_generator.get_color(sem)
                        inst2color[el] = color
                else:
                    # stuff class
                    color = color_generator.get_color(sem)

                pan_format[mask] = color
                index = np.where(mask)
                x = index[1].min()
                y = index[0].min()
                # Extent is max - min of the covered pixel coordinates.
                width = index[1].max() - x
                height = index[0].max() - y

                segment_id = int(rgb2id(color))
                segm_info[segment_id] = {
                    "category_id": sem.item(), "iscrowd": 0, "id": segment_id,
                    "bbox": [x.item(), y.item(), width.item(), height.item()],
                    "area": mask.sum().item()}

            pan_all.append(pan_format)

            gt_pan = np.uint32(pan_format)
            # rgb2id for evaluation
            pan_gt = gt_pan[:, :, 0] + gt_pan[:, :, 1] * 256 + gt_pan[:, :, 2] * 256 * 256
            # Refresh every area from the pixels actually painted with the
            # segment's color in the final map; id 0 is unpainted background.
            labels, labels_cnt = np.unique(pan_gt, return_counts=True)
            for label, area in zip(labels, labels_cnt):
                if label == 0:
                    continue
                if label not in segm_info:
                    print('label:', label)
                    raise KeyError('label not in segm_info keys.')

                segm_info[label]["area"] = int(area)

            annotations.append({"segments_info": list(segm_info.values())})

        return annotations, pan_all

================================================
FILE: tools/visualization.py
================================================
import argparse
import os
import os.path as osp
import warnings
import numpy as np
import pickle
import json
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                         wrap_fp16_model)
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector

from external.test import encode_mask_results, tensor2imgs


def single_gpu_test(model,
                    data_loader,
                    out_dir=None,
                    ):
    """Runs the model over every batch of `data_loader` in eval mode.

    The outputs are only unpacked, nothing is written or returned.

    Args:
        model: Callable model; invoked as
            `model(return_loss=False, rescale=True, **data)`.
        data_loader: Iterable of batches; `len(data_loader.dataset)` sizes
            the progress bar.
        out_dir: Output directory shown in the start-up message. Defaults to
            'logger/blackhole' when None.

    Returns:
        None
    """
    out_dir = 'logger/blackhole' if out_dir is None else out_dir
    print("The output dir is {}".format(out_dir))
    model.eval()
    prog_bar = mmcv.ProgressBar(len(data_loader.dataset))

    for data in data_loader:
        with torch.no_grad():
            visualizations = model(return_loss=False, rescale=True, **data)

        # The model output must expose these three entries; a missing one
        # raises KeyError here.
        instance_map, seg_infos, depth = (
            visualizations[key]
            for key in ('instance_map', 'segments_info', 'depth_final'))
        prog_bar.update()
    return None


def parse_args():
    """Parse the command-line arguments of the visualization script.

    Besides parsing, this exports the local rank as the `LOCAL_RANK`
    environment variable when that variable is not already set, and maps the
    deprecated `--options` onto `--eval-options`.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both `--options` and `--eval-options` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('--out', help='output result file in pickle format')
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment that continues a sentence must end with a space.
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
             'the inference speed')
    parser.add_argument(
        '--format-only',
        action='store_true',
        help='Format the output results without perform evaluation. It is '
             'useful when you want to format the result to a specific format and '
             'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
             ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
             'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
             'in xxx=yyy format will be merged into config file. If the value to '
             'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
             'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
             'Note that the quotation marks are necessary and that no white space '
             'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
             'format will be kwargs for dataset.evaluate() function (deprecate), '
             'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
             'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # Older `torch.distributed.launch` passes `--local_rank`, newer releases
    # pass `--local-rank`; accept both spellings (stored as `local_rank`).
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


def main():
    """Builds the test dataset and detector from the CLI arguments and runs
    `single_gpu_test` over the whole test set.

    The config is loaded from `args.config`, optionally patched with
    `--cfg-options`, stripped of all `pretrained` entries, and the detector
    weights are loaded strictly from `args.checkpoint`. `--show-dir` is
    forwarded as the output directory.
    """
    args = parse_args()
    print(args)

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # Import the modules named in the config, if any.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # Clear every `pretrained` entry (model and RFP backbones of the neck).
    cfg.model.pretrained = None
    neck = cfg.model.get('neck')
    if neck:
        for neck_cfg in (neck if isinstance(neck, list) else [neck]):
            rfp_backbone = neck_cfg.get('rfp_backbone')
            if rfp_backbone and rfp_backbone.get('pretrained'):
                rfp_backbone.pretrained = None

    # The test dataset may be a single config or a list of configs
    # (concatenated datasets).
    test_data_cfg = cfg.data.test
    samples_per_gpu = 1
    if isinstance(test_data_cfg, dict):
        test_data_cfg.test_mode = True
        samples_per_gpu = test_data_cfg.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            test_data_cfg.pipeline = replace_ImageToTensor(
                test_data_cfg.pipeline)
    elif isinstance(test_data_cfg, list):
        for ds_cfg in test_data_cfg:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in test_data_cfg)
        if samples_per_gpu > 1:
            for ds_cfg in test_data_cfg:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # Initialise the distributed environment before anything depends on it.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # Build the detector and load its weights.
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    if cfg.get('fp16', None) is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(
        model, args.checkpoint, map_location='cpu', strict=True)
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # Checkpoints without class info in their meta fall back to the dataset
    # classes (backward compatibility with old checkpoints).
    has_classes = 'CLASSES' in checkpoint.get('meta', {})
    model.CLASSES = (
        checkpoint['meta']['CLASSES'] if has_classes else dataset.CLASSES)

    model = MMDataParallel(model, device_ids=[0])
    # Inference the sequence
    single_gpu_test(model, data_loader, args.show_dir)


# Run the visualization pipeline only when executed as a script.
if __name__ == '__main__':
    main()


================================================
FILE: tools_vis/apis/__init__.py
================================================
from .test import single_gpu_test, multi_gpu_test

================================================
FILE: tools_vis/apis/test.py
================================================
# Modified from mmdet 2.20.0 / https://github.com/open-mmlab/mmdetection/tree/ff9bc

import os.path as osp
import pickle
import shutil
import tempfile
import time

import mmcv
import torch
import torch.distributed as dist
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info

from mmdet.core import encode_mask_results


def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3):
    """Run inference on one GPU and return the flattened per-item results.

    The model is expected to return, for every batch, a list of sequences;
    the sequences of all batches are concatenated into one flat list.

    Args:
        model (nn.Module): Model to be tested; called as
            `model(return_loss=False, rescale=True, **data)`.
        data_loader: Iterable of batches; `len(data_loader.dataset)` sizes
            the progress bar.
        show (bool): Unused; kept for signature compatibility.
        out_dir (str | None): Unused; kept for signature compatibility.
        show_score_thr (float): Unused; kept for signature compatibility.

    Returns:
        list: All items of all returned sequences, in order.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))
    for data in data_loader:
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)

        batch_size = len(result)

        # encode mask results
        for idx in range(batch_size):
            sequence = result[idx]
            # An empty sequence has no first element to inspect and nothing
            # to encode.
            if len(sequence) > 0 and isinstance(sequence[0], tuple):
                result[idx] = [(bbox_results, encode_mask_results(mask_results))
                               for bbox_results, mask_results in sequence]

        results.extend(result)

        for _ in range(batch_size):
            prog_bar.update()

    # Flatten in linear time; `sum(results, [])` copies the accumulated list
    # on every addition and is quadratic in the number of items.
    return [item for sequence in results for item in sequence]


def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Test model with multiple gpus.

    This method tests model with multiple gpus and collects the results
    under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
    it encodes results to gpu tensors and use gpu communication for results
    collection. On cpu mode it saves the results on different gpus to 'tmpdir'
    and collects them by the rank 0 worker.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect results.

    Returns:
        list | None: On rank 0, the flattened prediction results (all items
        of all collected sequences). On every other rank, None.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    if rank == 0:
        prog_bar = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent deadlock problem in some cases.
    for data in data_loader:
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
            # encode mask results
            for idx in range(len(result)):
                sequence = result[idx]
                # An empty sequence has no first element to inspect and
                # nothing to encode.
                if len(sequence) > 0 and isinstance(sequence[0], tuple):
                    result[idx] = [(bbox_results, encode_mask_results(mask_results))
                                   for bbox_results, mask_results in sequence]
        results.extend(result)

        if rank == 0:
            batch_size = len(result)
            # Every rank is assumed to advance by the same batch size.
            for _ in range(batch_size * world_size):
                prog_bar.update()

    # collect results from all ranks
    if gpu_collect:
        results = collect_results_gpu(results, size=len(dataset))
    else:
        results = collect_results_cpu(results, size=len(dataset), tmpdir=tmpdir)
    if rank == 0:
        # Flatten in linear time; `sum(results, [])` copies the accumulated
        # list on every addition and is quadratic in the number of items.
        results = [item for sequence in results for item in sequence]
    return results


def collect_results_cpu(result_part, size, tmpdir=None):
    """Gather the per-rank results on rank 0 through files in a directory.

    Every rank dumps its part to `<tmpdir>/part_<rank>.pkl`; after a barrier
    rank 0 loads all parts, interleaves them item by item, truncates to
    `size` and removes the directory.

    Args:
        result_part (list): Results produced by the calling rank.
        size (int): Number of results to keep after merging.
        tmpdir (str | None): Directory for the part files. When None, rank 0
            creates one under '.dist_test' and broadcasts its path.

    Returns:
        list | None: The merged results on rank 0, None on other ranks.
    """
    rank, world_size = get_dist_info()
    if tmpdir is not None:
        mmcv.mkdir_or_exist(tmpdir)
    else:
        # The path travels as a fixed-size byte tensor padded with
        # whitespace (ASCII 32), which is stripped again after decoding.
        max_len = 512
        dir_tensor = torch.full((max_len, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            mmcv.mkdir_or_exist('.dist_test')
            created_dir = tempfile.mkdtemp(dir='.dist_test')
            encoded_dir = torch.tensor(
                bytearray(created_dir.encode()),
                dtype=torch.uint8,
                device='cuda')
            dir_tensor[:len(encoded_dir)] = encoded_dir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()

    # Write this rank's part, then wait until every rank has written its own.
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()
    if rank != 0:
        return None

    part_list = [
        mmcv.load(osp.join(tmpdir, f'part_{i}.pkl'))
        for i in range(world_size)
    ]
    # Interleave the parts item by item, then cut off the samples the
    # dataloader may have padded.
    ordered_results = [
        item for group in zip(*part_list) for item in group
    ][:size]
    shutil.rmtree(tmpdir)
    return ordered_results


def collect_results_gpu(result_part, size):
    """Gather the per-rank results on rank 0 through GPU all-gather.

    Each rank pickles its part into a uint8 CUDA tensor; the tensors are
    zero-padded to the longest length, all-gathered, and rank 0 unpickles
    them, interleaves the parts item by item and truncates to `size`.

    Args:
        result_part (list): Results produced by the calling rank.
        size (int): Number of results to keep after merging.

    Returns:
        list | None: The merged results on rank 0, None on other ranks.
    """
    rank, world_size = get_dist_info()
    # Serialise the local part into a byte tensor.
    payload = torch.tensor(
        bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
    # Exchange the payload lengths so every rank knows the padding size.
    local_shape = torch.tensor(payload.shape, device='cuda')
    shape_list = [local_shape.clone() for _ in range(world_size)]
    dist.all_gather(shape_list, local_shape)
    shape_max = torch.tensor(shape_list).max()
    # Pad the local payload to the common length and gather all payloads.
    padded = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
    padded[:local_shape[0]] = payload
    recv_buffers = [payload.new_zeros(shape_max) for _ in range(world_size)]
    dist.all_gather(recv_buffers, padded)

    if rank != 0:
        return None

    # Strip the padding of every buffer before unpickling it.
    part_list = [
        pickle.loads(buffer[:shape[0]].cpu().numpy().tobytes())
        for buffer, shape in zip(recv_buffers, shape_list)
    ]
    # Interleave the parts item by item, then cut off the samples the
    # dataloader may have padded.
    return [item for group in zip(*part_list) for item in group][:size]


================================================
FILE: tools_vis/dist_test_whole_video.sh
================================================
#!/usr/bin/env bash
# Launch test_whole_video.py (located next to this script) on one node with
# several GPUs, using torchrun when available and torch.distributed.launch
# otherwise.
#
# Usage: dist_test_whole_video.sh CONFIG CHECKPOINT GPUS [extra test args...]
# Env:   PORT  master port (default: random port in 29500-29528)

if [ "$#" -lt 3 ]; then
  echo "Usage: $0 CONFIG CHECKPOINT GPUS [extra test args...]" >&2
  exit 1
fi

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-$((29500 + RANDOM % 29))}
SCRIPT_DIR=$(dirname "$0")

# All expansions are quoted so that paths and extra arguments containing
# whitespace reach the Python script unchanged.
if command -v torchrun &> /dev/null
then
  echo "Using torchrun mode."
  PYTHONPATH="$SCRIPT_DIR/..:$PYTHONPATH" \
    torchrun --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$SCRIPT_DIR/test_whole_video.py" "$CONFIG" "$CHECKPOINT" --launcher pytorch "${@:4}"
else
  echo "Using launch mode."
  PYTHONPATH="$SCRIPT_DIR/..:$PYTHONPATH" \
    python -m torch.distributed.launch --nproc_per_node="$GPUS" --master_port="$PORT" \
    "$SCRIPT_DIR/test_whole_video.py" "$CONFIG" "$CHECKPOINT" --launcher pytorch "${@:4}"
fi


================================================
FILE: tools_vis/docker.sh
================================================
#!/bin/bash
# Start an interactive GPU container with the current directory mounted at
# /data, the dataset directory at /data/data and the log directory at
# /data/logger.
#
# Env: DATALOC  dataset directory (default: ../datasets, resolved)
#      LOGLOC   log directory     (default: ../logger, resolved)
#      IMG      docker image      (default: harbory/openmmlab:eccv-2022)

DATALOC=${DATALOC:-$(realpath ../datasets)}
LOGLOC=${LOGLOC:-$(realpath ../logger)}
IMG=${IMG:-"harbory/openmmlab:eccv-2022"}

# Mount specs are quoted so that directories containing whitespace work.
docker run --gpus all -it --rm --ipc=host --net=host \
  --mount "src=$(pwd),target=/data,type=bind" \
  --mount "src=$DATALOC,target=/data/data,type=bind" \
  --mount "src=$LOGLOC,target=/data/logger,type=bind" \
  "$IMG"


================================================
FILE: tools_vis/slurm_test_vis.sh
================================================
#!/usr/bin/env bash
# Launch test_whole_video.py (located next to this script) through Slurm.
#
# Usage: slurm_test_vis.sh PARTITION JOB_NAME CONFIG CHECKPOINT [extra test args...]
# Env:   GPUS (default 8), GPUS_PER_NODE (default 8), CPUS_PER_TASK (default 5),
#        SRUN_ARGS (extra srun options, split on whitespace)

set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
# Keep the extra arguments as an array so arguments containing whitespace
# are forwarded unchanged.
PY_ARGS=("${@:5}")
SRUN_ARGS=${SRUN_ARGS:-""}
SCRIPT_DIR=$(dirname "$0")

# The test script is resolved relative to this file, like the sibling
# dist_test_whole_video.sh does. SRUN_ARGS is intentionally left unquoted so
# that it expands to several srun options.
PYTHONPATH="$SCRIPT_DIR/..:$PYTHONPATH" \
srun -p "${PARTITION}" \
    --job-name="${JOB_NAME}" \
    --gres=gpu:"${GPUS_PER_NODE}" \
    --ntasks="${GPUS}" \
    --ntasks-per-node="${GPUS_PER_NODE}" \
    --cpus-per-task="${CPUS_PER_TASK}" \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u "$SCRIPT_DIR/test_whole_video.py" "${CONFIG}" "${CHECKPOINT}" --launcher="slurm" "${PY_ARGS[@]}"

================================================
FILE: tools_vis/test.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
# Modified from mmdet 2.20.0 / https://github.com/open-mmlab/mmdetection/tree/ff9bc
import argparse
import os
import os.path as osp
import time
import warnings

import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                         wrap_fp16_model)

from mmdet.apis import multi_gpu_test, single_gpu_test
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector


def parse_args():
    """Parse the command-line arguments of the test script.

    Besides parsing, this exports the local rank as the `LOCAL_RANK`
    environment variable when that variable is not already set, and maps the
    deprecated `--options` onto `--eval-options`.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both `--options` and `--eval-options` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--work-dir',
        help='the directory to save the file containing evaluation metrics')
    parser.add_argument('--out', help='output result file in pickle format')
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment that continues a sentence must end with a space.
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed testing)')
    parser.add_argument(
        '--format-only',
        action='store_true',
        help='Format the output results without perform evaluation. It is '
        'useful when you want to format the result to a specific format and '
        'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
        'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # Older `torch.distributed.launch` passes `--local_rank`, newer releases
    # pass `--local-rank`; accept both spellings (stored as `local_rank`).
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


def main():
    """Tests a detector on the configured test set and, on rank 0, dumps,
    formats and/or evaluates the results as requested on the command line.

    Raises:
        AssertionError: If none of `--out`, `--eval`, `--format-only`,
            `--show` or `--show-dir` is given.
        ValueError: If `--eval` is combined with `--format-only`, or if
            `--out` does not end in `.pkl` / `.pickle`.
    """
    args = parse_args()

    requested = any((args.out, args.eval, args.format_only, args.show,
                     args.show_dir))
    assert requested, \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # Clear every `pretrained` entry (model and RFP backbones of the neck).
    cfg.model.pretrained = None
    neck = cfg.model.get('neck')
    if neck:
        for neck_cfg in (neck if isinstance(neck, list) else [neck]):
            rfp_backbone = neck_cfg.get('rfp_backbone')
            if rfp_backbone and rfp_backbone.get('pretrained'):
                rfp_backbone.pretrained = None

    # The test dataset may be a single config or a list of configs
    # (concatenated datasets).
    test_data_cfg = cfg.data.test
    samples_per_gpu = 1
    if isinstance(test_data_cfg, dict):
        test_data_cfg.test_mode = True
        samples_per_gpu = test_data_cfg.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            test_data_cfg.pipeline = replace_ImageToTensor(
                test_data_cfg.pipeline)
    elif isinstance(test_data_cfg, list):
        for ds_cfg in test_data_cfg:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in test_data_cfg)
        if samples_per_gpu > 1:
            for ds_cfg in test_data_cfg:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    cfg.gpu_ids = args.gpu_ids if args.gpu_ids is not None else range(1)

    # Initialise the distributed environment first; without a launcher only
    # the first requested GPU is used.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)
    elif len(cfg.gpu_ids) > 1:
        warnings.warn(
            f'We treat {cfg.gpu_ids} as gpu-ids, and reset to '
            f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in '
            'non-distribute testing time.')
        cfg.gpu_ids = cfg.gpu_ids[0:1]

    rank, _ = get_dist_info()
    # The metrics file is only prepared when a work dir was requested.
    if args.work_dir is not None and rank == 0:
        mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
        timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
        json_file = osp.join(args.work_dir, f'eval_{timestamp}.json')

    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # Build the detector and load its weights.
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    if cfg.get('fp16', None) is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # Checkpoints without class info in their meta fall back to the dataset
    # classes (backward compatibility with old checkpoints).
    has_classes = 'CLASSES' in checkpoint.get('meta', {})
    model.CLASSES = (
        checkpoint['meta']['CLASSES'] if has_classes else dataset.CLASSES)

    if distributed:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)
    else:
        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
        outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
                                  args.show_score_thr)

    # Only rank 0 post-processes the results.
    rank, _ = get_dist_info()
    if rank != 0:
        return

    if args.out:
        print(f'\nwriting results to {args.out}')
        mmcv.dump(outputs, args.out)
    kwargs = args.eval_options if args.eval_options is not None else {}
    if args.format_only:
        dataset.format_results(outputs, **kwargs)
    if args.eval:
        eval_kwargs = cfg.get('evaluation', {}).copy()
        # Strip the arguments that only EvalHook understands.
        for hook_only_key in ('interval', 'tmpdir', 'start', 'gpu_collect',
                              'save_best', 'rule', 'dynamic_intervals'):
            eval_kwargs.pop(hook_only_key, None)
        eval_kwargs.update(dict(metric=args.eval, **kwargs))
        metric = dataset.evaluate(outputs, **eval_kwargs)
        print(metric)
        metric_dict = dict(config=args.config, metric=metric)
        if args.work_dir is not None and rank == 0:
            mmcv.dump(metric_dict, json_file)

# Run the test pipeline only when executed as a script.
if __name__ == '__main__':
    main()


================================================
FILE: tools_vis/test_whole_video.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
# Modified from mmdet 2.20.0 / https://github.com/open-mmlab/mmdetection/tree/ff9bc
import argparse
import os
import os.path as osp
import time
import warnings

import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                         wrap_fp16_model)

from tools2.apis import multi_gpu_test, single_gpu_test
from mmdet.datasets import (build_dataloader, build_dataset,
                            replace_ImageToTensor)
from mmdet.models import build_detector


def parse_args():
    """Parse the command-line arguments of the test script.

    Besides building the parser, this function has two side effects that
    callers rely on:

    * ``LOCAL_RANK`` is exported to ``os.environ`` (from ``--local_rank``)
      when it is not already set.
    * the deprecated ``--options`` value is copied into ``args.eval_options``.

    Returns:
        argparse.Namespace: the parsed arguments.

    Raises:
        ValueError: if both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(
        description='MMDet test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--work-dir',
        help='the directory to save the file containing evaluation metrics')
    parser.add_argument('--out', help='output result file in pickle format')
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment that is followed by another one must end with a space.
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed testing)')
    parser.add_argument(
        '--format-only',
        action='store_true',
        help='Format the output results without perform evaluation. It is '
        'useful when you want to format the result to a specific format and '
        'submit it to the test server')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument(
        '--show-dir', help='directory where painted images will be saved')
    parser.add_argument(
        '--show-score-thr',
        type=float,
        default=0.3,
        help='score threshold (default: 0.3)')
    parser.add_argument(
        '--gpu-collect',
        action='store_true',
        help='whether to use gpu to collect results.')
    parser.add_argument(
        '--tmpdir',
        help='tmp directory used for collecting results from multiple '
        'workers, available when gpu-collect is not specified')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    # Only export LOCAL_RANK when the launcher has not provided it already.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


def main():
    """Test a detector checkpoint and optionally dump/format/evaluate results.

    Steps, in order: parse the CLI arguments, load and patch the config for
    testing, initialise the (optional) distributed environment, build the
    test dataset/dataloader and the model, run single- or multi-GPU
    inference, and finally, on rank 0 only, write the raw outputs
    (``--out``), format them (``--format-only``) and/or evaluate them
    (``--eval``). Evaluation metrics are additionally dumped to a
    timestamped JSON file when ``--work-dir`` is given.

    Raises:
        AssertionError: if none of --out/--eval/--format-only/--show/
            --show-dir is specified.
        ValueError: if both --eval and --format-only are specified.
    """
    args = parse_args()

    assert args.out or args.eval or args.format_only or args.show \
        or args.show_dir, \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    # NOTE: the '.pkl' extension check on --out is intentionally disabled in
    # this script, so any output file name is accepted.

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # Weights come from the checkpoint, so drop every `pretrained` entry of
    # the model config (including those of RFP backbones inside the neck).
    cfg.model.pretrained = None
    if cfg.model.get('neck'):
        if isinstance(cfg.model.neck, list):
            for neck_cfg in cfg.model.neck:
                if neck_cfg.get('rfp_backbone'):
                    if neck_cfg.rfp_backbone.get('pretrained'):
                        neck_cfg.rfp_backbone.pretrained = None
        elif cfg.model.neck.get('rfp_backbone'):
            if cfg.model.neck.rfp_backbone.get('pretrained'):
                cfg.model.neck.rfp_backbone.pretrained = None

    # in case the test dataset is concatenated
    samples_per_gpu = 1
    if isinstance(cfg.data.test, dict):
        cfg.data.test.test_mode = True
        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.test.pipeline = replace_ImageToTensor(
                cfg.data.test.pipeline)
    elif isinstance(cfg.data.test, list):
        for ds_cfg in cfg.data.test:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
        if samples_per_gpu > 1:
            for ds_cfg in cfg.data.test:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    else:
        cfg.gpu_ids = range(1)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
        if len(cfg.gpu_ids) > 1:
            warnings.warn(
                f'We treat {cfg.gpu_ids} as gpu-ids, and reset to '
                f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in '
                'non-distribute testing time.')
            cfg.gpu_ids = cfg.gpu_ids[0:1]
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    rank, _ = get_dist_info()
    # allows not to create
    if args.work_dir is not None and rank == 0:
        mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
        timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
        # `json_file` only exists under this condition; it is read again
        # below under the very same condition.
        json_file = osp.join(args.work_dir, f'eval_{timestamp}.json')

    # build the dataloader
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # build the model and load checkpoint
    cfg.model.train_cfg = None
    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # old versions did not save class info in checkpoints, this workaround is
    # for backward compatibility
    if 'CLASSES' in checkpoint.get('meta', {}):
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    if not distributed:
        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
        outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
                                  args.show_score_thr)
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)

    rank, _ = get_dist_info()
    if rank == 0:
        if args.out:
            print(f'\nwriting results to {args.out}')
            mmcv.dump(outputs, args.out)
        kwargs = {} if args.eval_options is None else args.eval_options
        # Results are written next to the checkpoint: `<ckpt>.pth` becomes
        # the `<ckpt>_results` path handed to the dataset.
        resfile_path = args.checkpoint.replace('.pth', '_results')
        kwargs['resfile_path'] = resfile_path
        if resfile_path.startswith('logger/'):
            # Keep `logger/submission.zip` pointing at the submission file of
            # this run. The link is created with os.symlink instead of a
            # shell `ln -sf` string so that checkpoint paths containing
            # spaces or shell metacharacters cannot break (or inject into)
            # the command.
            link_name = 'logger/submission.zip'
            link_target = os.path.join(
                '../', resfile_path, 'submission_file.zip')
            try:
                if osp.lexists(link_name):
                    os.remove(link_name)
                os.symlink(link_target, link_name)
            except OSError as err:
                # Best effort, exactly like the previous `ln -sf` call whose
                # exit status was ignored.
                warnings.warn(
                    f'Could not create symlink {link_name}: {err}')
        if args.format_only:
            dataset.format_results(outputs, **kwargs)
        if args.eval:
            eval_kwargs = cfg.get('evaluation', {}).copy()
            # hard-code way to remove EvalHook args
            for key in [
                    'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
                    'rule', 'dynamic_intervals'
            ]:
                eval_kwargs.pop(key, None)
            eval_kwargs.update(dict(metric=args.eval, **kwargs))
            metric = dataset.evaluate(outputs, **eval_kwargs)
            print(metric)
            metric_dict = dict(config=args.config, metric=metric)
            if args.work_dir is not None and rank == 0:
                mmcv.dump(metric_dict, json_file)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()


================================================
FILE: unitrack/__init__.py
================================================
from .model import *

================================================
FILE: unitrack/basetrack.py
================================================
import numpy as np
from collections import OrderedDict,deque
from unitrack.core.motion.kalman_filter import KalmanFilter
import unitrack.core.association.matching as matching
from unitrack.utils.box import *
import torch
import torch.nn.functional as F


class TrackState(object):
    """Integer codes for the life-cycle state of a track.

    ``New`` is 0, ``Tracked`` is 1, ``Lost`` is 2 and ``Removed`` is 3.
    """

    New, Tracked, Lost, Removed = range(4)


class BaseTrack(object):
    """Abstract base class holding the bookkeeping shared by all tracks.

    Subclasses must implement :meth:`activate`, :meth:`predict` and
    :meth:`update`. The attributes below are class-level defaults; note that
    ``history`` and ``features`` are mutable objects created once on the
    class, so they are shared by every instance that does not rebind them.
    """

    # Global counter backing `next_id()`; shared by all tracks.
    _count = 0

    # Identity / life-cycle defaults.
    track_id = 0
    is_activated = False
    state = TrackState.New

    # Appearance / history defaults.
    history = OrderedDict()
    features = []
    curr_feature = None
    score = 0

    # Frame bookkeeping defaults.
    start_frame = 0
    frame_id = 0
    time_since_update = 0

    # multi-camera
    location = (np.inf, np.inf)

    @property
    def end_frame(self):
        """Frame index stored in ``frame_id``."""
        return self.frame_id

    @staticmethod
    def next_id():
        """Increment the global track counter and return its new value."""
        BaseTrack._count = BaseTrack._count + 1
        return BaseTrack._count

    def activate(self, *args):
        """Start the track; must be provided by subclasses."""
        raise NotImplementedError

    def predict(self):
        """Advance the track state; must be provided by subclasses."""
        raise NotImplementedError

    def update(self, *args, **kwargs):
        """Update the track with a new observation; subclass hook."""
        raise NotImplementedError

    def mark_lost(self):
        """Set the state to ``TrackState.Lost``."""
        self.state = TrackState.Lost

    def mark_removed(self):
        """Set the state to ``TrackState.Removed``."""
        self.state = TrackState.Removed


class STrack(BaseTrack):
    """A single tracked object with an optional Kalman-filtered box state.

    The box is stored either as a raw ``(top-left x, top-left y, w, h)``
    array (``_tlwh``) or, once a Kalman filter has been attached through
    :meth:`activate`, as the filter mean/covariance.
    """

    # Filter instance shared by all tracks for batched prediction.
    shared_kalman = KalmanFilter()

    def __init__(self, tlwh, score, temp_feat, buffer_size=30,
                 mask=None, pose=None, ac=False, category=-1,
                 use_kalman=True):
        """Create a track that still waits for activation.

        Args:
            tlwh: box as (top-left x, top-left y, width, height).
            score: detection score stored on the track.
            temp_feat: appearance feature of the detection.
            buffer_size (int): maximum length of the ``features`` deque.
            mask: optional mask attached to the track.
            pose: optional pose attached to the track.
            ac (bool): initial value of ``is_activated``; forced to True
                when ``use_kalman`` is False.
            category: category label of the detection (default -1).
            use_kalman (bool): whether box updates go through the Kalman
                filter.
        """
        # wait activate
        # `float` is what the removed `np.float` alias pointed to, so the
        # resulting dtype is unchanged while NumPy >= 1.24 keeps working.
        self._tlwh = np.asarray(tlwh, dtype=float)
        self.kalman_filter = None
        self.mean, self.covariance = None, None
        self.use_kalman = use_kalman
        if not use_kalman:
            ac = True
        self.is_activated = ac

        self.score = score
        self.category = category
        self.tracklet_len = 0

        self.smooth_feat = None
        # The first call only stores the feature (smooth_feat is None), so
        # `alpha` does not need to exist yet.
        self.update_features(temp_feat)
        self.features = deque([], maxlen=buffer_size)
        self.alpha = 0.9
        self.mask = mask
        self.pose = pose

    def update_features(self, feat):
        """Store ``feat`` as current feature and update the smoothed one.

        The smoothed feature is an exponential moving average with weight
        ``alpha`` on the previous value; it is left untouched when the new
        feature has a different shape.
        """
        self.curr_feat = feat
        if self.smooth_feat is None:
            self.smooth_feat = feat
        elif self.smooth_feat.shape == feat.shape:
            self.smooth_feat = (self.alpha * self.smooth_feat
                                + (1 - self.alpha) * feat)

    def predict(self):
        """Run one Kalman prediction step on this track."""
        mean_state = self.mean.copy()
        if self.state != TrackState.Tracked:
            # Index 7 is the height velocity in the filter's
            # (x, y, a, h, vx, vy, va, vh) state layout.
            mean_state[7] = 0
        self.mean, self.covariance = self.kalman_filter.predict(
            mean_state, self.covariance)

    @staticmethod
    def multi_predict(stracks):
        """Run the Kalman prediction step for a list of tracks at once.

        Uses the class-level ``shared_kalman`` filter and writes the
        predicted mean/covariance back onto every track. Does nothing for
        an empty list.
        """
        if len(stracks) > 0:
            multi_mean = np.asarray([st.mean.copy() for st in stracks])
            multi_covariance = np.asarray([st.covariance for st in stracks])
            for i, st in enumerate(stracks):
                if st.state != TrackState.Tracked:
                    multi_mean[i][7] = 0
            multi_mean, multi_covariance = STrack.shared_kalman.multi_predict(
                multi_mean, multi_covariance)
            for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):
                stracks[i].mean = mean
                stracks[i].covariance = cov

    def activate(self, kalman_filter, frame_id):
        """Start a new tracklet"""
        self.kalman_filter = kalman_filter
        self.track_id = self.next_id()
        self.mean, self.covariance = self.kalman_filter.initiate(
            tlwh_to_xyah(self._tlwh))

        self.tracklet_len = 0
        self.state = TrackState.Tracked
        # Only tracks started on frame 1 are flagged as activated here;
        # otherwise `is_activated` keeps its current value.
        if frame_id == 1:
            self.is_activated = True
        self.frame_id = frame_id
        self.start_frame = frame_id

    def re_activate(self, new_track, frame_id, new_id=False,
                    update_feature=True):
        """Re-activate this track from a newly matched detection.

        Args:
            new_track (STrack): detection supplying box, feature and mask.
            frame_id (int): current frame index.
            new_id (bool): assign a fresh track id when True.
            update_feature (bool): whether to fold in the new feature.
        """
        if self.use_kalman:
            self.mean, self.covariance = self.kalman_filter.update(
                self.mean, self.covariance, tlwh_to_xyah(new_track.tlwh)
            )
        else:
            self.mean, self.covariance = None, None
            self._tlwh = np.asarray(new_track.tlwh, dtype=float)
        if update_feature:
            self.update_features(new_track.curr_feat)
        self.tracklet_len = 0
        self.state = TrackState.Tracked
        self.is_activated = True
        self.frame_id = frame_id
        if new_id:
            self.track_id = self.next_id()
        if new_track.mask is not None:
            self.mask = new_track.mask

    def update(self, new_track, frame_id, update_feature=True):
        """
        Update a matched track
        :type new_track: STrack
        :type frame_id: int
        :type update_feature: bool
        :return:
        """
        self.frame_id = frame_id
        self.tracklet_len += 1

        new_tlwh = new_track.tlwh
        if self.use_kalman:
            self.mean, self.covariance = self.kalman_filter.update(
                self.mean, self.covariance, tlwh_to_xyah(new_tlwh))
        else:
            self.mean, self.covariance = None, None
            self._tlwh = np.asarray(new_tlwh, dtype=float)
        self.state = TrackState.Tracked
        self.is_activated = True

        self.score = new_track.score
        # For TAO dataset
        self.category = new_track.category
        if update_feature:
            self.update_features(new_track.curr_feat)
        if new_track.mask is not None:
            self.mask = new_track.mask
        if new_track.pose is not None:
            self.pose = new_track.pose

    @property
    def tlwh(self):
        """Get current position in bounding box format `(top left x, top left y,
                width, height)`.
        """
        if self.mean is None:
            return self._tlwh.copy()
        # The filter mean starts with (center x, center y, aspect, height).
        ret = self.mean[:4].copy()
        ret[2] *= ret[3]          # aspect * height -> width
        ret[:2] -= ret[2:] / 2    # center -> top-left corner
        return ret

    @property
    def tlbr(self):
        """Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
        `(top left, bottom right)`.
        """
        ret = self.tlwh.copy()
        ret[2:] += ret[:2]
        return ret

    def to_xyah(self):
        """Return the current box converted by ``tlwh_to_xyah``."""
        return tlwh_to_xyah(self.tlwh)

    def __repr__(self):
        return 'OT_{}_({}-{})'.format(
            self.track_id, self.start_frame, self.end_frame)


def joint_stracks(tlista, tlistb):
    """Concatenate two track lists, skipping ids already collected.

    Every element of ``tlista`` is kept as is; an element of ``tlistb`` is
    appended only if its ``track_id`` has not been seen so far (neither in
    ``tlista`` nor earlier in ``tlistb``).

    Returns:
        list: the merged tracks, ``tlista`` first.
    """
    merged = list(tlista)
    seen_ids = {trk.track_id for trk in merged}
    for trk in tlistb:
        if trk.track_id in seen_ids:
            continue
        seen_ids.add(trk.track_id)
        merged.append(trk)
    return merged


def sub_stracks(tlista, tlistb):
    """Return the tracks of ``tlista`` whose id does not occur in ``tlistb``.

    Tracks are keyed by ``track_id``; if ``tlista`` contains the same id
    several times only the last such track is kept. Removal is decided by
    id membership alone, so it no longer depends on the truthiness of the
    stored track object.

    Returns:
        list: the remaining tracks, in first-seen id order.
    """
    remaining = {trk.track_id: trk for trk in tlista}
    for trk in tlistb:
        # pop() with a default ignores ids that are not present.
        remaining.pop(trk.track_id, None)
    return list(remaining.values())


def remove_duplicate_stracks(stracksa, stracksb, ioudist=0.15):
    """Drop near-identical tracks that appear in both lists.

    Pairs whose IoU distance is below ``ioudist`` are treated as duplicates.
    For each pair the track that has existed longer
    (``frame_id - start_frame``) is kept and the other one is dropped; on a
    tie the track from ``stracksa`` is dropped.

    Returns:
        tuple[list, list]: the filtered ``stracksa`` and ``stracksb``.
    """
    pdist = matching.iou_distance(stracksa, stracksb)
    drop_a, drop_b = set(), set()
    for p, q in zip(*np.where(pdist < ioudist)):
        age_a = stracksa[p].frame_id - stracksa[p].start_frame
        age_b = stracksb[q].frame_id - stracksb[q].start_frame
        if age_a > age_b:
            drop_b.add(int(q))
        else:
            drop_a.add(int(p))
    kept_a = [trk for idx, trk in enumerate(stracksa) if idx not in drop_a]
    kept_b = [trk for idx, trk in enumerate(stracksb) if idx not in drop_b]
    return kept_a, kept_b
            

================================================
FILE: unitrack/box.py
================================================
###################################################################
# File Name: box.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Fri Jan 29 15:16:53 2021
###################################################################

import torch
from torchvision import ops

from .basetrack import STrack
from .multitracker import AssociationTracker
from unitrack.utils.box import scale_box, scale_box_input_size, xywh2xyxy, tlbr_to_tlwh


class BoxAssociationTracker(AssociationTracker):
    """Association tracker whose observations are bounding boxes."""

    def __init__(self, opt):
        super().__init__(opt)

    def extract_emb(self, img, obs):
        """Pool one appearance embedding per box from the feature map.

        The image is passed through ``self.app_model``, the boxes are
        rescaled with the feature-map/``opt.img_size`` ratio and
        ``roi_align`` pools a ``opt.feat_size`` feature for each of them.

        Returns:
            The pooled features, detached and moved to the CPU.
        """
        device = self.opt.device
        feat = self.app_model(img.unsqueeze(0).to(device).float())
        ratio = [
            feat.shape[-1] / self.opt.img_size[0],
            feat.shape[-2] / self.opt.img_size[1],
        ]
        boxes = scale_box(ratio, obs).to(device)
        # roi_align receives a one-element list holding the first four
        # columns of the scaled boxes.
        pooled = ops.roi_align(feat, [boxes[:, :4]], self.opt.feat_size)
        return pooled.detach().cpu()

    def prepare_obs(self, img, img0, obs, embs=None):
        """Turn raw observations into a list of ``STrack`` detections.

        Rows whose column 4 does not exceed ``opt.conf_thres`` are dropped.
        The remaining rows are converted with ``xywh2xyxy``, rescaled, and
        paired with embeddings from :meth:`extract_emb`. The ``embs``
        argument is ignored and always recomputed.

        Returns:
            list: one ``STrack`` per kept observation (empty if none).

        Raises:
            ValueError: if the observations have neither 5 nor 6 columns.
        """
        confident = obs[:, 4] > self.opt.conf_thres
        obs = torch.from_numpy(obs[confident]).float()
        if len(obs) == 0:
            return []

        obs = xywh2xyxy(obs)
        obs = scale_box(self.opt.img_size, obs)
        embs = self.extract_emb(img, obs)
        obs = scale_box_input_size(self.opt.img_size, obs, img0.shape)

        n_cols = obs.shape[1]
        if n_cols not in (5, 6):
            raise ValueError(
                    'Shape of observations should be [n, 5] or [n, 6].')

        detections = []
        for tlbrs, f in zip(obs, embs):
            # A sixth column, when present, carries the category.
            extra = {'category': tlbrs[5]} if n_cols == 6 else {}
            detections.append(
                STrack(tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f,
                       self.buffer_size, use_kalman=self.opt.use_kalman,
                       **extra))
        return detections


================================================
FILE: unitrack/core/__init__.py
================================================


================================================
FILE: unitrack/core/association/__init__.py
================================================


================================================
FILE: unitrack/core/association/matching.py
================================================
import torch
import torch.nn.functional as F
import numpy as np
import scipy
from scipy.spatial.distance import cdist
import lap

from cython_bbox import bbox_overlaps as bbox_ious
from ..motion import kalman_filter


def merge_matches(m1, m2, shape):
    """Chain two index matchings O->P and P->Q into a matching O->Q.

    Args:
        m1: sequence of ``(o, p)`` index pairs; may be empty.
        m2: sequence of ``(p, q)`` index pairs; may be empty.
        shape: ``(O, P, Q)``, the sizes of the three index sets.

    Returns:
        tuple: ``(match, unmatched_O, unmatched_Q)`` where ``match`` is a
        list of ``(o, q)`` pairs linked through a common ``p`` and the other
        two are tuples of the indices of O and Q that take part in no pair.
    """
    # Imported locally so the function does not depend on `scipy.sparse`
    # having been loaded as a side effect of another import.
    from scipy.sparse import coo_matrix

    O, P, Q = shape
    # reshape(-1, 2) turns an empty input into a (0, 2) array, so the column
    # indexing below also works when there are no matches at all.
    m1 = np.asarray(m1, dtype=int).reshape(-1, 2)
    m2 = np.asarray(m2, dtype=int).reshape(-1, 2)

    M1 = coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P))
    M2 = coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q))

    # Matrix product: entry (o, q) is non-zero iff some p links o and q.
    mask = M1 @ M2
    rows, cols = mask.nonzero()
    match = list(zip(rows, cols))
    unmatched_O = tuple(set(range(O)) - {i for i, _ in match})
    unmatched_Q = tuple(set(range(Q)) - {j for _, j in match})

    return match, unmatched_O, unmatched_Q


def linear_assignment(cost_matrix, thresh):
    """Solve the assignment problem on ``cost_matrix`` with ``lap.lapjv``.

    Args:
        cost_matrix (np.ndarray): 2-D cost matrix.
        thresh: forwarded to ``lap.lapjv`` as ``cost_limit``.

    Returns:
        tuple: ``(matches, unmatched_a, unmatched_b)``. ``matches`` holds the
        ``[row, col]`` pairs that were assigned, the other two hold the row
        and column indices left without a partner. For an empty cost matrix
        ``matches`` is a ``(0, 2)`` int array and the unmatched indices are
        returned as tuples covering every row/column.
    """
    if cost_matrix.size == 0:
        every_row = tuple(range(cost_matrix.shape[0]))
        every_col = tuple(range(cost_matrix.shape[1]))
        return np.empty((0, 2), dtype=int), every_row, every_col

    _, row_to_col, col_to_row = lap.lapjv(
        cost_matrix, extend_cost=True, cost_limit=thresh)
    # A negative entry marks a row/column that received no assignment.
    matches = np.asarray(
        [[row, col] for row, col in enumerate(row_to_col) if col >= 0])
    unmatched_a = np.where(row_to_col < 0)[0]
    unmatched_b = np.where(col_to_row < 0)[0]
    return matches, unmatched_a, unmatched_b
            

def ious(atlbrs, btlbrs):
    """
    Compute the pairwise IoU between two sets of boxes
    :type atlbrs: list[tlbr] | np.ndarray
    :type btlbrs: list[tlbr] | np.ndarray

    :rtype ious np.ndarray of shape (len(atlbrs), len(btlbrs))
    """
    # `float` replaces the `np.float` alias removed in NumPy 1.24; the dtype
    # (float64) is unchanged.
    overlaps = np.zeros((len(atlbrs), len(btlbrs)), dtype=float)
    if overlaps.size == 0:
        # Nothing to compare: return the empty matrix without calling the
        # compiled overlap routine.
        return overlaps

    return bbox_ious(
        np.ascontiguousarray(atlbrs, dtype=float),
        np.ascontiguousarray(btlbrs, dtype=float)
    )


def iou_distance(atracks, btracks):
    """
    Compute the IoU-based cost ``1 - IoU`` between two sets of tracks.

    If the first element of either argument is an ``np.ndarray`` both
    arguments are used directly as boxes; otherwise the ``tlbr`` attribute
    of every track is used.

    :type atracks: list[STrack]
    :type btracks: list[STrack]

    :rtype cost_matrix np.ndarray
    """

    def _holds_boxes(seq):
        # True when the sequence is non-empty and starts with a raw array.
        return len(seq) > 0 and isinstance(seq[0], np.ndarray)

    if _holds_boxes(atracks) or _holds_boxes(btracks):
        atlbrs, btlbrs = atracks, btracks
    else:
        atlbrs = [trk.tlbr for trk in atracks]
        btlbrs = [trk.tlbr for trk in btracks]

    return 1 - ious(atlbrs, btlbrs)

def embedding_distance(tracks, detections, metric='cosine'):
    """
    Pairwise distance between track and detection embeddings.

    The smoothed feature of every track is compared with the current
    feature of every detection using ``cdist`` with its default metric.

    :param tracks: list[STrack]
    :param detections: list[BaseTrack]
    :param metric: accepted for interface compatibility; it is not forwarded
        to ``cdist``
    :return: cost_matrix np.ndarray of shape (len(tracks), len(detections))
    """
    # `float` replaces the `np.float` alias removed in NumPy 1.24.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix
    det_features = np.asarray(
        [track.curr_feat for track in detections], dtype=float)
    track_features = np.asarray(
        [track.smooth_feat for track in tracks], dtype=float)
    # Normalized features
    cost_matrix = np.maximum(0.0, cdist(track_features, det_features))
    return cost_matrix


def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda_=0.98, gate=True):
    """Blend a cost matrix with Kalman gating distances, in place.

    For every track row the gating distance to all detections is obtained
    from ``kf.gating_distance(..., metric='maha')``. With ``gate`` set,
    entries whose distance exceeds the 0.95 chi-square threshold are first
    set to infinity. Each row is then replaced by
    ``lambda_ * cost + (1 - lambda_) * gating_distance``.

    Args:
        kf: object providing ``gating_distance``.
        cost_matrix (np.ndarray): costs of shape (tracks, detections);
            modified in place.
        tracks: objects exposing ``mean`` and ``covariance``.
        detections: objects exposing ``to_xyah()``.
        only_position (bool): use 2 instead of 4 degrees of freedom for the
            threshold and forward the flag to ``kf.gating_distance``.
        lambda_ (float): weight of the original cost.
        gate (bool): whether to apply the chi-square gate.

    Returns:
        np.ndarray: the same ``cost_matrix`` object.
    """
    if cost_matrix.size == 0:
        return cost_matrix

    dof = 4
    if only_position:
        dof = 2
    limit = kalman_filter.chi2inv95[dof]
    measurements = np.asarray([det.to_xyah() for det in detections])

    motion_weight = 1 - lambda_
    for row, track in enumerate(tracks):
        dist = kf.gating_distance(
            track.mean, track.covariance, measurements, only_position,
            metric='maha')
        if gate:
            cost_matrix[row, dist > limit] = np.inf
        cost_matrix[row] = lambda_ * cost_matrix[row] + motion_weight * dist
    return cost_matrix


def center_emb_distance(tracks, detections, metric='cosine'):
    """
    Cosine distance between track and detection embeddings.

    The squeezed smoothed feature of every track and the squeezed current
    feature of every detection are L2-normalised; the cost is one minus
    their dot product.

    :param tracks: list[STrack]
    :param detections: list[BaseTrack]
    :param metric: accepted for interface compatibility; unused
    :return: cost_matrix np.ndarray of shape (len(tracks), len(detections))
    """
    # `float` replaces the `np.float` alias removed in NumPy 1.24.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix
    det_features = torch.stack(
        [track.curr_feat.squeeze() for track in detections])
    track_features = torch.stack(
        [track.smooth_feat.squeeze() for track in tracks])
    normed_det = F.normalize(det_features)
    normed_track = F.normalize(track_features)
    similarity = torch.mm(normed_track, normed_det.T)
    cost_matrix = 1 - similarity.detach().cpu().numpy()
    return cost_matrix

def recons_distance(tracks, detections, tmp=100):
    """
    Reconstruction-based distance between track and detection feature maps.

    Every spatial location of a track map is reconstructed as a
    softmax-weighted sum of the locations of a detection map and vice
    versa; the cost is the mean absolute reconstruction residual, averaged
    over both directions and divided by the maximum of each row.

    :param tracks: list[STrack]
    :param detections: list[BaseTrack]
    :param tmp: factor applied to the affinities before the softmax
    :return: cost_matrix np.ndarray of shape (len(tracks), len(detections))
    """
    # `float` replaces the `np.float` alias removed in NumPy 1.24.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix
    det_features_ = torch.stack([track.curr_feat.squeeze() for track in detections])
    track_features_ = torch.stack([track.smooth_feat for track in tracks])
    det_features = F.normalize(det_features_, dim=1)
    track_features = F.normalize(track_features_, dim=1)

    ndet, ndim, nw, nh = det_features.shape
    ntrk, _, _, _ = track_features.shape
    nloc = nw * nh

    # Run on the GPU when one is available (as before); otherwise fall back
    # to the CPU instead of failing in `.cuda()`.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    fdet = det_features.permute(0, 2, 3, 1).reshape(-1, ndim).to(device)    # ndet*nw*nh, ndim
    ftrk = track_features.permute(0, 2, 3, 1).reshape(-1, ndim).to(device)  # ntrk*nw*nh, ndim

    aff = torch.mm(ftrk, fdet.transpose(0, 1))                              # ntrk*nw*nh, ndet*nw*nh
    aff_td = F.softmax(tmp*aff, dim=1)
    aff_dt = F.softmax(tmp*aff, dim=0).transpose(0, 1)

    # reshape() is used instead of view() throughout: it yields the same
    # values but does not depend on the memory layout of its input.
    recons_ftrk = torch.einsum('tds,dsm->tdm', aff_td.reshape(ntrk*nloc, ndet, nloc),
                               fdet.reshape(ndet, nloc, ndim))              # ntrk*nw*nh, ndet, ndim
    recons_fdet = torch.einsum('dts,tsm->dtm', aff_dt.reshape(ndet*nloc, ntrk, nloc),
                               ftrk.reshape(ntrk, nloc, ndim))              # ndet*nw*nh, ntrk, ndim

    res_ftrk = (recons_ftrk.permute(0, 2, 1) - ftrk.unsqueeze(-1)).reshape(ntrk, nloc*ndim, ndet)
    res_fdet = (recons_fdet.permute(0, 2, 1) - fdet.unsqueeze(-1)).reshape(ndet, nloc*ndim, ntrk)

    cost_matrix = (torch.abs(res_ftrk).mean(1) + torch.abs(res_fdet).mean(1).transpose(0, 1)) * 0.5
    # Normalise every track row by its largest cost.
    cost_matrix = cost_matrix / cost_matrix.max(1)[0].unsqueeze(-1)
    cost_matrix = cost_matrix.detach().cpu().numpy()
    return cost_matrix


def get_track_feat(tracks, feat_flag='curr'):
    """Stack per-track features into one zero-padded tensor.

    Args:
        tracks: objects exposing ``curr_feat`` / ``smooth_feat`` tensors.
        feat_flag (str): ``'curr'`` selects ``curr_feat``, ``'smooth'``
            selects ``smooth_feat``.

    Returns:
        torch.Tensor: tensor of shape ``(len(tracks), fdim, max_len)`` on the
        device of the first feature. Each feature has its leading dimension
        squeezed, is flattened to ``(fdim, -1)`` when it has more than two
        dimensions, and is written left-aligned; the rest stays zero.

    Raises:
        NotImplementedError: for any other ``feat_flag``.
    """
    if feat_flag == 'curr':
        attr_name = 'curr_feat'
    elif feat_flag == 'smooth':
        attr_name = 'smooth_feat'
    else:
        raise NotImplementedError

    feats = [getattr(trk, attr_name).squeeze(0) for trk in tracks]

    first = feats[0]
    fdim = first.shape[0]
    if first.dim() > 2:
        feats = [feat.view(fdim, -1) for feat in feats]
    lengths = [feat.shape[1] for feat in feats]

    padded = torch.zeros(len(tracks), fdim, np.max(lengths)).to(feats[0].device)
    for idx, (feat, length) in enumerate(zip(feats, lengths)):
        padded[idx, :, :length] = feat
    return padded

def reconsdot_distance(tracks, detections, tmp=100):
    """
    Reconstruction/dot-product distance between track and detection features.

    The current features of tracks and detections (``get_track_feat``) are
    reconstructed from each other with softmax affinities; the cost is one
    minus the mean, over both directions, of the dot product between the
    normalised reconstruction and the normalised original feature.

    :param tracks: list[STrack]
    :param detections: list[BaseTrack]
    :param tmp: factor applied to the affinities before the softmax
    :return: tuple ``(cost_matrix, None)``; cost_matrix is an np.ndarray of
        shape (len(tracks), len(detections))
    """
    # `float` replaces the `np.float` alias removed in NumPy 1.24.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix, None
    det_features_ = get_track_feat(detections)
    # The current (not the smoothed) track feature is used here.
    track_features_ = get_track_feat(tracks, feat_flag='curr')

    det_features = F.normalize(det_features_, dim=1)
    track_features = F.normalize(track_features_, dim=1)

    ndet, ndim, nsd = det_features.shape
    ntrk, _, nst = track_features.shape

    fdet = det_features.permute(0, 2, 1).reshape(-1, ndim)
    ftrk = track_features.permute(0, 2, 1).reshape(-1, ndim)

    aff = torch.mm(ftrk, fdet.transpose(0, 1))
    aff_td = F.softmax(tmp*aff, dim=1)
    aff_dt = F.softmax(tmp*aff, dim=0).transpose(0, 1)

    # reshape() is used instead of view() throughout: view() on a permuted
    # einsum result only works for particular memory layouts, whereas
    # reshape() returns the same values for any layout.
    recons_ftrk = torch.einsum('tds,dsm->tdm', aff_td.reshape(ntrk*nst, ndet, nsd),
                               fdet.reshape(ndet, nsd, ndim))
    recons_fdet = torch.einsum('dts,tsm->dtm', aff_dt.reshape(ndet*nsd, ntrk, nst),
                               ftrk.reshape(ntrk, nst, ndim))

    recons_ftrk = recons_ftrk.permute(0, 2, 1).reshape((ntrk, nst*ndim, ndet))
    recons_ftrk_norm = F.normalize(recons_ftrk, dim=1)
    recons_fdet = recons_fdet.permute(0, 2, 1).reshape((ndet, nsd*ndim, ntrk))
    recons_fdet_norm = F.normalize(recons_fdet, dim=1)

    dot_td = torch.einsum('tad,ta->td', recons_ftrk_norm,
                          F.normalize(ftrk.reshape(ntrk, nst*ndim), dim=1))
    dot_dt = torch.einsum('dat,da->dt', recons_fdet_norm,
                          F.normalize(fdet.reshape(ndet, nsd*ndim), dim=1))

    cost_matrix = 1 - 0.5 * (dot_td + dot_dt.transpose(0, 1))
    cost_matrix = cost_matrix.detach().cpu().numpy()

    return cost_matrix, None


def category_gate(cost_matrix, tracks, detections):
    """
    Penalise track/detection pairs whose categories differ.

    The absolute difference between the detection category and the track
    category is added to the corresponding cost entry; the input matrix is
    not modified. An empty cost matrix is returned unchanged.

    :param cost_matrix: np.ndarray of shape (len(tracks), len(detections))
    :param tracks: list[STrack]
    :param detections: list[BaseTrack]
    :return: cost_matrix np.ndarray
    """
    if cost_matrix.size == 0:
        return cost_matrix

    trk_categories = np.array([trk.category for trk in tracks])
    det_categories = np.array([det.category for det in detections])

    # Broadcast to a (tracks, detections) matrix of category differences.
    penalty = np.abs(det_categories[None, :] - trk_categories[:, None])
    return cost_matrix + penalty


================================================
FILE: unitrack/core/motion/kalman_filter.py
================================================
# vim: expandtab:ts=4:sw=4
import numpy as np
import scipy.linalg


"""
Table for the 0.95 quantile of the chi-square distribution with N degrees of
freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv
function and used as Mahalanobis gating threshold.
"""
# Maps the number of degrees of freedom N (key) to the 0.95 quantile (value).
chi2inv95 = {
    1: 3.8415,
    2: 5.9915,
    3: 7.8147,
    4: 9.4877,
    5: 11.070,
    6: 12.592,
    7: 14.067,
    8: 15.507,
    9: 16.919}


class KalmanFilter(object):
    """
    Constant-velocity Kalman filter over image-space bounding boxes.

    The state vector has 8 components

        x, y, a, h, vx, vy, va, vh

    namely the box center (x, y), the aspect ratio a, the height h, and the
    velocity of each of these four quantities.

    Only (x, y, a, h) is observed, through a linear observation model that
    reads the first four state components directly.

    """

    def __init__(self):
        n_obs = 4
        step = 1.

        # State transition: identity, plus `step` linking every observed
        # component to its velocity (constant-velocity motion).
        transition = np.eye(2 * n_obs, 2 * n_obs)
        for k in range(n_obs):
            transition[k, n_obs + k] = step
        self._motion_mat = transition
        # Observation matrix: selects the first `n_obs` state components.
        self._update_mat = np.eye(n_obs, 2 * n_obs)

        # Noise levels are expressed relative to the current box height;
        # these two weights scale the position and velocity uncertainty.
        self._std_weight_position = 1. / 20
        self._std_weight_velocity = 1. / 160

    def initiate(self, measurement):
        """Start a new track from a measurement that matched no track.

        Parameters
        ----------
        measurement : ndarray
            Box in (x, y, a, h) form: center position (x, y), aspect ratio a
            and height h.

        Returns
        -------
        (ndarray, ndarray)
            The 8-dimensional mean and the 8x8 covariance of the new track.
            All velocity components of the mean start at 0.

        """
        mean = np.r_[measurement, np.zeros_like(measurement)]

        height = measurement[3]
        std = [
            2 * self._std_weight_position * height,
            2 * self._std_weight_position * height,
            1e-2,
            2 * self._std_weight_position * height,
            10 * self._std_weight_velocity * height,
            10 * self._std_weight_velocity * height,
            1e-5,
            10 * self._std_weight_velocity * height]
        return mean, np.diag(np.square(std))

    def predict(self, mean, covariance):
        """Advance a single state by one time step.

        Parameters
        ----------
        mean : ndarray
            8-dimensional state mean at the previous time step.
        covariance : ndarray
            8x8 state covariance at the previous time step.

        Returns
        -------
        (ndarray, ndarray)
            Mean and covariance of the predicted state.

        """
        height = mean[3]
        std_pos = [
            self._std_weight_position * height,
            self._std_weight_position * height,
            1e-2,
            self._std_weight_position * height]
        std_vel = [
            self._std_weight_velocity * height,
            self._std_weight_velocity * height,
            1e-5,
            self._std_weight_velocity * height]
        process_noise = np.diag(np.square(np.r_[std_pos, std_vel]))

        predicted_mean = np.dot(mean, self._motion_mat.T)
        propagated = np.linalg.multi_dot((
            self._motion_mat, covariance, self._motion_mat.T))
        return predicted_mean, propagated + process_noise

    def project(self, mean, covariance):
        """Map a state distribution into measurement space.

        Parameters
        ----------
        mean : ndarray
            8-dimensional state mean.
        covariance : ndarray
            8x8 state covariance.

        Returns
        -------
        (ndarray, ndarray)
            The projected 4-dimensional mean and the 4x4 projected
            covariance with the measurement noise added.

        """
        height = mean[3]
        std = [
            self._std_weight_position * height,
            self._std_weight_position * height,
            1e-1,
            self._std_weight_position * height]
        measurement_noise = np.diag(np.square(std))

        projected_mean = np.dot(self._update_mat, mean)
        projected_cov = np.linalg.multi_dot((
            self._update_mat, covariance, self._update_mat.T))
        return projected_mean, projected_cov + measurement_noise

    def multi_predict(self, mean, covariance):
        """Advance N states by one time step (batched `predict`).

        Parameters
        ----------
        mean : ndarray
            Nx8 matrix holding the state means at the previous time step.
        covariance : ndarray
            Nx8x8 array holding the state covariances at the previous time
            step.

        Returns
        -------
        (ndarray, ndarray)
            Nx8 predicted means and Nx8x8 predicted covariances.

        """
        heights = mean[:, 3]
        std_pos = [
            self._std_weight_position * heights,
            self._std_weight_position * heights,
            1e-2 * np.ones_like(heights),
            self._std_weight_position * heights]
        std_vel = [
            self._std_weight_velocity * heights,
            self._std_weight_velocity * heights,
            1e-5 * np.ones_like(heights),
            self._std_weight_velocity * heights]
        # One row of 8 variances per state.
        variances = np.square(np.r_[std_pos, std_vel]).T

        process_noise = np.asarray([np.diag(row) for row in variances])

        predicted_mean = np.dot(mean, self._motion_mat.T)
        # np.dot puts the batch axis second; move it back to the front.
        left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2))
        predicted_cov = np.dot(left, self._motion_mat.T) + process_noise

        return predicted_mean, predicted_cov

    def update(self, mean, covariance, measurement):
        """Correct a predicted state with a new measurement.

        Parameters
        ----------
        mean : ndarray
            8-dimensional predicted state mean.
        covariance : ndarray
            8x8 predicted state covariance.
        measurement : ndarray
            4-dimensional measurement (x, y, a, h): center position (x, y),
            aspect ratio a and height h of the bounding box.

        Returns
        -------
        (ndarray, ndarray)
            Mean and covariance of the measurement-corrected state.

        """
        projected_mean, projected_cov = self.project(mean, covariance)

        # Solve for the gain through a Cholesky factorisation of the
        # projected covariance rather than inverting it explicitly.
        factor, is_lower = scipy.linalg.cho_factor(
            projected_cov, lower=True, check_finite=False)
        cross_cov = np.dot(covariance, self._update_mat.T)
        kalman_gain = scipy.linalg.cho_solve(
            (factor, is_lower), cross_cov.T, check_finite=False).T
        residual = measurement - projected_mean

        corrected_mean = mean + np.dot(residual, kalman_gain.T)
        corrected_cov = covariance - np.linalg.multi_dot((
            kalman_gain, projected_cov, kalman_gain.T))
        return corrected_mean, corrected_cov

    def gating_distance(self, mean, covariance, measurements,
                        only_position=False, metric='maha'):
        """Distance between a state distribution and a set of measurements.

        For the Mahalanobis metric a threshold can be read from `chi2inv95`:
        the chi-square distribution has 4 degrees of freedom when
        `only_position` is False and 2 otherwise.

        Parameters
        ----------
        mean : ndarray
            8-dimensional state mean.
        covariance : ndarray
            8x8 state covariance.
        measurements : ndarray
            Nx4 matrix of N measurements in (x, y, a, h) form.
        only_position : Optional[bool]
            If True, only the box center (x, y) enters the distance.
        metric : str
            'maha' for the squared Mahalanobis distance, 'gaussian' for the
            plain sum of squared differences.

        Returns
        -------
        ndarray
            Array of length N whose i-th entry is the distance between
            (mean, covariance) and `measurements[i]`.

        Raises
        ------
        ValueError
            If `metric` is neither 'gaussian' nor 'maha'.

        """
        mean, covariance = self.project(mean, covariance)
        if only_position:
            mean, covariance = mean[:2], covariance[:2, :2]
            measurements = measurements[:, :2]

        delta = measurements - mean
        if metric == 'gaussian':
            return np.sum(delta * delta, axis=1)
        if metric == 'maha':
            lower_factor = np.linalg.cholesky(covariance)
            whitened = scipy.linalg.solve_triangular(
                lower_factor, delta.T, lower=True, check_finite=False,
                overwrite_b=True)
            return np.sum(whitened * whitened, axis=0)
        raise ValueError('invalid distance metric')


================================================
FILE: unitrack/core/propagation/__init__.py
================================================
###################################################################
# File Name: __init__.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon Jan 18 15:57:34 2021
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

from .propagate_box import propagate_box
from .propagate_mask import propagate_mask
from .propagate_pose import propagate_pose

def propagate(temp_feats, obs, img, model, format='box'):
    """Dispatch observation propagation to the handler for `format`.

    :param temp_feats: forwarded unchanged to the selected handler
    :param obs: observation to propagate, forwarded unchanged
    :param img: forwarded unchanged to the selected handler
    :param model: forwarded unchanged to the selected handler
    :param format: one of 'box', 'mask' or 'pose'
    :return: whatever the selected handler returns
    :raises ValueError: if `format` is not one of the supported values
    """
    if format == 'box':
        return propagate_box(temp_feats, obs, img, model)
    elif format == 'mask':
        # Masks go to the mask handler; this branch used to call
        # propagate_box by mistake.
        return propagate_mask(temp_feats, obs, img, model)
    elif format == 'pose':
        return propagate_pose(temp_feats, obs, img, model)
    else:
        raise ValueError('Observation format not supported.')


================================================
FILE: unitrack/core/propagation/propagate_box.py
================================================
###################################################################
# File Name: propagate_box.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon Jan 18 16:01:46 2021
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

def propagate_box(temp_feats, box, img, model):
    """Placeholder for box propagation: does nothing and returns None."""
    return None


================================================
FILE: unitrack/core/propagation/propagate_mask.py
================================================
###################################################################
# File Name: propagate_mask.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon Jan 18 16:01:46 2021
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

def propagate_mask(temp_feats, mask, img, model):
    """Placeholder for mask propagation: does nothing and returns None."""
    return None


================================================
FILE: unitrack/core/propagation/propagate_pose.py
================================================
###################################################################
# File Name: propagate_pose.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon Jan 18 16:01:46 2021
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

def propagate_pose(temp_feats, pose, img, model):
    """Placeholder for pose propagation: does nothing and returns None."""
    return None


================================================
FILE: unitrack/mask.py
================================================
###################################################################
# File Name: mask.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Fri Jan 29 15:16:53 2021
###################################################################

import numpy as np
import torch
import torch.nn.functional as F

from unitrack.utils.box import *
from unitrack.utils.mask import *
from .basetrack import *
from .multitracker import AssociationTracker


class MaskAssociationTracker(AssociationTracker):
    """Association tracker whose observations are instance masks.

    Embeddings are pooled from the appearance model's feature map inside
    each mask.
    """

    def __init__(self, opt):
        super(MaskAssociationTracker, self).__init__(opt)

    def extract_emb(self, img, obs):
        """Compute one embedding per observed mask.

        :param img: image tensor fed to `self.app_model`
        :param obs: numpy array of masks; it is resized to the feature-map
            resolution with nearest-neighbour interpolation
        :return: (resized masks as a float tensor on `self.opt.device`,
            list of per-mask embeddings on the CPU)
        """
        device = self.opt.device
        img = img.to(device).float()
        with torch.no_grad():
            feat = self.app_model(img)
        _, d, h, w = feat.shape

        obs = torch.from_numpy(obs).to(device).float()
        obs = F.interpolate(obs.unsqueeze(1), size=(h, w), mode='nearest')
        template_scale = np.prod(self.opt.feat_size)

        embs = []
        for ob in obs:
            obfeat = ob * feat
            scale = ob.sum()
            if not scale > 0:
                # Mask vanished after resizing: use a random embedding.
                embs.append(torch.randn(d, template_scale))
                continue

            # Shrink masks larger than the configured area budget.
            scale_factor = 1
            if scale > self.opt.max_mask_area:
                scale_factor = np.sqrt(self.opt.max_mask_area / scale.item())

            norm_obfeat = F.interpolate(
                obfeat, scale_factor=scale_factor, mode='bilinear')
            norm_mask = F.interpolate(
                ob.unsqueeze(1), scale_factor=scale_factor, mode='nearest')
            inside = norm_mask.squeeze(0).squeeze(0).ge(0.5)
            # Keep only the feature vectors at positions inside the mask.
            embs.append(norm_obfeat[:, :, inside].cpu())
        return obs, embs

    def prepare_obs(self, img, img0, obs, embs=None):
        """Step 1: network forward, get detections & embeddings.

        :param img: image tensor forwarded to `extract_emb`
        :param img0: unused here
        :param obs: array of observed masks; first axis indexes observations
        :param embs: ignored; embeddings are always recomputed from `img`
        :return: list of STrack detections (empty when `obs` is empty)
        """
        if not obs.shape[0] > 0:
            return []

        masks, embs = self.extract_emb(img, obs)
        boxes = mask2box(masks)
        keep_idx = remove_duplicated_box(boxes, iou_th=0.7)

        boxes = boxes[keep_idx]
        masks = masks[keep_idx]
        obs = obs[keep_idx]
        embs = [embs[k] for k in keep_idx]

        detections = []
        for tlbrs, mask, f in zip(boxes, obs, embs):
            detections.append(
                STrack(tlbr_to_tlwh(tlbrs), 1, f, self.buffer_size, mask,
                       ac=True))
        return detections


================================================
FILE: unitrack/mask_with_train_embs.py
================================================
###################################################################
# File Name: mask_with_train_embs.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Fri Jan 29 15:16:53 2021
###################################################################
import time

import numpy as np
import torch
import torch.nn.functional as F

from unitrack.utils.box import *
from unitrack.utils.mask import *
from .basetrack import *

from unitrack.model import AppearanceModel

class AssociationTrackerWithTrainedEmbed(object):
    """Association tracker that consumes externally supplied embeddings.

    No appearance model is built here: embeddings are handed to `update`
    and forwarded to `prepare_obs`, which subclasses must implement.
    """

    def __init__(self, opt):
        """Set up empty track pools from the options object.

        :param opt: options object; this class reads `conf_thres`,
            `track_buffer`, `asso_with_motion`, `use_kalman`,
            `motion_lambda`, `motion_gated`, `confirm_iou_thres` and
            `dup_iou_thres` from it
        """
        self.opt = opt
        self.tracked_stracks = []  # type: list[STrack]
        self.lost_stracks = []  # type: list[STrack]
        self.removed_stracks = []  # type: list[STrack]

        self.frame_id = 0
        self.det_thresh = opt.conf_thres
        self.buffer_size = opt.track_buffer
        self.max_time_lost = self.buffer_size

        self.kalman_filter = KalmanFilter()

        # Without motion cues, override the motion weighting options.
        if not self.opt.asso_with_motion:
            self.opt.motion_lambda = 1
            self.opt.motion_gated = False

    def extract_emb(self, img, obs):
        """Compute embeddings for observations; subclasses must override."""
        raise NotImplementedError

    def prepare_obs(self, img, img0, obs, embs=None):
        """Turn raw observations into detections; subclasses must override."""
        raise NotImplementedError

    def update(self, img, img0, obs, embs=None):
        """Process one frame and return the currently active tracks.

        :param img: image forwarded to `prepare_obs`
        :param img0: second image argument forwarded to `prepare_obs`
        :param obs: observations of this frame; `obs.shape[1] == 6` turns
            on category gating
        :param embs: embeddings for `obs`, forwarded to `prepare_obs`
        :return: list of tracked STracks whose `is_activated` flag is set
        """
        torch.cuda.empty_cache()
        self.frame_id += 1
        activated_stracks = []
        refind_stracks = []
        lost_stracks = []
        removed_stracks = []

        # Step 1: build detections. The caller's embeddings must be passed
        # through; this call used to hard-code embs=None and drop them.
        detections = self.prepare_obs(img, img0, obs, embs=embs)

        # Split existing tracks into confirmed and not-yet-activated ones.
        unconfirmed = []
        tracked_stracks = []  # type: list[STrack]
        for track in self.tracked_stracks:
            if not track.is_activated:
                unconfirmed.append(track)
            else:
                tracked_stracks.append(track)

        # Step 2: first association, with embedding.
        tracks = joint_stracks(tracked_stracks, self.lost_stracks)
        dists, _ = matching.center_emb_distance(tracks, detections)
        if self.opt.use_kalman:
            # Predict the current location with KF
            STrack.multi_predict(tracks)
            dists = matching.fuse_motion(self.kalman_filter, dists, tracks, detections,
                                         lambda_=self.opt.motion_lambda, gate=self.opt.motion_gated)
        if obs.shape[1] == 6:
            dists = matching.category_gate(dists, tracks, detections)
        matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7)

        for itracked, idet in matches:
            track = tracks[itracked]
            det = detections[idet]
            if track.state == TrackState.Tracked:
                track.update(detections[idet], self.frame_id)
                activated_stracks.append(track)
            else:
                track.re_activate(det, self.frame_id, new_id=False)
                refind_stracks.append(track)

        if self.opt.use_kalman:
            # (optional) Step 3: second association, with IOU.
            tracks = [tracks[i] for i in u_track if tracks[i].state == TrackState.Tracked]
            detections = [detections[i] for i in u_detection]
            dists = matching.iou_distance(tracks, detections)
            matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5)

            for itracked, idet in matches:
                track = tracks[itracked]
                det = detections[idet]
                if track.state == TrackState.Tracked:
                    track.update(det, self.frame_id)
                    activated_stracks.append(track)
                else:
                    track.re_activate(det, self.frame_id, new_id=False)
                    refind_stracks.append(track)

            # Deal with unconfirmed tracks, usually tracks with only one
            # beginning frame.
            detections = [detections[i] for i in u_detection]
            dists = matching.iou_distance(unconfirmed, detections)
            matches, u_unconfirmed, u_detection = matching.linear_assignment(
                dists, thresh=self.opt.confirm_iou_thres)
            for itracked, idet in matches:
                unconfirmed[itracked].update(detections[idet], self.frame_id)
                activated_stracks.append(unconfirmed[itracked])
            for it in u_unconfirmed:
                track = unconfirmed[it]
                track.mark_removed()
                removed_stracks.append(track)

        for it in u_track:
            track = tracks[it]
            if not track.state == TrackState.Lost:
                track.mark_lost()
                lost_stracks.append(track)

        # Step 4: init new stracks from confident unmatched detections.
        for inew in u_detection:
            track = detections[inew]
            if track.score < self.det_thresh:
                continue
            track.activate(self.kalman_filter, self.frame_id)
            activated_stracks.append(track)

        # Step 5: update state; drop tracks that stayed lost for too long.
        for track in self.lost_stracks:
            if self.frame_id - track.end_frame > self.max_time_lost:
                track.mark_removed()
                removed_stracks.append(track)

        self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked]
        self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_stracks)
        self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks)
        self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks)
        self.lost_stracks.extend(lost_stracks)
        self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks)
        self.removed_stracks.extend(removed_stracks)
        self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(
            self.tracked_stracks, self.lost_stracks, ioudist=self.opt.dup_iou_thres)

        output_stracks = [track for track in self.tracked_stracks if track.is_activated]

        return output_stracks

    def reset_all(self, ):
        """Forget every track and restart frame counting at 0."""
        self.tracked_stracks = []  # type: list[STrack]
        self.lost_stracks = []  # type: list[STrack]
        self.removed_stracks = []  # type: list[STrack]
        self.frame_id = 0


class MaskAssociationTracker(AssociationTrackerWithTrainedEmbed):
    """Mask tracker that uses precomputed per-observation embeddings."""

    def __init__(self, opt):
        super(MaskAssociationTracker, self).__init__(opt)

    def extract_emb(self, img, obs, embs):
        """Package precomputed embeddings, one tensor per observation.

        :param img: unused; kept so the call signature stays unchanged
        :param obs: tensor of observed masks
        :param embs: tensor of embeddings; the first axis is iterated, one
            entry per observation
        :return: (`obs` as a float tensor on `self.opt.device`, list of
            float CPU tensors, each an entry of `embs` with a trailing
            singleton axis added)
        """
        obs = obs.to(self.opt.device).float()
        embs = embs.to(self.opt.device).float().unsqueeze(-1)
        embs_list = [emb.cpu() for emb in embs]
        return obs, embs_list

    def prepare_obs(self, img, img0, obs, embs=None):
        """Step 1: build detections from masks and their embeddings.

        :param img: forwarded to `extract_emb`
        :param img0: unused here
        :param obs: tensor of observed masks; first axis indexes observations
        :param embs: embeddings for `obs`; required when `obs` is non-empty
        :return: list of STrack detections (empty when `obs` is empty)
        :raises ValueError: if `obs` is non-empty and `embs` is None
        """
        if not obs.shape[0] > 0:
            return []
        if embs is None:
            # This tracker has no appearance model to fall back on; without
            # this check the code failed later on an unbound `masks`.
            raise ValueError(
                'embs must be provided when there are observations to track.')

        masks, embs = self.extract_emb(img, obs, embs)
        boxes = mask2box(masks)
        keep_idx = remove_duplicated_box(boxes, iou_th=0.7)
        boxes, masks, obs = boxes[keep_idx], masks[keep_idx], obs[keep_idx]
        embs = [embs[k] for k in keep_idx]
        detections = [STrack(tlbr_to_tlwh(tlbrs), 1, f, self.buffer_size, mask, ac=True)
                      for (tlbrs, mask, f) in zip(boxes, obs, embs)]
        return detections


================================================
FILE: unitrack/model/__init__.py
================================================
###################################################################
# File Name: __init__.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Thu Dec 24 14:24:44 2020
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

from .model import *
from .resnet import *


================================================
FILE: unitrack/model/functional.py
================================================
###################################################################
# File Name: functional.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon Jun 21 21:04:09 2021
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import


import torch
import torch.nn as nn
import torch.nn.functional as F

def hard_prop(pred):
    """Harden `pred` in place into a normalised indicator of its maxima.

    Along dim 0, entries equal to the maximum become 1 and all others 0;
    the result is then divided by its sum over dim 0, so ties share the
    weight equally.

    The maximum mask is computed once, before anything is overwritten.
    The earlier two-step masking compared freshly written zeros against the
    maximum, which marked every entry when that maximum was negative.

    :param pred: floating-point tensor; modified in place
    :return: the same tensor object as `pred`
    """
    is_max = pred >= pred.max(dim=0)[0]
    pred.copy_(is_max)
    pred /= pred.sum(0)[None]
    return pred

def context_index_bank(n_context, long_mem, N):
    '''
    Build, for each of the N target frames, the indices of its source frames.

    Returns a list of long tensors: one (N, 1) tensor per entry of
    `long_mem` ("long term" context), followed by a single (N, n_context)
    tensor whose row i holds i, i+1, ..., i+n_context-1 ("short" context).
    '''
    long_term = []
    for t in long_mem:
        assert 0 <= t < N, 'context frame out of bounds'
        frame_idx = torch.zeros(N, 1).long()
        if t > 0:
            # Point at t + n_context + 1, except for the leading rows,
            # which stay at 0.
            frame_idx += t + (n_context+1)
            frame_idx[:n_context+t+1] = 0
        long_term.append(frame_idx)

    # Sliding window: a column of start indices plus a row of offsets.
    window_offsets = torch.arange(n_context)[None, :]
    window_starts = torch.arange(N)[:, None]
    short_term = window_offsets + window_starts
    return long_term + [short_term]


def mem_efficient_batched_affinity(
        query, keys, mask, temperature, topk, long_mem, device):
    '''
    Top-k affinity between keys and queries, computed in small chunks to
    limit peak memory.

    Dim 2 of `keys`/`query` is processed 10 entries at a time and the last
    dim of `query` 100 entries at a time. Returns two lists with one CPU
    tensor per entry of dim 2: the softmax-normalised top-k weights and the
    matching indices into the flattened source axis.
    '''
    frame_chunk, pixel_chunk = 10, 100
    n_long = len(long_mem)
    all_weights, all_ids = [], []

    for start in range(0, keys.shape[2], frame_chunk):
        stop = start + frame_chunk
        k_part = keys[:, :, start:stop].to(device)
        q_part = query[:, :, start:stop].to(device)
        chunk_weights, chunk_ids = [], []

        for lo in range(0, k_part.shape[-1], pixel_chunk):
            hi = lo + pixel_chunk
            aff = torch.einsum('ijklm,ijkn->iklmn', k_part, q_part[..., lo:hi])
            # The mask is added to every context slot after the first
            # len(long_mem) ones.
            aff[0, :, n_long:] += mask[..., lo:hi].to(device)

            _, n_frames, n_ctx, n_src, n_tgt = aff.shape
            aff = aff.view(n_frames, n_ctx*n_src, n_tgt)
            aff /= temperature

            top_vals, top_ids = torch.topk(aff, topk, dim=-2)
            chunk_weights.append(F.softmax(top_vals, dim=-2).cpu())
            chunk_ids.append(top_ids.cpu())

        # Stitch the pixel chunks back together, then split per frame.
        all_weights.extend(torch.cat(chunk_weights, dim=-1))
        all_ids.extend(torch.cat(chunk_ids, dim=-1))

    return all_weights, all_ids

def batched_affinity(query, keys, mask, temperature, topk, long_mem, device):
    '''
    Mini-batched computation of affinity, for memory efficiency
    (less aggressively mini-batched)

    Per the einsum subscripts, `keys` is 6-D and `query` is 5-D, sharing
    their first three dims, with spatial dims left unflattened. Dim 0 must
    have size 1. Dim 2 is processed two entries at a time.

    Returns two lists with one tensor per entry of dim 2: the
    softmax-normalised top-k weights and the matching indices into the
    flattened (context, source-position) axis.

    The 7-D einsum result was previously unpacked into five names, which
    always raised ValueError, and the temperature was applied twice. It is
    now applied once, after the mask is added, matching
    mem_efficient_batched_affinity.
    '''
    bsize = 2
    Ws, Is = [], []
    for b in range(0, keys.shape[2], bsize):
        _k, _q = keys[:, :, b:b+bsize].to(device), query[:, :, b:b+bsize].to(device)

        A = torch.einsum('ijklmn,ijkop->iklmnop', _k, _q)

        # The mask is added to every context slot after the first
        # len(long_mem) ones.
        A[0, :, len(long_mem):] += mask.to(device)

        # Collapse to (N, context * source positions, target positions).
        _, N, T, h1, w1, h, w = A.shape
        A = A.reshape(N, T*h1*w1, h*w)
        A /= temperature

        weights, ids = torch.topk(A, topk, dim=-2)
        weights = F.softmax(weights, dim=-2)

        Ws += [w_ for w_ in weights]
        Is += [ii for ii in ids]

    return Ws, Is

def process_pose(pred, lbl_set, topk=3):
    """Turn per-pixel label scores into one coordinate per label.

    The first channel of `pred` is dropped. For every remaining channel the
    `topk` highest-scoring pixels are averaged, weighted by their normalised
    scores, into an (x, y) coordinate. Channels whose scores sum to zero
    get the coordinate (-1, -1).

    :param pred: score tensor; the code treats it as (rows, cols, channels)
    :param lbl_set: indexable of label values; `lbl_set[t + 1]` is painted
        at the coordinate found for channel t
    :param topk: maximum number of pixels averaged per channel
    :return: (coordinates as a CPU tensor with x in row 0 and y in row 1,
        a (rows, cols, 3) numpy array with `lbl_set` values written at the
        found coordinates)
    """
    # This file does not import numpy at module level, so the np.zeros call
    # below used to raise NameError; import it locally.
    import numpy as np

    # generate the coordinates:
    pred = pred[..., 1:]
    flatlbls = pred.flatten(0, 1)
    topk = min(flatlbls.shape[0], topk)

    vals, ids = torch.topk(flatlbls, k=topk, dim=0)
    vals /= vals.sum(0)[None]
    # Recover column (x) and row (y) from the flattened pixel index.
    xx, yy = ids % pred.shape[1], ids // pred.shape[1]

    current_coord = torch.stack([(xx * vals).sum(0), (yy * vals).sum(0)], dim=0)
    current_coord[:, flatlbls.sum(0) == 0] = -1

    pred_val_sharp = np.zeros((*pred.shape[:2], 3))

    for t in range(len(lbl_set) - 1):
        x = int(current_coord[0, t])
        y = int(current_coord[1, t])

        # Negative coordinates mark channels with no score mass.
        if x >= 0 and y >= 0:
            pred_val_sharp[y, x, :] = lbl_set[t + 1]

    return current_coord.cpu(), pred_val_sharp

class MaskedAttention(nn.Module):
    '''
    A module that implements masked attention based on spatial locality
    TODO implement in a more efficient way (torch sparse or correlation filter)

    Masks are cached in `self.masks` under the key 'H-W' built from the
    sizes passed by the caller.
    '''
    def __init__(self, radius, flat=True):
        """
        :param radius: grid positions closer than this distance are kept
        :param flat: if True, H and W are treated as flattened square maps
            (side length int(H**0.5)) and the mask is returned flattened
        """
        super(MaskedAttention, self).__init__()
        self.radius = radius
        self.flat = flat
        self.masks = {}
        # Stored under a private name: an instance attribute called `index`
        # used to shadow the index() method and make it uncallable.
        self._index_cache = {}

    def mask(self, H, W):
        """Return the cached locality mask for (H, W), building it if needed."""
        key = '%s-%s' % (H, W)
        if key not in self.masks:
            self.make(H, W)
        return self.masks[key]

    def index(self, H, W):
        """Return the cached flat indices of the non-zero mask entries."""
        key = '%s-%s' % (H, W)
        if key not in self._index_cache:
            self.make_index(H, W)
        return self._index_cache[key]

    def make(self, H, W):
        """Build, cache and return the locality mask for (H, W).

        The result is a float tensor with a leading axis of size 1. It is
        1 where two grid positions are closer than `self.radius` and 0
        elsewhere.
        """
        # Cache under the sizes the caller passed. The key used to be built
        # after the square root below, so mask() could not find the entry.
        key = '%s-%s' % (H, W)
        if self.flat:
            H = int(H**0.5)
            W = int(W**0.5)

        gx, gy = torch.meshgrid(torch.arange(0, H), torch.arange(0, W))
        # Pairwise Euclidean distance between all grid positions.
        D = ( (gx[None, None, :, :] - gx[:, :, None, None])**2 + (gy[None, None, :, :] - gy[:, :, None, None])**2 ).float() ** 0.5
        D = (D < self.radius)[None].float()

        if self.flat:
            D = self.flatten(D)
        self.masks[key] = D

        return D

    def flatten(self, D):
        """Merge dims 1-2 and the last two dims of `D`."""
        return torch.flatten(torch.flatten(D, 1, 2), -2, -1)

    def make_index(self, H, W, pad=False):
        """Build, cache and return the flat indices of non-zero mask entries.

        `pad` is accepted for compatibility and is not used.
        """
        # Boolean indexing; uint8 masks are deprecated as indices.
        mask = self.mask(H, W).reshape(1, -1).bool()
        idx = torch.arange(0, mask.numel(), device=mask.device)[mask[0]][None]

        self._index_cache['%s-%s' % (H, W)] = idx

        return idx

    def forward(self, x):
        """Multiply `x` by the locality mask matching its last two dims."""
        H, W = x.shape[-2:]
        sid = '%s-%s' % (H, W)
        if sid not in self.masks:
            self.masks[sid] = self.make(H, W).to(x.device)
        mask = self.masks[sid]
        if mask.device != x.device:
            # The mask may have been cached on another device by mask().
            mask = mask.to(x.device)
            self.masks[sid] = mask
        return x * mask[0]


================================================
FILE: unitrack/model/hrnet.py
================================================
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# Modified by Ke Sun (sunk@mail.ustc.edu.cn)
# Modified by Zhongdao Wang(wcd17@mails.tsinghua.edu.cn)
# ------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import pdb
import logging
import functools

import numpy as np

import torch
import torch.nn as nn
import torch._utils
import torch.nn.functional as F

# Value passed as the `momentum` argument of the BatchNorm2d layers below.
BN_MOMENTUM = 0.1
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with padding 1."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, each followed by BN."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """
        :param inplanes: number of input channels
        :param planes: number of output channels
        :param stride: stride of the first convolution
        :param downsample: optional module applied to the input to form the
            shortcut branch; the input itself is used when None
        """
        super(BasicBlock, self).__init__()
        # Sub-modules are created in the same order as before.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run conv-BN-ReLU-conv-BN, add the shortcut, apply a final ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution path.

    The final 1x1 convolution widens the output to ``planes * expansion``
    channels (``expansion`` is 4). ``stride`` is applied by the 3x3
    convolution.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        out_planes = planes * self.expansion
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        # Optional module applied to the input to form the shortcut.
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(conv_path(x) + shortcut(x))``."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.relu(self.bn2(self.conv2(main)))
        main = self.bn3(self.conv3(main))

        # Identity shortcut unless a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        main += shortcut
        return self.relu(main)


class HighResolutionModule(nn.Module):
    """Multi-branch module: per-branch block stacks followed by cross-branch fusion.

    Each branch is processed by its own sequence of residual blocks; the
    branch outputs are then brought to a common shape and summed (see
    ``forward``). ``fuse_method`` is stored but not read anywhere in this
    class.

    Args:
        num_branches: number of parallel branches.
        blocks: residual block class used in every branch (must expose
            ``expansion``).
        num_blocks: per-branch number of blocks.
        num_inchannels: per-branch input channel counts. NOTE: this list is
            updated in place by ``_make_one_branch``.
        num_channels: per-branch ``planes`` argument for the blocks.
        fuse_method: stored on the instance only.
        multi_scale_output: when False only the output for branch 0 is
            produced.
    """

    def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
                 num_channels, fuse_method, multi_scale_output=True):
        super(HighResolutionModule, self).__init__()
        self._check_branches(
            num_branches, blocks, num_blocks, num_inchannels, num_channels)

        self.num_inchannels = num_inchannels
        self.fuse_method = fuse_method
        self.num_branches = num_branches

        self.multi_scale_output = multi_scale_output

        # Branches must be built first: _make_one_branch rewrites
        # self.num_inchannels, which _make_fuse_layers then reads.
        self.branches = self._make_branches(
            num_branches, blocks, num_blocks, num_channels)
        self.fuse_layers = self._make_fuse_layers()
        self.relu = nn.ReLU(False)

    def _check_branches(self, num_branches, blocks, num_blocks,
                        num_inchannels, num_channels):
        """Raise ValueError unless every per-branch list has ``num_branches`` entries."""
        if num_branches != len(num_blocks):
            error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
                num_branches, len(num_blocks))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_channels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
                num_branches, len(num_channels))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_inchannels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
                num_branches, len(num_inchannels))
            logger.error(error_msg)
            raise ValueError(error_msg)

    def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
                         stride=1):
        """Build the block stack for one branch as an ``nn.Sequential``.

        Side effect: ``self.num_inchannels[branch_index]`` is overwritten with
        the branch's output channel count.
        """
        downsample = None
        # A 1x1 conv + BN shortcut is needed whenever the first block changes
        # the stride or the channel count.
        if stride != 1 or \
           self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.num_inchannels[branch_index],
                          num_channels[branch_index] * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(num_channels[branch_index] * block.expansion,
                            momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(block(self.num_inchannels[branch_index],
                            num_channels[branch_index], stride, downsample))
        # Remaining blocks take the first block's output width as input.
        self.num_inchannels[branch_index] = \
            num_channels[branch_index] * block.expansion
        for i in range(1, num_blocks[branch_index]):
            layers.append(block(self.num_inchannels[branch_index],
                                num_channels[branch_index]))

        return nn.Sequential(*layers)

    def _make_branches(self, num_branches, block, num_blocks, num_channels):
        """Return an ``nn.ModuleList`` holding one block stack per branch."""
        branches = []

        for i in range(num_branches):
            branches.append(
                self._make_one_branch(i, block, num_blocks, num_channels))

        return nn.ModuleList(branches)

    def _make_fuse_layers(self):
        """Build the layers that map branch ``j`` onto output branch ``i``.

        Returns None for a single branch. Otherwise ``fuse_layers[i][j]`` is:
        a 1x1 conv + BN + nearest upsample by ``2**(j-i)`` when ``j > i``;
        None when ``j == i``; a chain of ``i-j`` stride-2 3x3 conv + BN steps
        when ``j < i`` (only the last step changes the channel count, and it
        has no ReLU).
        """
        if self.num_branches == 1:
            return None

        num_branches = self.num_branches
        num_inchannels = self.num_inchannels
        fuse_layers = []
        # Without multi-scale output only the row for branch 0 is built.
        for i in range(num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    fuse_layer.append(nn.Sequential(
                        nn.Conv2d(num_inchannels[j],
                                  num_inchannels[i],
                                  1,
                                  1,
                                  0,
                                  bias=False),
                        nn.BatchNorm2d(num_inchannels[i],
                                       momentum=BN_MOMENTUM),
                        nn.Upsample(scale_factor=2**(j-i), mode='nearest')))
                elif j == i:
                    fuse_layer.append(None)
                else:
                    conv3x3s = []
                    for k in range(i-j):
                        if k == i - j - 1:
                            # Last step: switch to the target branch width.
                            num_outchannels_conv3x3 = num_inchannels[i]
                            conv3x3s.append(nn.Sequential(
                                nn.Conv2d(num_inchannels[j],
                                          num_outchannels_conv3x3,
                                          3, 2, 1, bias=False),
                                nn.BatchNorm2d(num_outchannels_conv3x3,
                                            momentum=BN_MOMENTUM)))
                        else:
                            # Intermediate step: keep the source branch width.
                            num_outchannels_conv3x3 = num_inchannels[j]
                            conv3x3s.append(nn.Sequential(
                                nn.Conv2d(num_inchannels[j],
                                          num_outchannels_conv3x3,
                                          3, 2, 1, bias=False),
                                nn.BatchNorm2d(num_outchannels_conv3x3,
                                            momentum=BN_MOMENTUM),
                                nn.ReLU(False)))
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))

        return nn.ModuleList(fuse_layers)

    def get_num_inchannels(self):
        """Return the per-branch channel list (rewritten by ``_make_one_branch``)."""
        return self.num_inchannels

    def forward(self, x):
        """Run every branch, then fuse; returns a list of tensors.

        ``x`` is a list with one tensor per branch and is overwritten in place
        with the branch outputs.
        """
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        for i in range(self.num_branches):
            x[i] = self.branches[i](x[i])

        x_fuse = []
        for i in range(len(self.fuse_layers)):
            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
            for j in range(1, self.num_branches):
                if i == j:
                    y = y + x[j]
                else:
                    fused = self.fuse_layers[i][j](x[j])
                    fh, fw = fused.shape[-2:]
                    yh, yw = y.shape[-2:]
                    # Crop the fused map when it is larger than the
                    # accumulator. ``-(d)//2`` parses as ``(-d)//2`` (floor),
                    # so start and end together remove exactly ``d`` rows/cols.
                    if fh > yh:
                        fused = fused[:,:,(fh-yh)//2:-(fh-yh)//2,:]
                    if fw > yw:
                        fused = fused[:,:,:,(fw-yw)//2:-(fw-yw)//2]
                    y = y + fused
            x_fuse.append(self.relu(y))

        return x_fuse


# Maps the 'BLOCK' string of a stage config to the residual block class.
blocks_dict = dict(BASIC=BasicBlock, BOTTLENECK=Bottleneck)


class HighResolutionNet(nn.Module):
    """HRNet backbone that returns a single feature map from its head.

    The network is a two-convolution stem, ``layer1`` and three multi-branch
    stages (``stage2``-``stage4``) connected by transition layers. The head
    widens every branch (``incre_modules``) and accumulates them from high to
    low resolution (``downsamp_modules``). ``forward`` returns the accumulated
    map selected by ``cfg['MODEL']['RETURN_STAGE']``.

    ``final_layer`` and ``classifier`` are constructed but never applied in
    ``forward``.

    Args:
        cfg: nested dict with ``cfg['MODEL']['EXTRA']['STAGE1'..'STAGE4']``
            stage descriptions and ``cfg['MODEL']['RETURN_STAGE']``.
        **kwargs: accepted and ignored.
    """

    def __init__(self, cfg, **kwargs):
        super(HighResolutionNet, self).__init__()
        self.cfg = cfg

        # Stem: two stride-2 3x3 convolutions.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)

        self.stage1_cfg = cfg['MODEL']['EXTRA']['STAGE1']
        num_channels = self.stage1_cfg['NUM_CHANNELS'][0]
        block = blocks_dict[self.stage1_cfg['BLOCK']]
        num_blocks = self.stage1_cfg['NUM_BLOCKS'][0]
        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)
        stage1_out_channel = block.expansion*num_channels

        self.stage2_cfg = cfg['MODEL']['EXTRA']['STAGE2']
        num_channels = self.stage2_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage2_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition1 = self._make_transition_layer(
            [stage1_out_channel], num_channels)
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels)

        self.stage3_cfg = cfg['MODEL']['EXTRA']['STAGE3']
        num_channels = self.stage3_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage3_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition2 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)

        self.stage4_cfg = cfg['MODEL']['EXTRA']['STAGE4']
        num_channels = self.stage4_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage4_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition3 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=True)

        # Classification Head
        self.incre_modules, self.downsamp_modules, \
            self.final_layer = self._make_head(pre_stage_channels)

        self.classifier = nn.Linear(2048, 1000)

    def _make_head(self, pre_stage_channels):
        """Build the head modules.

        Returns:
            ``(incre_modules, downsamp_modules, final_layer)`` where
            ``incre_modules[i]`` widens branch ``i`` with one Bottleneck,
            ``downsamp_modules[i]`` is a stride-2 3x3 conv + BN + ReLU from
            head level ``i`` to ``i+1``, and ``final_layer`` is a 1x1 conv +
            BN + ReLU to 2048 channels.
        """
        head_block = Bottleneck
        head_channels = [32, 64, 128, 256]

        # Increasing the #channels on each resolution
        # from C, 2C, 4C, 8C to 128, 256, 512, 1024
        incre_modules = []
        for i, channels in enumerate(pre_stage_channels):
            incre_module = self._make_layer(head_block,
                                            channels,
                                            head_channels[i],
                                            1,
                                            stride=1)
            incre_modules.append(incre_module)
        incre_modules = nn.ModuleList(incre_modules)

        # downsampling modules
        downsamp_modules = []
        for i in range(len(pre_stage_channels)-1):
            in_channels = head_channels[i] * head_block.expansion
            out_channels = head_channels[i+1] * head_block.expansion

            downsamp_module = nn.Sequential(
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=3,
                          stride=2,
                          padding=1),
                nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM),
                nn.ReLU(inplace=True)
            )

            downsamp_modules.append(downsamp_module)
        downsamp_modules = nn.ModuleList(downsamp_modules)

        final_layer = nn.Sequential(
            nn.Conv2d(
                in_channels=head_channels[3] * head_block.expansion,
                out_channels=2048,
                kernel_size=1,
                stride=1,
                padding=0
            ),
            nn.BatchNorm2d(2048, momentum=BN_MOMENTUM),
            nn.ReLU(inplace=True)
        )

        return incre_modules, downsamp_modules, final_layer

    def _make_transition_layer(
            self, num_channels_pre_layer, num_channels_cur_layer):
        """Build the layers that adapt one stage's branches to the next stage.

        For an existing branch the entry is None when the channel count is
        unchanged, otherwise a stride-1 3x3 conv + BN + ReLU. Every new branch
        gets a chain of stride-2 3x3 conv + BN + ReLU steps whose input width
        is the last previous branch's width.
        """
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(nn.Sequential(
                        nn.Conv2d(num_channels_pre_layer[i],
                                  num_channels_cur_layer[i],
                                  3,
                                  1,
                                  1,
                                  bias=False),
                        nn.BatchNorm2d(
                            num_channels_cur_layer[i], momentum=BN_MOMENTUM),
                        nn.ReLU(inplace=True)))
                else:
                    transition_layers.append(None)
            else:
                conv3x3s = []
                for j in range(i+1-num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    # Only the last step switches to the new branch width.
                    outchannels = num_channels_cur_layer[i] \
                        if j == i-num_branches_pre else inchannels
                    conv3x3s.append(nn.Sequential(
                        nn.Conv2d(
                            inchannels, outchannels, 3, 2, 1, bias=False),
                        nn.BatchNorm2d(outchannels, momentum=BN_MOMENTUM),
                        nn.ReLU(inplace=True)))
                transition_layers.append(nn.Sequential(*conv3x3s))

        return nn.ModuleList(transition_layers)

    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
        """Stack ``blocks`` instances of ``block`` into an ``nn.Sequential``.

        The first block gets a 1x1 conv + BN shortcut when it changes the
        stride or the channel count.
        """
        downsample = None
        if stride != 1 or inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(block(inplanes, planes, stride, downsample))
        inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(inplanes, planes))

        return nn.Sequential(*layers)

    def _make_stage(self, layer_config, num_inchannels,
                    multi_scale_output=True):
        """Build one stage as a sequence of ``HighResolutionModule``s.

        Returns:
            ``(stage, num_inchannels)`` where ``num_inchannels`` is the
            per-branch channel list reported by the last module.
        """
        num_modules = layer_config['NUM_MODULES']
        num_branches = layer_config['NUM_BRANCHES']
        num_blocks = layer_config['NUM_BLOCKS']
        num_channels = layer_config['NUM_CHANNELS']
        block = blocks_dict[layer_config['BLOCK']]
        fuse_method = layer_config['FUSE_METHOD']

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True

            modules.append(
                HighResolutionModule(num_branches,
                                      block,
                                      num_blocks,
                                      num_inchannels,
                                      num_channels,
                                      fuse_method,
                                      reset_multi_scale_output)
            )
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

    @staticmethod
    def _apply_transition(transition, num_branches, y_list):
        """Map the previous stage's outputs onto the next stage's inputs.

        An existing branch ``i`` is fed from ``y_list[i]`` (unchanged when its
        transition entry is None); a newly created branch is fed from the last
        previous branch, ``y_list[-1]``.
        """
        x_list = []
        for i in range(num_branches):
            layer = transition[i]
            if layer is None:
                x_list.append(y_list[i])
            elif i < len(y_list):
                # Fix: an existing branch must be transformed from its own
                # feature map, not from the lowest-resolution one.
                x_list.append(layer(y_list[i]))
            else:
                x_list.append(layer(y_list[-1]))
        return x_list

    def forward(self, x):
        """Return the head feature map selected by ``RETURN_STAGE``.

        The selected map is resized with bilinear interpolation to the
        spatial size of head level 1 before being returned.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.layer1(x)

        # layer1 yields a single tensor, i.e. one previous branch.
        x_list = self._apply_transition(
            self.transition1, self.stage2_cfg['NUM_BRANCHES'], [x])
        y_list = self.stage2(x_list)

        x_list = self._apply_transition(
            self.transition2, self.stage3_cfg['NUM_BRANCHES'], y_list)
        y_list = self.stage3(x_list)

        x_list = self._apply_transition(
            self.transition3, self.stage4_cfg['NUM_BRANCHES'], y_list)
        y_list = self.stage4(x_list)

        # Classification Head: widen each branch and accumulate from the
        # highest resolution downwards.
        y_list_out = {}
        y_list_out[0] = self.incre_modules[0](y_list[0])
        for i in range(len(self.downsamp_modules)):
            y_list_out[i+1] = self.incre_modules[i+1](y_list[i+1]) + \
                        self.downsamp_modules[i](y_list_out[i])

        # self.final_layer is deliberately not applied here.

        ret = y_list_out[self.cfg['MODEL']['RETURN_STAGE']]

        ret_size = y_list_out[1].shape[-2:]
        ret = F.interpolate(ret, ret_size, mode='bilinear')
        return ret

    def init_weights(self, pretrained='',):
        """Initialise the weights, then optionally overlay a checkpoint.

        Conv weights use Kaiming-normal initialisation and BatchNorm layers
        are set to weight 1 / bias 0. If ``pretrained`` is an existing file,
        every checkpoint entry whose key also exists in this model is loaded;
        other entries are ignored.
        """
        print('=> init weights from normal distribution')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(
                    m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
        if os.path.isfile(pretrained):
            # Load onto the CPU so a GPU-saved checkpoint also works on a
            # host without CUDA; load_state_dict copies into the parameters.
            pretrained_dict = torch.load(pretrained, map_location='cpu')
            print('=> loading pretrained model {}'.format(pretrained))
            model_dict = self.state_dict()
            pretrained_dict = {k: v for k, v in pretrained_dict.items()
                               if k in model_dict.keys()}
            for k, _ in pretrained_dict.items():
                print(
                    '=> loading {} pretrained model {}'.format(k, pretrained))
            model_dict.update(pretrained_dict)
            self.load_state_dict(model_dict)


def _hrnet_stage(num_modules, block, num_channels):
    """Return one stage description; every branch uses four blocks."""
    num_branches = len(num_channels)
    return {
        'NUM_MODULES': num_modules,
        'NUM_BRANCHES': num_branches,
        'BLOCK': block,
        'NUM_BLOCKS': [4] * num_branches,
        'NUM_CHANNELS': list(num_channels),
        'FUSE_METHOD': 'SUM',
    }


def _hrnet_config(width):
    """Return the model description whose branch widths are width * 1, 2, 4, 8."""
    widths = [width * (2 ** level) for level in range(4)]
    return {
        'MODEL': {
            'EXTRA': {
                'STAGE1': _hrnet_stage(1, 'BOTTLENECK', [64]),
                'STAGE2': _hrnet_stage(1, 'BASIC', widths[:2]),
                'STAGE3': _hrnet_stage(4, 'BASIC', widths[:3]),
                'STAGE4': _hrnet_stage(3, 'BASIC', widths[:4]),
            }
        }
    }


# Model descriptions keyed by the names accepted by get_cls_net.
config = {
    'hrnet_w18': _hrnet_config(18),
    'hrnet_w32': _hrnet_config(32),
}

def get_cls_net(c, **kwargs):
    """Build a ``HighResolutionNet`` from a named entry of ``config``.

    Args:
        c: key into the module-level ``config`` dict (e.g. ``'hrnet_w18'``).
        **kwargs: must contain ``return_stage`` (stored as
            ``cfg['MODEL']['RETURN_STAGE']``) and ``pretrained`` (passed to
            ``init_weights``); all keyword arguments are also forwarded to the
            model constructor.

    Returns:
        The initialised model.

    Raises:
        KeyError: if ``c`` is not in ``config`` or a required keyword is
            missing.
    """
    import copy  # local import so the fix does not depend on file-level imports

    # Work on a private copy: the model keeps a reference to ``cfg``, so
    # writing RETURN_STAGE into the shared ``config`` entry would let a later
    # call change the stage returned by an already-built model.
    cfg = copy.deepcopy(config[c])
    cfg['MODEL']['RETURN_STAGE'] = kwargs['return_stage']
    model = HighResolutionNet(cfg, **kwargs)
    model.init_weights(pretrained=kwargs['pretrained'])
    return model

if __name__ == '__main__':
    # Manual smoke test: build HRNet-W18 from a local checkpoint path, then
    # stop in the debugger so the model can be inspected interactively.
    net = get_cls_net('hrnet_w18', return_stage=2, pretrained='../weights/hrnetv2_w18_imagenet.pth')
    pdb.set_trace()


================================================
FILE: unitrack/model/model.py
================================================
import pdb
import os.path as osp

import torch
import torch.nn as nn

from unitrack.model import resnet
from unitrack.model import hrnet
from unitrack.model import random_feat_generator

class AppearanceModel(nn.Module):
    """Wraps the encoder selected by ``args`` and places it on ``args.device``."""

    def __init__(self, args):
        super().__init__()
        self.args = args

        encoder = make_encoder(args)
        self.model = encoder.to(self.args.device)

    def forward(self, x):
        """Return the encoder's output for ``x``."""
        return self.model(x)

def partial_load(pretrained_dict, model, skip_keys=[], log=False):
    """Load the matching subset of ``pretrained_dict`` into ``model``.

    An entry is loaded when its key exists in the model's state dict and
    contains none of the substrings in ``skip_keys``. All other model
    parameters keep their current values.

    Args:
        pretrained_dict: mapping from parameter name to tensor.
        model: module whose state dict is updated in place.
        skip_keys: substrings; any checkpoint key containing one is ignored.
            The default list is never mutated.
        log: when True, print the skipped, loaded and unloaded keys.
    """
    model_dict = model.state_dict()

    # 1. filter out unnecessary keys
    filtered_dict = {
        k: v for k, v in pretrained_dict.items()
        if k in model_dict and not any(sk in k for sk in skip_keys)
    }
    skipped_keys = [k for k in pretrained_dict if k not in filtered_dict]
    # Compare against what is actually loaded, so that model keys excluded
    # through ``skip_keys`` are reported as unloaded too.
    unload_keys = [k for k in model_dict if k not in filtered_dict]

    # 2. overwrite entries in the existing state dict
    model_dict.update(filtered_dict)

    # 3. load the new state dict
    model.load_state_dict(model_dict)

    if log:
        print('\nSkipped keys: ', skipped_keys)
        print('\nLoading keys: ', filtered_dict.keys())
        print('\nUnLoaded keys: ', unload_keys)

def load_vince_model(path):
    """Load a checkpoint and keep only its ``feature_extractor`` entries.

    The ``'feature_extractor.module.model.'`` prefix is removed from every
    retained key. Tensors stored on ``cuda:0`` are remapped to the CPU.
    """
    prefix = 'feature_extractor.module.model.'
    raw = torch.load(path, map_location={'cuda:0': 'cpu'})

    stripped = {}
    for key in raw:
        if 'feature_extractor' in key:
            stripped[key.replace(prefix, '')] = raw[key]
    return stripped


def load_uvc_model(ckpt_path):
    """Build a ResNet-18 and load the ``gray_encoder`` weights of a checkpoint.

    The network's ``avgpool`` and ``fc`` attributes are set to None before
    loading. Checkpoint keys containing ``'gray_encoder'`` are used, with the
    ``'module.gray_encoder.'`` prefix removed.
    """
    net = resnet.resnet18()
    net.avgpool = None
    net.fc = None

    ckpt = torch.load(ckpt_path, map_location='cpu')
    prefix = 'module.gray_encoder.'
    state_dict = {}
    for name, tensor in ckpt['state_dict'].items():
        if 'gray_encoder' in name:
            state_dict[name.replace(prefix, '')] = tensor
    partial_load(state_dict, net)

    return net


def load_tc_model(ckpt_path):
    """Build a ResNet-50 and copy a checkpoint's ``encoderVideo`` weights into it.

    Keys containing ``'encoderVideo'`` are used, with the
    ``'module.encoderVideo.'`` prefix removed. A 5-D checkpoint tensor whose
    shape differs from a 4-D target has dimension 2 squeezed before copying.
    """
    model_state = torch.load(ckpt_path, map_location='cpu')['state_dict']

    net = resnet.resnet50()
    net_state = net.state_dict()

    video_keys = [name for name in model_state.keys() if 'encoderVideo' in name]
    for src_key in video_keys:
        dst_key = src_key.replace('module.encoderVideo.', '')
        source = model_state[src_key]
        target = net_state[dst_key]
        needs_squeeze = (target.shape != source.shape
                         and target.dim() == 4 and source.dim() == 5)
        if needs_squeeze:
            source = source.squeeze(2)
        # In-place copy into the tensor held by the state dict.
        target[:] = source[:]

    net.load_state_dict(net_state)

    return net

class From3D(nn.Module):
    ''' Apply a 2D network to a 5-D input by folding dimension 2 into the batch. '''

    def __init__(self, resnet):
        super().__init__()
        self.model = resnet

    def forward(self, x):
        """Run the wrapped model on every slice along dimension 2.

        The input is unpacked as ``(N, C, T, h, w)``; the result is
        ``(N, C', T, h', w')`` where the primed sizes are the last three
        dimensions of the wrapped model's output.
        """
        batch, channels, steps, height, width = x.shape
        steps_first = x.permute(0, 2, 1, 3, 4).contiguous()
        flat = steps_first.view(-1, channels, height, width)
        feats = self.model(flat)

        feat_dims = feats.shape[-3:]
        unfolded = feats.view(batch, steps, *feat_dims)
        return unfolded.permute(0, 2, 1, 3, 4)


def make_encoder(args):
    """Build the encoder network selected by ``args.model_type``.

    Attributes read from ``args``: ``model_type``, ``resume`` (checkpoint
    path, used by the checkpoint-based types), ``return_stage`` (HRNet
    types), ``remove_layers`` (networks exposing ``modify``) and ``infer2D``.

    Checkpoints are loaded onto the CPU, matching ``load_uvc_model`` and
    ``load_tc_model``, so GPU-saved files also load on hosts without CUDA.

    Returns:
        The network; it is wrapped in ``From3D`` when its string form
        contains ``'Conv2d'`` and ``args.infer2D`` is false.

    Raises:
        ValueError: if ``args.model_type`` is not recognised.
    """
    SSL_MODELS = ['byol', 'deepcluster-v2', 'infomin', 'insdis', 'moco-v1', 'moco-v2',
            'pcl-v1', 'pcl-v2','pirl', 'sela-v2', 'swav', 'simclr-v1', 'simclr-v2',
            'pixpro', 'detco', 'barlowtwins']
    model_type = args.model_type
    if model_type == 'crw':
        net = resnet.resnet18()
        if osp.isfile(args.resume):
            ckpt = torch.load(args.resume, map_location='cpu')
            state = {}
            for k, v in ckpt['model'].items():
                # NOTE(review): the renamed key is stored in addition to the
                # entry written by the if/else below, not instead of it.
                if 'conv1.1.weight' in k or 'conv2.1.weight' in k:
                    state[k.replace('.1.weight', '.weight')] = v
                if 'encoder.model' in k:
                    state[k.replace('encoder.model.', '')] = v
                else:
                    state[k] = v
            partial_load(state, net, skip_keys=['head',])
            del ckpt
    elif model_type == 'random18':
        net = resnet.resnet18(pretrained=False)
    elif model_type == 'random50':
        net = resnet.resnet50(pretrained=False)
    elif model_type == 'imagenet18':
        net = resnet.resnet18(pretrained=True)
    elif model_type == 'imagenet50':
        net = resnet.resnet50(pretrained=True)
    elif model_type == 'imagenet101':
        net = resnet.resnet101(pretrained=True)
    elif model_type == 'imagenet_resnext50':
        net = resnet.resnext50_32x4d(pretrained=True)
    elif model_type == 'imagenet_resnext101':
        net = resnet.resnext101_32x8d(pretrained=True)
    elif model_type == 'mocov2':
        net = resnet.resnet50(pretrained=False)
        net_ckpt = torch.load(args.resume, map_location='cpu')
        net_state = {k.replace('module.encoder_q.', ''):v for k,v in net_ckpt['state_dict'].items() \
                if 'module.encoder_q' in k}
        partial_load(net_state, net)
    elif model_type == 'uvc':
        net = load_uvc_model(args.resume)
    elif model_type == 'timecycle':
        net = load_tc_model(args.resume)
    elif model_type in SSL_MODELS:
        net = resnet.resnet50(pretrained=False)
        net_ckpt = torch.load(args.resume, map_location='cpu')
        partial_load(net_ckpt, net)
    elif 'hrnet' in model_type:
        net = hrnet.get_cls_net(model_type, return_stage=args.return_stage, pretrained=args.resume)
    elif model_type == 'random':
        net = random_feat_generator.RandomFeatGenerator(args)
    else:
        raise ValueError('Invalid model_type.')
    if hasattr(net, 'modify'):
        net.modify(remove_layers=args.remove_layers)

    # Networks containing 2D convolutions are wrapped so they accept 5-D
    # input, unless 2D inference was requested.
    if 'Conv2d' in str(net) and not args.infer2D:
        net = From3D(net)
    return net


================================================
FILE: unitrack/model/random_feat_generator.py
================================================
###################################################################
# File Name: random_feat_generator.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon May 10 16:13:46 2021
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import torch
import torch.nn as nn

class RandomFeatGenerator(nn.Module):
    """Stand-in encoder that returns uniform random features.

    The output has ``args.dim`` channels and spatial size
    ``round(H / args.down_factor) x round(W / args.down_factor)``; the input
    values themselves are never read.

    ``dummy`` is an unused linear layer. NOTE(review): presumably it exists
    so that the module owns at least one parameter; confirm before removing.
    """

    def __init__(self, args):
        super(RandomFeatGenerator, self).__init__()
        self.df = args.down_factor
        self.dim = args.dim
        self.dummy = nn.Linear(2,3)

    def forward(self, x):
        """Return random features shaped after ``x``.

        Args:
            x: 4-D ``(N, C, H, W)`` or 5-D ``(N, C, T, H, W)`` tensor.

        Returns:
            ``(N, dim, h, w)`` or ``(N, dim, T, h, w)`` tensor on ``x``'s
            device.

        Raises:
            ValueError: if ``x`` is neither 4-D nor 5-D.
        """
        ndim = len(x.shape)
        if ndim == 4:
            N, _, H, W = x.shape
        elif ndim == 5:
            N, _, T, H, W = x.shape
        else:
            raise ValueError(
                'expected a 4-D or 5-D input, got {} dimensions'.format(ndim))
        c, h, w = self.dim, round(H/self.df), round(W/self.df)

        out_shape = (N, c, h, w) if ndim == 4 else (N, c, T, h, w)
        # Follow the input's device instead of hard-coding CUDA, so the
        # generator also works on CPU-only hosts.
        return torch.rand(*out_shape).to(x.device)

    def __str__(self):
        # Empty on purpose: make_encoder wraps a network in From3D only when
        # 'Conv2d' appears in str(net).
        return ''


================================================
FILE: unitrack/model/resnet.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
try:
    from torch.hub import load_state_dict_from_url
except ImportError:
    from torch.utils.model_zoo import load_url as load_state_dict_from_url

import torchvision.models.resnet as torch_resnet
from torchvision.models.resnet import BasicBlock, Bottleneck

# Download URL of the pretrained weights for each architecture name; looked
# up by _resnet when pretrained weights are requested.
model_urls = {'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
    'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
    'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
    'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
    'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
}

class ResNet(torch_resnet.ResNet):
    """torchvision ResNet variant whose layers can be removed or re-strided.

    ``forward`` returns the feature map of the last remaining layer; any of
    ``maxpool``, ``layer2``, ``layer3`` and ``layer4`` may be None.
    """

    def __init__(self, *args, **kwargs):
        super(ResNet, self).__init__(*args, **kwargs)

    def modify(self, remove_layers=[], padding=''):
        """Adjust strides and padding in place and drop unwanted layers.

        Args:
            remove_layers: attribute names to set to None; ``'fc'`` and
                ``'avgpool'`` are always removed as well. The argument is not
                modified.
            padding: ``''`` keeps the padding as is, ``'no'`` sets the padding
                of every padded convolution to zero, and any other value is
                assigned as the ``padding_mode`` of every padded convolution.
        """
        # Set stride of layer3 and layer 4 to 1 (from 2)
        filter_layers = lambda x: [l for l in x if getattr(self, l) is not None]
        for layer in filter_layers(['layer3', 'layer4']):
            for m in getattr(self, layer).modules():
                if isinstance(m, torch.nn.Conv2d):
                    m.stride = tuple(1 for _ in m.stride)
        # Set padding (zeros or reflect, doesn't change much;
        # zeros requires lower temperature)
        if padding != '' and padding != 'no':
            for m in self.modules():
                if isinstance(m, torch.nn.Conv2d) and sum(m.padding) > 0:
                    m.padding_mode = padding
        elif padding == 'no':
            for m in self.modules():
                if isinstance(m, torch.nn.Conv2d) and sum(m.padding) > 0:
                    m.padding = (0,0)

        # Remove extraneous layers. Build a new list rather than using ``+=``,
        # which would grow the shared default (and the caller's list) on
        # every call.
        layers_to_remove = list(remove_layers) + ['fc', 'avgpool']
        for layer in filter_layers(layers_to_remove):
            setattr(self, layer, None)

    def forward(self, x):
        """Return the feature map produced by the remaining layers."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = x if self.maxpool is None else self.maxpool(x)

        x = self.layer1(x)
        # A removed layer2 is replaced by 2x2 average pooling.
        x = F.avg_pool2d(x,(2,2)) if self.layer2 is None else self.layer2(x)
        x = x if self.layer3 is None else self.layer3(x)
        x = x if self.layer4 is None else self.layer4(x)

        return x


def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    """Construct a ``ResNet`` and optionally load its pretrained weights.

    When ``pretrained`` is true the weights are fetched from
    ``model_urls[arch]``; ``progress`` is forwarded to the download helper.
    """
    model = ResNet(block, layers, **kwargs)
    if not pretrained:
        return model

    weights = load_state_dict_from_url(model_urls[arch], progress=progress)
    model.load_state_dict(weights)
    return model

def resnet18(pretrained=False, progress=True, **kwargs):
    """Build a ResNet-18: BasicBlock with layer sizes [2, 2, 2, 2]."""
    layer_sizes = [2, 2, 2, 2]
    return _resnet('resnet18', BasicBlock, layer_sizes, pretrained, progress,
                   **kwargs)

def resnet50(pretrained=False, progress=True, **kwargs) -> ResNet:
    """Build a ResNet-50: Bottleneck with layer sizes [3, 4, 6, 3]."""
    layer_sizes = [3, 4, 6, 3]
    return _resnet('resnet50', Bottleneck, layer_sizes, pretrained, progress,
                   **kwargs)

def resnet101(pretrained=False, progress=True, **kwargs):
    """Build a ResNet-101: Bottleneck with layer sizes [3, 4, 23, 3]."""
    layer_sizes = [3, 4, 23, 3]
    return _resnet('resnet101', Bottleneck, layer_sizes, pretrained, progress,
                   **kwargs)

def resnet152(pretrained=False, progress=True, **kwargs):
    r"""Build the ResNet-152 model described in
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): if True, load the pretrained weights listed in ``model_urls``
        progress (bool): forwarded to the weight download helper
    """
    layer_sizes = [3, 8, 36, 3]
    return _resnet('resnet152', Bottleneck, layer_sizes, pretrained, progress,
                   **kwargs)


def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
    r"""Build the ResNeXt-50 32x4d architecture described in
    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_

    ``groups`` and ``width_per_group`` are always forced to 32 and 4,
    overriding any values passed by the caller.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs.update(groups=32, width_per_group=4)
    stage_depths = [3, 4, 6, 3]
    return _resnet('resnext50_32x4d', Bottleneck, stage_depths,
                   pretrained, progress, **kwargs)


def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
    r"""Build the ResNeXt-101 32x8d architecture described in
    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_

    ``groups`` and ``width_per_group`` are always forced to 32 and 8,
    overriding any values passed by the caller.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs.update(groups=32, width_per_group=8)
    stage_depths = [3, 4, 23, 3]
    return _resnet('resnext101_32x8d', Bottleneck, stage_depths,
                   pretrained, progress, **kwargs)


def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
    r"""Build the Wide ResNet-50-2 architecture described in
    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_

    Identical to ResNet apart from the bottleneck width, which is doubled in
    every block. The outer 1x1 convolutions keep their channel counts: the
    last block of ResNet-50 is 2048-512-2048, while Wide ResNet-50-2 uses
    2048-1024-2048.

    ``width_per_group`` is always forced to 128, overriding the caller.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs.update(width_per_group=64 * 2)
    stage_depths = [3, 4, 6, 3]
    return _resnet('wide_resnet50_2', Bottleneck, stage_depths,
                   pretrained, progress, **kwargs)


def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
    r"""Build the Wide ResNet-101-2 architecture described in
    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_

    Identical to ResNet apart from the bottleneck width, which is doubled in
    every block. The outer 1x1 convolutions keep their channel counts: the
    last block of ResNet-50 is 2048-512-2048, while Wide ResNet-50-2 uses
    2048-1024-2048.

    ``width_per_group`` is always forced to 128, overriding the caller.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs.update(width_per_group=64 * 2)
    stage_depths = [3, 4, 23, 3]
    return _resnet('wide_resnet101_2', Bottleneck, stage_depths,
                   pretrained, progress, **kwargs)


================================================
FILE: unitrack/multitracker.py
================================================
import os
import pdb
import cv2
import time
import itertools
import os.path as osp
from collections import deque

import numpy as np
import torch
import torch.nn.functional as F
from torchvision import ops

from unitrack.model import AppearanceModel, partial_load
from unitrack.utils.log import logger
from unitrack.core.association import matching
from unitrack.core.propagation import propagate
from unitrack.core.motion.kalman_filter import KalmanFilter

from unitrack.utils.box import *
from unitrack.utils.mask import *
from .basetrack import *


class AssociationTracker(object):
    """Base class for association-based multi-object tracking.

    Holds three track lists (tracked, lost, removed) and advances them by one
    frame per call to :meth:`update`. :meth:`extract_emb` and
    :meth:`prepare_obs` raise ``NotImplementedError`` here and are expected to
    be provided by subclasses.
    """
    def __init__(self, opt):
        """Initialise track lists, the Kalman filter and the appearance model.

        Args:
            opt: options object. This class reads ``conf_thres``,
                ``track_buffer``, ``device``, ``asso_with_motion``,
                ``use_kalman``, ``motion_lambda``, ``motion_gated``,
                ``confirm_iou_thres`` and ``dup_iou_thres`` from it, and it is
                also handed to ``AppearanceModel``.
        """
        self.opt = opt
        self.tracked_stracks = []  # type: list[STrack]
        self.lost_stracks = []     # type: list[STrack]
        self.removed_stracks = []  # type: list[STrack]

        self.frame_id = 0
        # Minimum detection score required to start a new track (see Step 4).
        self.det_thresh = opt.conf_thres
        self.buffer_size = opt.track_buffer
        # Number of frames a lost track is kept before being marked removed.
        self.max_time_lost = self.buffer_size

        self.kalman_filter = KalmanFilter()

        self.app_model = AppearanceModel(opt).to(opt.device)
        self.app_model.eval()
        
        # Without motion association the motion options are overwritten on the
        # shared ``opt`` object itself.
        # NOTE(review): presumably lambda_=1 makes fuse_motion rely on the
        # appearance distance only — verify in matching.fuse_motion.
        if not self.opt.asso_with_motion:
            self.opt.motion_lambda = 1
            self.opt.motion_gated = False
        
    def extract_emb(self, img, obs):
        """Not implemented in the base class; always raises."""
        raise NotImplementedError

    def prepare_obs(self, img, img0, obs, embs=None):
        """Not implemented in the base class; always raises.

        :meth:`update` uses the return value as an indexable list of
        detections that is passed to the ``matching`` helpers.
        """
        raise NotImplementedError

    def update(self, img, img0, obs, embs=None):
        """Advance the tracker by one frame.

        Args:
            img, img0: forwarded unchanged to :meth:`prepare_obs`.
            obs: observations for this frame; only ``obs.shape[1]`` is read
                here, everything else is handled by :meth:`prepare_obs`.
            embs: accepted for interface compatibility.
                NOTE(review): this value is not forwarded — ``prepare_obs`` is
                always called with ``embs=None``; confirm that is intended.

        Returns:
            list: the entries of ``self.tracked_stracks`` whose
            ``is_activated`` flag is set after this frame's update.
        """
        torch.cuda.empty_cache()
        self.frame_id += 1
        # Per-frame scratch lists, merged into the persistent lists in Step 5.
        activated_stracks = []
        refind_stracks = []
        lost_stracks = []
        removed_stracks = []
 
        # NOTE: t1 is not used anywhere else in this method.
        t1 = time.time()
        detections = self.prepare_obs(img, img0, obs, embs=None)

        ''' Add newly detected tracklets to tracked_stracks'''
        # Split the currently tracked list by the is_activated flag.
        unconfirmed = []
        tracked_stracks = []  # type: list[STrack]
        for track in self.tracked_stracks:
            if not track.is_activated:
                unconfirmed.append(track)
            else:
                tracked_stracks.append(track)

        ''' Step 2: First association, with embedding'''
        # Candidates are the activated tracked tracks plus all lost tracks.
        tracks = joint_stracks(tracked_stracks, self.lost_stracks)
        dists, recons_ftrk = matching.reconsdot_distance(tracks, detections)
        if self.opt.use_kalman: 
            # Predict the current location with KF
            STrack.multi_predict(tracks)
            dists = matching.fuse_motion(self.kalman_filter, dists, tracks, detections, 
                    lambda_=self.opt.motion_lambda, gate=self.opt.motion_gated)
        # NOTE(review): presumably a 6-column obs carries a category column —
        # confirm against the prepare_obs implementations.
        if obs.shape[1] == 6:
            dists = matching.category_gate(dists, tracks, detections)
        matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7)

        for itracked, idet in matches:
            track = tracks[itracked]
            det = detections[idet]
            if track.state == TrackState.Tracked:
                track.update(detections[idet], self.frame_id)
                activated_stracks.append(track)
            else:
                # Matched a track that was not in the Tracked state: revive it
                # under its existing id.
                track.re_activate(det, self.frame_id, new_id=False)
                refind_stracks.append(track)
        
        if self.opt.use_kalman:
            '''(optional) Step 3: Second association, with IOU'''
            # From here on ``tracks``/``detections`` are rebound to the
            # leftovers, so u_track/u_detection index the filtered lists.
            tracks = [tracks[i] for i in u_track if tracks[i].state==TrackState.Tracked]
            detections = [detections[i] for i in u_detection]
            dists = matching.iou_distance(tracks, detections)
            matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5)
            
            for itracked, idet in matches:
                track = tracks[itracked]
                det = detections[idet]
                if track.state == TrackState.Tracked:
                    track.update(det, self.frame_id)
                    activated_stracks.append(track)
                else:
                    track.re_activate(det, self.frame_id, new_id=False)
                    refind_stracks.append(track)

            '''Deal with unconfirmed tracks, usually tracks with only one beginning frame'''
            # NOTE(review): this handling sits inside the use_kalman branch, so
            # unconfirmed tracks are neither matched nor removed here when
            # use_kalman is false — confirm intended.
            detections = [detections[i] for i in u_detection]
            dists = matching.iou_distance(unconfirmed, detections)
            matches, u_unconfirmed, u_detection = matching.linear_assignment(
                    dists, thresh=self.opt.confirm_iou_thres)
            for itracked, idet in matches:
                unconfirmed[itracked].update(detections[idet], self.frame_id)
                activated_stracks.append(unconfirmed[itracked])
            for it in u_unconfirmed:
                track = unconfirmed[it]
                track.mark_removed()
                removed_stracks.append(track)

        # Any still-unmatched track that is not already Lost becomes Lost.
        for it in u_track:
            track = tracks[it]
            if not track.state == TrackState.Lost:
                track.mark_lost()
                lost_stracks.append(track)

        """ Step 4: Init new stracks"""
        # Remaining detections start new tracks if their score is high enough.
        for inew in u_detection:
            track = detections[inew]
            if track.score < self.det_thresh:
                continue
            track.activate(self.kalman_filter, self.frame_id)
            activated_stracks.append(track)

        """ Step 5: Update state"""
        # Expire lost tracks that have been missing for more than max_time_lost frames.
        for track in self.lost_stracks:
            if self.frame_id - track.end_frame > self.max_time_lost:
                track.mark_removed()
                removed_stracks.append(track)

        self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked]
        self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_stracks)
        self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks)
        self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks)
        self.lost_stracks.extend(lost_stracks)
        # NOTE(review): this frame's removed_stracks are appended to
        # self.removed_stracks only after the subtraction on the next line, so
        # they are filtered out of lost_stracks on a later call — confirm intended.
        self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks)
        self.removed_stracks.extend(removed_stracks)
        self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(
                self.tracked_stracks, self.lost_stracks, ioudist=self.opt.dup_iou_thres)

        # get scores of lost tracks
        output_stracks = [track for track in self.tracked_stracks if track.is_activated]

        return output_stracks

    def reset_all(self, ):
        """Clear all track lists and reset the frame counter to 0."""
        self.tracked_stracks = []  # type: list[STrack]
        self.lost_stracks = []  # type: list[STrack]
        self.removed_stracks = []  # type: list[STrack]
        self.frame_id = 0

================================================
FILE: unitrack/utils/__init__.py
================================================
from collections import defaultdict, deque
import datetime
import time
import torch

import errno
import os
import pdb
import sys

from . import visualize
from . import box
from . import meter
from . import log

import numpy as np
from torch import nn
from torch.nn import functional as F


def to_numpy(tensor):
    """Return ``tensor`` as a numpy object.

    Torch tensors are detached, moved to the CPU and converted. Objects whose
    type is defined in the ``numpy`` module are returned unchanged.

    Raises:
        ValueError: if ``tensor`` is neither a torch tensor nor a numpy object.
    """
    if torch.is_tensor(tensor):
        # detach() first: Tensor.numpy() refuses tensors that require grad.
        return tensor.detach().cpu().numpy()
    if type(tensor).__module__ != 'numpy':
        raise ValueError("Cannot convert {} to numpy array"
                         .format(type(tensor)))
    return tensor

def to_torch(ndarray):
    """Return ``ndarray`` as a torch tensor.

    Objects whose type lives in the ``numpy`` module go through
    ``torch.from_numpy``; torch tensors are returned unchanged.

    Raises:
        ValueError: if the input is neither a numpy object nor a torch tensor.
    """
    is_numpy = type(ndarray).__module__ == 'numpy'
    if is_numpy:
        return torch.from_numpy(ndarray)
    if torch.is_tensor(ndarray):
        return ndarray
    raise ValueError("Cannot convert {} to torch tensor"
                     .format(type(ndarray)))

def im_to_numpy(img):
    """Convert a channel-first image (C*H*W) to a numpy array in H*W*C order."""
    chw = to_numpy(img)
    return np.transpose(chw, (1, 2, 0))  # H*W*C

def im_to_torch(img):
    """Convert a channel-last image (H*W*C) to a float torch tensor in C*H*W order."""
    chw = np.transpose(img, (2, 0, 1))  # C*H*W
    return to_torch(chw).float()


================================================
FILE: unitrack/utils/box.py
================================================
###################################################################
# File Name: box.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Wed Dec 23 16:27:15 2020
###################################################################

import torch
import torchvision
import numpy as np


def xyxy2xywh(x):
    """Convert boxes from [x1, y1, x2, y2] to [center x, center y, w, h].

    Args:
        x: 2-D torch tensor or numpy array; the first four columns are the
            box corners. The input is not modified.

    Returns:
        A copy of ``x`` (same type and dtype) with the first four columns
        replaced by the converted values.
    """
    # Dispatch on the container type, not the dtype: tensors of any dtype have
    # clone() but no copy(), numpy arrays have copy() but no clone().
    y = x.clone() if isinstance(x, torch.Tensor) else x.copy()
    y[:, 0] = (x[:, 0] + x[:, 2]) / 2
    y[:, 1] = (x[:, 1] + x[:, 3]) / 2
    y[:, 2] = x[:, 2] - x[:, 0]
    y[:, 3] = x[:, 3] - x[:, 1]
    return y


def xywh2xyxy(x):
    """Convert boxes from [center x, center y, w, h] to [x1, y1, x2, y2].

    Args:
        x: 2-D torch tensor or numpy array; the first four columns are the
            box parameters. The input is not modified.

    Returns:
        A copy of ``x`` (same type and dtype) with the first four columns
        replaced by the corner coordinates.
    """
    # Dispatch on the container type, not the dtype: tensors of any dtype have
    # clone() but no copy(), numpy arrays have copy() but no clone().
    y = x.clone() if isinstance(x, torch.Tensor) else x.copy()
    y[:, 0] = (x[:, 0] - x[:, 2] / 2)
    y[:, 1] = (x[:, 1] - x[:, 3] / 2)
    y[:, 2] = (x[:, 0] + x[:, 2] / 2)
    y[:, 3] = (x[:, 1] + x[:, 3] / 2)
    return y


def tlwh2xyxy(x):
    """Convert boxes from [left, top, w, h] to [x1, y1, x2, y2].

    Args:
        x: 2-D torch tensor or numpy array; the first four columns are the
            box parameters. The input is not modified.

    Returns:
        A copy of ``x`` (same type and dtype) where columns 2 and 3 hold the
        bottom-right corner instead of width and height.
    """
    # Dispatch on the container type, not the dtype: tensors of any dtype have
    # clone() but no copy(), numpy arrays have copy() but no clone().
    y = x.clone() if isinstance(x, torch.Tensor) else x.copy()
    y[:, 2] = (x[:, 0] + x[:, 2])
    y[:, 3] = (x[:, 1] + x[:, 3])
    return y


def tlwh_to_xywh(tlwh):
    """Convert one box from (left, top, w, h) to (center x, center y, w, h).

    Args:
        tlwh: array-like of four numbers. The input is not modified.

    Returns:
        np.ndarray: a new array. Floating-point inputs keep their dtype;
        any other dtype is promoted to float so that the half-size offset
        can be added.
    """
    ret = np.array(tlwh)  # np.array always copies
    if not np.issubdtype(ret.dtype, np.inexact):
        # In-place "+=" of a float result into an integer array raises a
        # casting error, so promote first.
        ret = ret.astype(float)
    ret[:2] += ret[2:] / 2
    return ret


def tlwh_to_xyah(tlwh):
    """Convert bounding box to format `(center x, center y, aspect ratio,
    height)`, where the aspect ratio is `width / height`.

    Args:
        tlwh: array-like of four numbers (left, top, w, h). The input is not
            modified.

    Returns:
        np.ndarray: a new array. Floating-point inputs keep their dtype;
        any other dtype is promoted to float.
    """
    ret = np.array(tlwh)  # np.array always copies
    if not np.issubdtype(ret.dtype, np.inexact):
        # In-place float arithmetic on an integer array raises a casting
        # error, so promote first.
        ret = ret.astype(float)
    ret[:2] += ret[2:] / 2
    # The small epsilon avoids a division by zero for zero-height boxes.
    ret[2] /= (ret[3] + 1e-6)
    return ret


def tlbr_to_tlwh(tlbr):
    """Convert one box from (left, top, right, bottom) to (left, top, w, h).

    Returns a new array; the input is left untouched.
    """
    box = np.array(tlbr)  # fresh copy of the input
    box[2:] = box[2:] - box[:2]
    return box


def tlwh_to_tlbr(tlwh):
    """Convert one box from (left, top, w, h) to (left, top, right, bottom).

    Returns a new array; the input is left untouched.
    """
    box = np.array(tlwh)  # fresh copy of the input
    box[2:] = box[2:] + box[:2]
    return box


def scale_box(scale, coords):
    """Return a clone of ``coords`` with columns 0/2 multiplied by
    ``scale[0]`` and columns 1/3 multiplied by ``scale[1]``.

    ``coords`` must provide ``clone()`` (e.g. a torch tensor); it is not
    modified.
    """
    scaled = coords.clone()
    for cols, factor in (([0, 2], scale[0]), ([1, 3], scale[1])):
        scaled[:, cols] = coords[:, cols] * factor
    return scaled


def scale_box_letterbox_size(img_size, coords, img0_shape):
    """Map boxes from original-image coordinates into a letterboxed canvas.

    Args:
        img_size: (width, height) of the letterboxed canvas.
        coords: 2-D array whose first four columns are x1, y1, x2, y2.
            Modified IN PLACE.
        img0_shape: shape of the original image, indexed as (height, width).

    Returns:
        The same ``coords`` object, scaled by the letterbox gain and shifted
        by the padding.
    """
    canvas_w, canvas_h = img_size[0], img_size[1]
    src_h, src_w = img0_shape[0], img0_shape[1]

    # Use the smaller of the two ratios so the image fits in both directions.
    gain = min(float(canvas_w) / src_w, float(canvas_h) / src_h)
    pad_x = (canvas_w - src_w * gain) / 2  # width padding
    pad_y = (canvas_h - src_h * gain) / 2  # height padding

    coords[:, 0:4] *= gain
    coords[:, [0, 2]] += pad_x
    coords[:, [1, 3]] += pad_y
    return coords


def scale_box_input_size(img_size, coords, img0_shape):
    """Map boxes from a letterboxed canvas back to original-image coordinates.

    Args:
        img_size: (width, height) of the letterboxed canvas.
        coords: 2-D array whose first four columns are x1, y1, x2, y2.
            Modified IN PLACE.
        img0_shape: shape of the original image, indexed as (height, width).

    Returns:
        The same ``coords`` object with the padding removed and the letterbox
        gain divided out.
    """
    canvas_w, canvas_h = img_size[0], img_size[1]
    src_h, src_w = img0_shape[0], img0_shape[1]

    # Same gain and padding as used when the letterbox was applied.
    gain = min(float(canvas_w) / src_w, float(canvas_h) / src_h)
    pad_x = (canvas_w - src_w * gain) / 2  # width padding
    pad_y = (canvas_h - src_h * gain) / 2  # height padding

    coords[:, [0, 2]] -= pad_x
    coords[:, [1, 3]] -= pad_y
    coords[:, 0:4] /= gain
    return coords


def clip_boxes(boxes, im_shape):
    """
    Clip boxes to image boundaries.

    ``boxes`` is a 2-D array whose columns repeat in groups of four
    (x1, y1, x2, y2); ``im_shape`` is indexed as (height, width). x values are
    limited to [0, width - 1] and y values to [0, height - 1]. A clipped copy
    is returned; an empty input is returned as-is after ``np.asarray``.
    """
    boxes = np.asarray(boxes)
    if boxes.shape[0] == 0:
        return boxes

    clipped = np.copy(boxes)
    max_x, max_y = im_shape[1] - 1, im_shape[0] - 1
    # Column offsets 0..3 correspond to x1, y1, x2, y2 within each group of 4.
    for offset, upper in enumerate((max_x, max_y, max_x, max_y)):
        cols = clipped[:, offset::4]
        clipped[:, offset::4] = np.maximum(np.minimum(cols, upper), 0)
    return clipped


def clip_box(bbox, im_shape):
    """Clip a single (x1, y1, x2, y2) box to the image.

    ``im_shape[:2]`` is read as (height, width). x values are limited to
    [0, width - 1] and y values to [0, height - 1]. Returns a clipped copy.
    """
    h, w = im_shape[:2]
    clipped = np.copy(bbox)
    for idx, upper in enumerate((w - 1, h - 1, w - 1, h - 1)):
        clipped[idx] = max(min(clipped[idx], upper), 0)

    return clipped


def int_box(box):
    """Round box coordinates to the nearest integer and return an int array.

    Args:
        box: array-like of numbers.

    Returns:
        np.ndarray: integer array with the values rounded by ``np.round``.
    """
    # The np.float / np.int aliases were removed in NumPy 1.24; the builtin
    # float / int select the same dtypes.
    box = np.asarray(box, dtype=float)
    box = np.round(box)
    return np.asarray(box, dtype=int)


def remove_duplicated_box(boxes, iou_th=0.5):
    """Return the indices of boxes kept after removing duplicates.

    Boxes equal to (-1, -1, 10, 10) are dropped first. Then, walking the boxes
    in order, every box still kept suppresses all boxes whose IoU with it
    exceeds ``iou_th``.

    Args:
        boxes: numpy array or torch tensor of boxes accepted by
            ``torchvision.ops.box_iou``.
        iou_th: IoU above which two boxes count as duplicates.

    Returns:
        np.ndarray: indices of the boxes that are kept.
    """
    if isinstance(boxes, np.ndarray):
        boxes = torch.from_numpy(boxes)

    iou = torchvision.ops.box_iou(boxes, boxes).float()
    # Subtract the identity so a box's IoU with itself is not counted.
    iou -= torch.eye(iou.shape[0])

    keep = np.ones(len(boxes)) == 1
    placeholder = (-1, -1, 10, 10)
    for idx, box in enumerate(boxes):
        if all(box[k] == v for k, v in enumerate(placeholder)):
            keep[idx] = False

    for idx, row in enumerate(iou):
        if not keep[idx]:
            continue
        keep[torch.where(row > iou_th)] = False
    return np.where(keep)[0]


def skltn2box(skltn):
    """Compute the bounding box (xmin, ymin, xmax, ymax) of a skeleton.

    Each element of ``skltn`` is read as ``s['id'][0]``, ``s['x'][0]`` and
    ``s['y'][0]``; coordinates are truncated with ``int`` and later entries
    with the same id replace earlier ones.

    An empty skeleton yields ``[-1, -1, rand, rand]`` with random extents.
    A box with zero width or height is extended by 10 on that axis.
    """
    joints = {s['id'][0]: (int(s['x'][0]), int(s['y'][0])) for s in skltn}
    if not joints:
        return np.array(
                [-1, -1, np.random.randint(1, 40), np.random.randint(1, 70)])

    xs = [pt[0] for pt in joints.values()]
    ys = [pt[1] for pt in joints.values()]
    xmin, xmax = np.min(xs), np.max(xs)
    ymin, ymax = np.min(ys), np.max(ys)
    if xmin == xmax:
        xmax += 10
    if ymin == ymax:
        ymax += 10
    return np.array([xmin, ymin, xmax, ymax])


================================================
FILE: unitrack/utils/io.py
================================================
import os
import os.path as osp
from typing import Dict
import numpy as np

from utils.log import logger

def mkdir_if_missing(d):
    """Create directory ``d`` (including parents) unless the path already exists."""
    if not osp.exists(d):
        # exist_ok guards against another process creating the directory
        # between the check above and this call.
        os.makedirs(d, exist_ok=True)

def write_mots_results(filename, results, data_type='mot'):
    """Write segmentation tracking results, one object per line.

    Each line is ``"{frame} {id} {cid} {imh} {imw} {rle}"`` where the written
    id is ``track_id + 2000`` and the class id is always 2.

    Args:
        filename: output path; nothing is written if it is empty/falsy.
        results: iterable of ``(frame_id, tlwhs, rles, track_ids)``; each rle
            is a dict with ``'counts'`` and ``'size'`` (height, width).
            Entries with a negative track id are skipped.
        data_type: only ``'mot'`` is supported.

    Raises:
        ValueError: if ``data_type`` is not ``'mot'``.
    """
    if not filename:
        return
    path = os.path.dirname(filename)
    # dirname is '' for a bare file name; makedirs('') would raise.
    if path:
        os.makedirs(path, exist_ok=True)

    # Exact comparison: ``in ('mot')`` was a substring test on the string 'mot'.
    if data_type == 'mot':
        save_format = '{frame} {id} {cid} {imh} {imw} {rle}\n'
    else:
        raise ValueError(data_type)

    with open(filename, 'w') as f:
        for frame_id, _tlwhs, rles, track_ids in results:
            for rle, track_id in zip(rles, track_ids):
                if track_id < 0:
                    continue
                rle_str = rle['counts']
                imh, imw = rle['size']
                line = save_format.format(frame=frame_id, id=track_id+2000, cid=2, imh=imh, imw=imw, rle=rle_str)
                f.write(line)
    logger.info('Save results to {}'.format(filename))

def write_mot_results(filename, results, data_type='mot'):
    """Write box tracking results in MOT or KITTI text format.

    Args:
        filename: output path; nothing is written if it is empty/falsy.
        results: iterable of ``(frame_id, tlwhs, track_ids)``; each tlwh is
            ``(x1, y1, w, h)``. Entries with a negative track id are skipped.
        data_type: ``'mot'``, ``'mcmot'`` or ``'lab'`` for comma-separated MOT
            lines, ``'kitti'`` for KITTI lines. For KITTI the frame id is
            written minus one.

    Raises:
        ValueError: for any other ``data_type``.
    """
    if not filename:
        return
    path = os.path.dirname(filename)
    # dirname is '' for a bare file name; makedirs('') would raise.
    if path:
        os.makedirs(path, exist_ok=True)

    if data_type in ('mot', 'mcmot', 'lab'):
        save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n'
    elif data_type == 'kitti':
        save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n'
    else:
        raise ValueError(data_type)

    with open(filename, 'w') as f:
        for frame_id, tlwhs, track_ids in results:
            if data_type == 'kitti':
                frame_id -= 1
            for tlwh, track_id in zip(tlwhs, track_ids):
                if track_id < 0:
                    continue
                x1, y1, w, h = tlwh
                x2, y2 = x1 + w, y1 + h
                # ``results`` carries no per-box score, so the KITTI ``{score}``
                # field is filled with a constant; without it the KITTI format
                # string raised KeyError.
                line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0)
                f.write(line)
    logger.info('Save results to {}'.format(filename))


def read_mot_results(filename, data_type='mot', is_gt=False, is_ignore=False):
    """Read a tracking result file for the given ``data_type``.

    Only ``'mot'`` and ``'lab'`` are supported; both are parsed by
    ``_read_mot_results``.

    Raises:
        ValueError: for any other ``data_type``.
    """
    if data_type not in ('mot', 'lab'):
        raise ValueError('Unknown data type: {}'.format(data_type))

    return _read_mot_results(filename, is_gt, is_ignore)


"""
labels={'ped', ...			% 1
'person_on_vhcl', ...	% 2
'car', ...				% 3
'bicycle', ...			% 4
'mbike', ...			% 5
'non_mot_vhcl', ...		% 6
'static_person', ...	% 7
'distractor', ...		% 8
'occluder', ...			% 9
'occluder_on_grnd', ...		%10
'occluder_full', ...		% 11
'reflection', ...		% 12
'crowd' ...			% 13
};
"""


def _read_mot_results(filename, is_gt, is_ignore):
    valid_labels = {1}
    ignore_labels = {2, 7, 8, 12}
    results_dict = dict()
    if os.path.isfile(filename):
        with open(filename, 'r') as f:
            for line in f.readlines():
                linelist = line.split(',')
                if len(linelist) < 7:
                    continue
                fid = int(linelist[0])
                if fid < 1:
                    continue
                results_dict.setdefault(fid, list())

                if is_gt:
                    if 'MOT16-' in filename or 'MOT17-' in filename:
                        label = int(float(linelist[7]))
                        mark = int(float(linelist[6]))
                        if mark == 0 or label not in valid_labels:
                            continue
                    score = 1
                elif is_ignore:
                    if 'MOT16-' in filename or 'MOT17-' in filename:
                        label = int(float(linelist[7]))
                        vis_ratio = float(linelist[8])
                        if label not in ignore_labels and vis_ratio >= 0:
                            continue
                    else:
                        continue
                    score = 1
                else:
                    score = float(linelist[6])

                tlwh = tuple(map(float, linelist[2:6]))
                target_id = int(linelist[1])

                results_dict[fid].append((tlwh, target_id, score))

    return results_dict


def unzip_objs(objs):
    """Split a list of ``(tlwh, id, score)`` triples into three sequences.

    Returns:
        tuple: ``(tlwhs, ids, scores)`` where ``tlwhs`` is a float array of
        shape (N, 4). ``ids`` and ``scores`` are tuples, or empty lists when
        ``objs`` is empty.
    """
    tlwhs, ids, scores = zip(*objs) if len(objs) > 0 else ([], [], [])
    boxes = np.asarray(tlwhs, dtype=float).reshape(-1, 4)

    return boxes, ids, scores


================================================
FILE: unitrack/utils/log.py
================================================
import logging


def get_logger(name='root'):
    """Return the logger ``name`` set to DEBUG with a timestamped stream handler.

    The handler is attached at most once per logger, so calling this function
    repeatedly with the same name does not duplicate log output.

    Args:
        name (str): logger name passed to ``logging.getLogger``.

    Returns:
        logging.Logger: the configured logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # Handlers added here are tagged; skip if one is already present. Handlers
    # installed by other code are ignored so the first call behaves as before.
    already_attached = any(
        getattr(h, '_from_get_logger', False) for h in logger.handlers)
    if not already_attached:
        formatter = logging.Formatter(
            # fmt='%(asctime)s [%(levelname)s]: %(filename)s(%(funcName)s:%(lineno)s) >> %(message)s')
            fmt='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

        handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        handler._from_get_logger = True
        logger.addHandler(handler)
    return logger


# Module-wide logger shared by importers of this module.
logger = get_logger('root')


================================================
FILE: unitrack/utils/mask.py
================================================
###################################################################
# File Name: mask.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Tue Feb  9 10:05:47 2021
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import cv2
import torch
import numpy as np
import pycocotools.mask as mask_utils


def coords2bbox(coords, extend=2):
    """
    Build a box around the mean of a set of 2-D points.

    The half-size along each axis is ``extend`` times the mean absolute
    deviation from the center, with the deviation floored at 1.

    INPUTS:
     - coords: (N, 2) tensor of pixel coordinates
     - extend: multiplier applied to the mean deviation

    RETURNS:
     - tuple of Python floats ``(top, left, bottom, right)`` where left/right
       come from column 0 and top/bottom from column 1
    """
    center = torch.mean(coords, dim=0).view(1, 2)

    def _mean_dev(col):
        # sqrt(pow(., 2)) is the absolute deviation; broadcasting against the
        # (1,) center slice replaces an explicit repeat.
        dev = torch.sqrt(torch.pow(coords[:, col] - center[:, col], 2))
        return max(torch.mean(dev, dim=0).detach(), 1)

    half_x = _mean_dev(0)
    half_y = _mean_dev(1)

    left = center[:, 0] - half_x * extend
    right = center[:, 0] + half_x * extend
    top = center[:, 1] - half_y * extend
    bottom = center[:, 1] + half_y * extend

    return (top.item(), left.item(), bottom.item(), right.item())


def coords2bbox_all(coords):
    """Return the tight box around all points as ``(top, left, bottom, right)``.

    left/right are the min/max of column 0 and top/bottom the min/max of
    column 1, all as Python scalars.
    """
    col0, col1 = coords[:, 0], coords[:, 1]
    left, right = col0.min().item(), col0.max().item()
    top, bottom = col1.min().item(), col1.max().item()
    return top, left, bottom, right


def coords2bboxTensor(coords, extend=2):
    """
    Build a box around the mean of a set of 2-D points, returned as a tensor.

    The half-size along each axis is ``extend`` times the mean absolute
    deviation from the center, with the deviation floored at 1.

    INPUTS:
     - coords: (N, 2) tensor of pixel coordinates
     - extend: multiplier applied to the mean deviation

    RETURNS:
     - tensor ``[top, left, bottom, right]`` on the device of ``coords``;
       left/right come from column 0 and top/bottom from column 1
    """
    center = torch.mean(coords, dim=0).view(1, 2)

    half_sizes = []
    for col in (0, 1):
        # sqrt(pow(., 2)) is the absolute deviation from the center.
        dev = torch.sqrt(torch.pow(coords[:, col] - center[:, col], 2))
        half_sizes.append(max(torch.mean(dev, dim=0).detach(), 1))
    half_x, half_y = half_sizes

    left = center[:, 0] - half_x * extend
    right = center[:, 0] + half_x * extend
    top = center[:, 1] - half_y * extend
    bottom = center[:, 1] + half_y * extend

    corners = [top.item(), left.item(), bottom.item(), right.item()]
    return torch.Tensor(corners).to(coords.device)

def mask2box(masks):
    """Compute one box per mask from the first channel of each mask.

    Non-empty masks are boxed with ``coords2bbox(..., extend=2)``; empty masks
    get the placeholder box ``(-1, -1, 10, 10)``.

    Returns:
        np.ndarray: the boxes stacked with ``np.asarray``.
    """
    def _box_for(mask):
        pts = mask[0].nonzero().float()
        if pts.numel() > 0:
            return coords2bbox(pts, extend=2)
        return (-1,-1,10,10)

    return np.asarray([_box_for(mask) for mask in masks])

def tensor_mask2box(masks):
    """Compute one tight box per mask.

    Non-empty masks are boxed with ``coords2bbox_all``; empty masks get the
    placeholder box ``(-1, -1, 10, 10)``.

    Returns:
        np.ndarray: the boxes stacked with ``np.asarray``.
    """
    def _box_for(mask):
        pts = mask.nonzero().float()
        if pts.numel() > 0:
            return coords2bbox_all(pts)
        return (-1,-1,10,10)

    return np.asarray([_box_for(mask) for mask in masks])

def batch_mask2boxlist(masks):
    """
    Compute one box per mask for every image of a batch.

    Non-empty masks are boxed with ``coords2bboxTensor(..., extend=2)``;
    empty masks get the box ``[0, 0, 0, 0]``.

    Args:
        masks: Tensor b,n,h,w

    Returns: List[Tensor], one (n, 4) tensor per image. An image without any
        mask (n == 0) yields an empty (0, 4) tensor.

    """
    batch_bbox = []
    for b_masks in masks:
        boxes = []
        for mask in b_masks:
            m = mask.nonzero().float()
            if m.numel() > 0:
                box = coords2bboxTensor(m, extend=2)
            else:
                box = torch.Tensor([0,0,0,0]).to(m.device)
            boxes.append(box.unsqueeze(0))
        if boxes:
            boxes_t = torch.cat(boxes, 0)
        else:
            # torch.cat rejects an empty list; return an empty box tensor.
            boxes_t = torch.zeros((0, 4), device=b_masks.device)
        batch_bbox.append(boxes_t)

    return batch_bbox


def bboxlist2roi(bbox_list):
    """Convert a list of bboxes to roi format.

    Each box is prefixed with the index of the image it belongs to; only the
    first four box columns are kept.

    Args:
        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
            of images.

    Returns:
        Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
    """
    def _with_index(img_id, bboxes):
        num = bboxes.size(0)
        if num == 0:
            return bboxes.new_zeros((0, 5))
        img_inds = bboxes.new_full((num, 1), img_id)
        return torch.cat([img_inds, bboxes[:, :4]], dim=-1)

    per_image = [_with_index(i, b) for i, b in enumerate(bbox_list)]
    return torch.cat(per_image, 0)

def bbox2roi(bbox_list):
    """Convert a list of bboxes to roi format.

    Every row is prefixed with the position of its image in ``bbox_list``;
    columns beyond the first four (e.g. scores) are dropped.

    Args:
        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
            of images.

    Returns:
        Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
    """
    rois_per_img = []
    for batch_ind, boxes in enumerate(bbox_list):
        count = boxes.size(0)
        if count > 0:
            ind_col = boxes.new_full((count, 1), batch_ind)
            rois_per_img.append(torch.cat([ind_col, boxes[:, :4]], dim=-1))
            continue
        rois_per_img.append(boxes.new_zeros((0, 5)))
    return torch.cat(rois_per_img, 0)

def temp_interp_mask(maskseq, T):
    '''
    Expand a sparse mask sequence to a dense list of length T.

    maskseq: list of elements (RLE_mask, timestamp)
    return list of RLE_mask, length of list is T

    Positions without a mask hold an all-zero RLE mask of the size taken from
    the first element of maskseq; all such positions share the same dict
    object.
    '''
    size = maskseq[0][0]['size']
    zeros = np.asfortranarray(np.zeros(size).astype(np.uint8))
    blank_mask = mask_utils.encode(zeros)
    blank_mask['counts'] = blank_mask['counts'].decode('ascii')

    timeline = [blank_mask for _ in range(T)]
    for rle, stamp in maskseq:
        timeline[stamp] = rle
    return timeline

def mask_seq_jac(sa, sb):
    """Pairwise mean IoU between two collections of mask sequences.

    Entry ``[i, j]`` is the mean of the per-frame ``mask_utils.iou`` values
    between ``sa[i]`` and ``sb[j]``, paired frame by frame with ``zip``.

    Returns:
        np.ndarray: array of shape ``(len(sa), len(sb))``.
    """
    jac = np.zeros((len(sa), len(sb)))
    for row, seq_a in enumerate(sa):
        for col, seq_b in enumerate(sb):
            per_frame = [mask_utils.iou([ma], [mb], [False,])
                         for ma, mb in zip(seq_a, seq_b)]
            jac[row, col] = np.mean(per_frame)
    return jac
        

def skltn2mask(skltn, size):
    """Rasterise a skeleton into a mask of shape ``size`` (h, w).

    Each element of ``skltn`` is read as ``s['id'][0]``, ``s['x'][0]`` and
    ``s['y'][0]``. A trunk polygon is filled when at least three of its joints
    are present, then each limb whose two joints are both present is drawn as
    a line. The line width is 1/20 of the larger skeleton extent, at least 8.

    Returns:
        np.ndarray: float mask with value 1 on drawn pixels; all zeros for an
        empty skeleton.
    """
    h, w = size
    mask = np.zeros((h,w))

    joints = {s['id'][0]: (int(s['x'][0]), int(s['y'][0])) for s in skltn}
    if not joints:
        return mask

    # Joint numbers below are 1-based; the dict is keyed 0-based.
    trunk_ids = np.array([3,4,10,13,9])-1
    trunk_polygon = [joints[k] for k in trunk_ids if k in joints]
    trunk_polygon = np.asarray(trunk_polygon, 'int32')
    if len(trunk_polygon) > 2:
        cv2.fillConvexPoly(mask, trunk_polygon, 1)

    xs = [pt[0] for pt in joints.values()]
    ys = [pt[1] for pt in joints.values()]
    xmin, xmax = np.min(xs), np.max(xs)
    ymin, ymax = np.min(ys), np.max(ys)
    line_width = np.max([int(np.max([xmax-xmin, ymax-ymin, 0])/20),8])

    skeleton = [[10, 11], [11, 12], [9,8],
                [8,7], [10, 13], [9, 13],
                [13, 15], [10,4], [4,5],
                [5,6], [9,3], [3,2], [2,1]]

    for a, b in skeleton:
        st = joints.get(a-1, None)
        ed = joints.get(b-1, None)
        if st is not None and ed is not None:
            cv2.line(mask, st, ed, color=1, thickness=line_width)

    return mask


def pts2array(pts):
    """Pack keypoints into a (15, 3) array of ``[x, y, score]`` rows.

    Each element of ``pts`` is read as ``s['id'][0]`` (row index),
    ``s['x'][0]`` and ``s['y'][0]`` (truncated with ``int``) and
    ``s['score'][0]``. Rows without a keypoint stay zero.
    """
    arr = np.zeros((15,3))
    for s in pts:
        row = s['id'][0]
        arr[row] = (int(s['x'][0]), int(s['y'][0]), s['score'][0])
    return arr


================================================
FILE: unitrack/utils/meter.py
================================================
###################################################################
# File Name: meter.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Wed Dec 23 16:35:34 2020
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
import time


class Timer(object):
    """Wall-clock stopwatch that also tracks a running average.

    Call :meth:`tic` to start and :meth:`toc` to stop; ``toc`` returns either
    the average over all calls so far or the last interval.
    """
    def __init__(self):
        self.clear()

    def tic(self):
        # time.time() is used rather than time.clock(), which does not
        # normalize for multithreading.
        self.start_time = time.time()

    def toc(self, average=True):
        """Stop the timer and return the average (default) or last interval."""
        elapsed = time.time() - self.start_time
        self.diff = elapsed
        self.total_time += elapsed
        self.calls += 1
        self.average_time = self.total_time / self.calls
        self.duration = self.average_time if average else self.diff
        return self.duration

    def clear(self):
        """Reset every counter to zero."""
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.
        self.duration = 0.


================================================
FILE: unitrack/utils/palette.py
================================================
palette_str = '''0 0 0
128 0 0
0 128 0
128 128 0
0 0 128
128 0 128
0 128 128
128 128 128
64 0 0
191 0 0
64 128 0
191 128 0
64 0 128
191 0 128
64 128 128
191 128 128
0 64 0
128 64 0
0 191 0
128 191 0
0 64 128
128 64 128
22 22 22
23 23 23
24 24 24
25 25 25
26 26 26
27 27 27
28 28 28
29 29 29
30 30 30
31 31 31
32 32 32
33 33 33
34 34 34
35 35 35
36 36 36
37 37 37
38 38 38
39 39 39
40 40 40
41 41 41
42 42 42
43 43 43
44 44 44
45 45 45
46 46 46
47 47 47
48 48 48
49 49 49
50 50 50
51 51 51
52 52 52
53 53 53
54 54 54
55 55 55
56 56 56
57 57 57
58 58 58
59 59 59
60 60 60
61 61 61
62 62 62
63 63 63
64 64 64
65 65 65
66 66 66
67 67 67
68 68 68
69 69 69
70 70 70
71 71 71
72 72 72
73 73 73
74 74 74
75 75 75
76 76 76
77 77 77
78 78 78
79 79 79
80 80 80
81 81 81
82 82 82
83 83 83
84 84 84
85 85 85
86 86 86
87 87 87
88 88 88
89 89 89
90 90 90
91 91 91
92 92 92
93 93 93
94 94 94
95 95 95
96 96 96
97 97 97
98 98 98
99 99 99
100 100 100
101 101 101
102 102 102
103 103 103
104 104 104
105 105 105
106 106 106
107 107 107
108 108 108
109 109 109
110 110 110
111 111 111
112 112 112
113 113 113
114 114 114
115 115 115
116 116 116
117 117 117
118 118 118
119 119 119
120 120 120
121 121 121
122 122 122
123 123 123
124 124 124
125 125 125
126 126 126
127 127 127
128 128 128
129 129 129
130 130 130
131 131 131
132 132 132
133 133 133
134 134 134
135 135 135
136 136 136
137 137 137
138 138 138
139 139 139
140 140 140
141 141 141
142 142 142
143 143 143
144 144 144
145 145 145
146 146 146
147 147 147
148 148 148
149 149 149
150 150 150
151 151 151
152 152 152
153 153 153
154 154 154
155 155 155
156 156 156
157 157 157
158 158 158
159 159 159
160 160 160
161 161 161
162 162 162
163 163 163
164 164 164
165 165 165
166 166 166
167 167 167
168 168 168
169 169 169
170 170 170
171 171 171
172 172 172
173 173 173
174 174 174
175 175 175
176 176 176
177 177 177
178 178 178
179 179 179
180 180 180
181 181 181
182 182 182
183 183 183
184 184 184
185 185 185
186 186 186
187 187 187
188 188 188
189 189 189
190 190 190
191 191 191
192 192 192
193 193 193
194 194 194
195 195 195
196 196 196
197 197 197
198 198 198
199 199 199
200 200 200
201 201 201
202 202 202
203 203 203
204 204 204
205 205 205
206 206 206
207 207 207
208 208 208
209 209 209
210 210 210
211 211 211
212 212 212
213 213 213
214 214 214
215 215 215
216 216 216
217 217 217
218 218 218
219 219 219
220 220 220
221 221 221
222 222 222
223 223 223
224 224 224
225 225 225
226 226 226
227 227 227
228 228 228
229 229 229
230 230 230
231 231 231
232 232 232
233 233 233
234 234 234
235 235 235
236 236 236
237 237 237
238 238 238
239 239 239
240 240 240
241 241 241
242 242 242
243 243 243
244 244 244
245 245 245
246 246 246
247 247 247
248 248 248
249 249 249
250 250 250
251 251 251
252 252 252
253 253 253
254 254 254
255 255 255'''
import numpy as np
# Parse palette_str into an integer array: one row per line of the string,
# one column per whitespace-separated number on that line.
tensor = np.array([list(map(int, row.split())) for row in palette_str.split('\n')])


================================================
FILE: unitrack/utils/visualize.py
================================================

import cv2
import numpy as np
import imageio as io
from matplotlib import cm

import time
import PIL

import pycocotools.mask as mask_utils
from . import palette


def dump_predictions(pred, lbl_set, img, prefix):
    '''
    Colourise a soft label prediction, blend it onto ``img`` and save it.

    Files written:
        1. '<prefix>_blend.jpg' -- the image blended with the label colours
        2. the colourised label map used for evaluation: '<prefix>_mask.png',
           or ``prefix`` with 'jpg' replaced by 'png' when ``prefix[-4]`` is '.'

    NOTE: the ``lbl_set`` argument is overridden by ``palette.tensor``.

    Returns the tuple (img_with_label, pred_lbl, img_with_heatmap1).
    '''
    # The caller-supplied label set is discarded in favour of the palette.
    lbl_set = palette.tensor.astype(np.uint8)
    colour_table = np.array(lbl_set, dtype=np.int32)

    # Upsample the soft label maps; cv2.resize takes the size reversed
    # relative to the leading dimensions of img.shape.
    leading_dims = img.shape[:-1]
    pred_dist = cv2.resize(pred, leading_dims[::-1])

    # Hard label per pixel (argmax over the last axis), mapped to its colour
    pred_lbl = colour_table[np.argmax(pred_dist, axis=-1)]

    # Blend only where the label colour is non-zero; elsewhere keep the image
    img_f = np.float32(img)
    alpha = 0.5
    mask = np.float32(pred_lbl.sum(2) > 0)[:, :, None]
    blended = img_f * alpha + np.float32(pred_lbl) * (1 - alpha)
    img_with_label = mask * blended + (1 - mask) * img_f

    # Heatmap of the soft scores for label index 1 (debugging/analysis)
    heat = cv2.resize(pred_dist[..., 1], (img.shape[1], img.shape[0]),
                      interpolation=cv2.INTER_NEAREST)
    heat = cm.jet(heat)[..., :3] * 255.0
    img_with_heatmap1 = img_f * 0.5 + np.float32(heat) * 0.5

    # Save blend image for visualization
    io.imwrite('%s_blend.jpg' % prefix, np.uint8(img_with_label))

    # Super HACK-y: a '.' four characters from the end is taken to mean that
    # prefix already carries a file extension.
    imname2 = prefix.replace('jpg','png') if prefix[-4] == '.' else prefix + '_mask.png'

    # Save predicted labels for evaluation
    io.imwrite(imname2, np.uint8(pred_lbl))

    return img_with_label, pred_lbl, img_with_heatmap1


def make_gif(video, outname='/tmp/test.gif', sz=256):
    '''
    Resize every frame of ``video`` to (sz, sz) and save the frames as a GIF.

    If ``video`` has a ``shape`` attribute it is converted via ``.cpu()`` and
    ``.numpy()`` (presumably a torch tensor with values in [0, 1] -- confirm
    against callers), moved to channels-last and scaled to uint8. Otherwise it
    is iterated as a sequence of frames.

    Returns the stacked frames when ``outname`` is None; otherwise writes the
    GIF to ``outname`` and returns None.
    '''
    if hasattr(video, 'shape'):
        clip = video.cpu()
        # A leading dimension of 3 is treated as channels; swap it with dim 1.
        if clip.shape[0] == 3:
            clip = clip.transpose(0, 1)

        channels_last = clip.numpy().transpose(0, 2, 3, 1)
        video = (channels_last*255).astype(np.uint8)

    frames = [cv2.resize(frame, (sz, sz)) for frame in video]

    if outname is not None:
        io.mimsave(outname, frames, duration = 0.2)
        return None

    return np.stack(frames)

def get_color(idx):
    '''Map an integer id to a deterministic 3-tuple with channels in [0, 254].'''
    spread = idx * 17
    return tuple((factor * spread) % 255 for factor in (37, 17, 29))

def plot_tracking(image, obs, obj_ids, scores=None, frame_id=0, fps=0.):
    '''
    Draw tracked observations (boxes or masks) on a copy of ``image``.

    Args:
        image: image array; ``image.shape[1]`` scales the text size and the
            box line thickness.
        obs: sequence of observations. Each one is either a dict, which is
            decoded with ``pycocotools.mask.decode`` and alpha-blended, or a
            4-element box ``(x1, y1, w, h)``.
        obj_ids: identities indexed in step with ``obs``; each id selects the
            colour via ``get_color`` and is printed next to a box.
        scores, frame_id, fps: accepted for interface compatibility; unused.

    Returns:
        The annotated copy of ``image``. Blending a mask multiplies the image
        by a Python float, so the result is a floating-point array once any
        mask has been drawn.

    Raises:
        ValueError: if an observation is neither a dict nor of length 4.
    '''
    im = np.ascontiguousarray(np.copy(image))
    im_h, im_w = im.shape[:2]

    text_scale = max(1, image.shape[1] / 1600.)
    text_thickness = 1
    line_thickness = max(1, int(image.shape[1] / 150.))
    alpha = 0.4

    for i, ob in enumerate(obs):
        obj_id = int(obj_ids[i])

        # Test for a dict first: a mask dict that happens to hold four keys
        # must not be mistaken for an (x1, y1, w, h) box.
        is_mask = isinstance(ob, dict)
        if not is_mask and len(ob) != 4:
            raise ValueError('Observation format not supported.')

        color = get_color(obj_id)
        if is_mask:
            mask = mask_utils.decode(ob)
            mask = cv2.resize(mask, (im_w, im_h), interpolation=cv2.INTER_LINEAR)
            # Re-binarise after interpolation; add a channel axis to broadcast.
            mask = (mask > 0.5).astype(np.uint8)[:,:,None]
            mask_color = mask * color
            im = (1 - mask) * im + mask * (alpha*im + (1-alpha)*mask_color)
        else:
            x1, y1, w, h = ob
            intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h)))
            id_text = '{}'.format(obj_id)
            cv2.rectangle(im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness)
            cv2.putText(im, id_text, (intbox[0], intbox[1] + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255),
                        thickness=text_thickness)
    return im


def vis_pose(oriImg, points):
    '''
    Draw pose limbs onto ``oriImg`` in place and return it.

    ``points[0, :]`` holds the x coordinates and ``points[1, :]`` the y
    coordinates of the joints. Joint n is connected to joint ``parent[n]``;
    a limb is skipped when any of its four coordinates is negative.
    '''
    # parent[n]: index of the joint that joint n is linked to (15 joints).
    parent = (1, 0, 0, 0, 0, 1, 1, 3, 4, 5, 6, 7, 8, 9, 10)

    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0],
              [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255],
              [170,0,255],[255,0,255]]

    xs = points[0, :]
    ys = points[1, :]

    for n in range(len(xs)):
        src = int(parent[n])

        x1, y1 = int(xs[src]), int(ys[src])
        x2, y2 = int(xs[n]), int(ys[n])

        # Negative coordinates mark a joint that should not be drawn.
        if min(x1, y1, x2, y2) < 0:
            continue
        cv2.line(oriImg, (x1, y1), (x2, y2), colors[n], 8)

    return oriImg


def draw_skeleton(aa, kp, color, show_skeleton_labels=False, dataset= "PoseTrack"):
    '''
    Draw a pose skeleton and its keypoints onto ``aa`` in place.

    Args:
        aa: image array drawn on with OpenCV; ``aa.shape[1]`` sets the limb
            line thickness.
        kp: sequence of keypoints read as ``kp[j][0]`` (x), ``kp[j][1]`` (y)
            and, when present, ``kp[j][2]`` (a score). Keypoints with a
            negative x or y are skipped.
        color: colour passed to ``cv2.line`` for the limbs.
        show_skeleton_labels: when true, write the keypoint name next to each
            keypoint that has no score or a score above 0.1.
        dataset: "COCO" or "PoseTrack"; selects the limb list and the
            keypoint names.

    Raises:
        ValueError: if ``dataset`` is not one of the supported layouts.
    '''
    if dataset == "COCO":
        skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13],
                [6, 12], [7, 13], [6, 7], [6, 8], [7, 9], [8, 10],
                [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]
        kp_names = ['nose', 'l_eye', 'r_eye', 'l_ear', 'r_ear', 'l_shoulder',
                    'r_shoulder', 'l_elbow', 'r_elbow', 'l_wrist', 'r_wrist',
                    'l_hip', 'r_hip', 'l_knee', 'r_knee', 'l_ankle', 'r_ankle']
    elif dataset == "PoseTrack":
        skeleton = [[10, 11], [11, 12], [9,8], [8,7],
                    [10, 13], [9, 13], [13, 15], [10,4],
                    [4,5], [5,6], [9,3], [3,2], [2,1]]
        kp_names = ['right_ankle', 'right_knee', 'right_pelvis',
                    'left_pelvis', 'left_knee', 'left_ankle',
                    'right_wrist', 'right_elbow', 'right_shoulder',
                    'left_shoulder', 'left_elbow', 'left_wrist',
                    'upper_neck', 'nose', 'head']
    else:
        # Previously this fell through and failed later with an unbound
        # 'skeleton'; report the real problem instead.
        raise ValueError('Unsupported dataset: {!r}'.format(dataset))

    # Limbs: skeleton entries are 1-based indices into kp. A limb is drawn
    # when both ends have non-negative coordinates and, if scores are
    # present, both scores exceed 0.1.
    for i, j in skeleton:
        if kp[i-1][0] >= 0 and kp[i-1][1] >= 0 and kp[j-1][0] >= 0 and kp[j-1][1] >= 0 and \
            (len(kp[i-1]) <= 2 or (kp[i-1][2] > 0.1 and kp[j-1][2] > 0.1)):
            st = (int(kp[i-1][0]), int(kp[i-1][1]))
            ed = (int(kp[j-1][0]), int(kp[j-1][1]))
            cv2.line(aa, st, ed,  color, max(1, int(aa.shape[1]/150.)))

    # Keypoints: (0,0,255) without a score or with score > 1.1,
    # (255,0,0) for a score in (0.1, 1.1], nothing otherwise.
    for j, point in enumerate(kp):
        if point[0] >= 0 and point[1] >= 0:
            pt = (int(point[0]), int(point[1]))
            has_score = len(point) > 2
            if not has_score or point[2] > 1.1:
                cv2.circle(aa, pt, 2, (0, 0, 255), 2)
            elif point[2] > 0.1:
                cv2.circle(aa, pt, 2, (255, 0, 0), 2)

            if show_skeleton_labels and (not has_score or point[2] > 0.1):
                # Use the integer pixel position: OpenCV drawing calls take
                # integer points, and raw keypoints may be floats.
                cv2.putText(aa, kp_names[j], pt, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0))