Full Code of lxtGH/Video-K-Net for AI

main a69340321f47 cached
237 files
1.8 MB
460.2k tokens
1528 symbols
1 requests
Download .txt
Showing preview only (1,973K chars total). Download the full file or copy to clipboard to get everything.
Repository: lxtGH/Video-K-Net
Branch: main
Commit: a69340321f47
Files: 237
Total size: 1.8 MB

Directory structure:
gitextract_0ef7ckct/

├── .gitignore
├── DATASET.md
├── LICENSE
├── README.md
├── configs/
│   ├── det/
│   │   ├── _base_/
│   │   │   ├── datasets/
│   │   │   │   ├── cityscapes_panoptic.py
│   │   │   │   ├── cityscapes_step.py
│   │   │   │   ├── cityscapes_vps_clips.py
│   │   │   │   ├── cityscapes_vps_clips_trainval.py
│   │   │   │   ├── coco_instance.py
│   │   │   │   ├── coco_panoptic.py
│   │   │   │   ├── coco_panoptic_instance_annotations.py
│   │   │   │   ├── kitti_step_dvps.py
│   │   │   │   ├── kitti_step_vps.py
│   │   │   │   ├── kitti_step_vps_trainval.py
│   │   │   │   ├── mapillary_panoptic.py
│   │   │   │   └── vipseg_dvps.py
│   │   │   ├── default_runtime.py
│   │   │   ├── models/
│   │   │   │   ├── knet_citystep_s3_r50_fpn.py
│   │   │   │   ├── knet_kitti_step_s3_r50_fpn.py
│   │   │   │   ├── knet_s3_r50_deformable_fpn.py
│   │   │   │   ├── knet_s3_r50_fpn.py
│   │   │   │   ├── knet_s3_r50_fpn_panoptic.py
│   │   │   │   ├── knet_vipseg_s3_r50_fpn.py
│   │   │   │   └── video_knet_s3_r50_fpn_panoptic.py
│   │   │   └── schedules/
│   │   │       ├── schedule_10e.py
│   │   │       └── schedule_1x.py
│   │   ├── coco/
│   │   │   ├── knet_s3_r50_deformable_fpn_ms-3x_coco.py
│   │   │   ├── knet_s3_r50_fpn_ms-3x_coco-panoptic.py
│   │   │   ├── knet_s3_r50_fpn_ms-3x_coco.py
│   │   │   └── knet_s3_swin-b_deformable_fpn_ms-3x_coco.py
│   │   ├── common/
│   │   │   ├── lsj_coco_panoptic_50e.py
│   │   │   ├── mstrain_3x_coco_instance.py
│   │   │   ├── mstrain_3x_coco_panoptic_inst_anno.py
│   │   │   ├── mstrain_3x_coco_panoptic_inst_anno_detr_aug.py
│   │   │   └── mstrain_64e_city_panoptic.py
│   │   ├── knet_cityscapes_step/
│   │   │   ├── knet_s3_r50_fpn.py
│   │   │   ├── knet_s3_swin_b_fpn.py
│   │   │   └── knet_s3_swin_l_fpn.py
│   │   ├── video_knet_kitti_step/
│   │   │   ├── video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py
│   │   │   ├── video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train_8e.py
│   │   │   ├── video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
│   │   │   ├── video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
│   │   │   └── video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_update_conv_short_track_fc.py
│   │   └── video_knet_vipseg/
│   │       ├── video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py
│   │       └── video_knet_s3_swin_b_rpn_vipseg_mask_embed_link_ffn_joint_train_8e.py
│   └── video_knet_vis/
│       ├── _base_/
│       │   ├── datasets/
│       │   │   ├── coco_instance.py
│       │   │   └── youtubevis_2019.py
│       │   ├── default_runtime.py
│       │   ├── models/
│       │   │   ├── knet_track_r50.py
│       │   │   └── knet_track_r50_deformablefpn.py
│       │   └── schedules/
│       │       ├── schedule_0.75x.py
│       │       ├── schedule_1x.py
│       │       └── schedule_8e.py
│       ├── common/
│       │   └── mstrain_3x_coco_instance.py
│       └── video_knet_vis/
│           ├── knet_track_r50_1x_youtubevis.py
│           ├── knet_track_r50_deformable_fpn_1x_youtubevis.py
│           ├── knet_track_swinb_1x_youtubevis_8e.py
│           └── knet_track_swinb_deformable_1x_youtubevis.py
├── external/
│   ├── cityscape_panoptic.py
│   ├── cityscapes_step.py
│   ├── cityscapes_vps.py
│   ├── coco_panoptic.py
│   ├── dataset/
│   │   ├── dvps_pipelines/
│   │   │   ├── __init__.py
│   │   │   ├── loading.py
│   │   │   ├── transforms.py
│   │   │   └── tricks.py
│   │   ├── forecasting_pipelines/
│   │   │   ├── __init__.py
│   │   │   ├── loading.py
│   │   │   └── transforms.py
│   │   ├── mIoU.py
│   │   └── pipelines/
│   │       ├── __init__.py
│   │       ├── formatting.py
│   │       ├── loading.py
│   │       ├── test_time_aug.py
│   │       └── transforms.py
│   ├── evalhooks.py
│   ├── ext/
│   │   ├── mask.py
│   │   └── ytvos.py
│   ├── fcn_mask_head.py
│   ├── kitti_step_dvps.py
│   ├── panoptic_fpn.py
│   ├── panoptic_head.py
│   ├── semantic_seg_head.py
│   ├── semkitti_dvps.py
│   ├── test.py
│   ├── train.py
│   ├── utils.py
│   └── vipseg_dvps.py
├── knet/
│   ├── __init__.py
│   ├── cross_entropy_loss.py
│   ├── det/
│   │   ├── dice_loss.py
│   │   ├── kernel_head.py
│   │   ├── kernel_iter_head.py
│   │   ├── kernel_update_head.py
│   │   ├── knet.py
│   │   ├── mask_hungarian_assigner.py
│   │   ├── mask_pseudo_sampler.py
│   │   ├── msdeformattn_decoder.py
│   │   ├── semantic_fpn_wrapper.py
│   │   └── utils.py
│   ├── kernel_updator.py
│   └── video/
│       ├── __init__.py
│       ├── dice_loss.py
│       ├── kernel_head.py
│       ├── kernel_iter_head.py
│       ├── kernel_update_head.py
│       ├── knet.py
│       ├── knet_quansi_dense.py
│       ├── knet_quansi_dense_embed_fc.py
│       ├── knet_quansi_dense_embed_fc_joint_train.py
│       ├── knet_quansi_dense_embed_fc_toy_exp.py
│       ├── knet_quansi_dense_roi_gt_box.py
│       ├── knet_quansi_dense_roi_gt_box_joint_train.py
│       ├── knet_track_head.py
│       ├── knet_track_head_roi_align.py
│       ├── knet_uni_track.py
│       ├── mask_hungarian_assigner.py
│       ├── mask_pseudo_sampler.py
│       ├── qdtrack/
│       │   ├── builder.py
│       │   ├── losses/
│       │   │   ├── __init__.py
│       │   │   ├── l2_loss.py
│       │   │   └── multipos_cross_entropy_loss.py
│       │   ├── track/
│       │   │   ├── __init__.py
│       │   │   ├── similarity.py
│       │   │   └── transforms.py
│       │   └── trackers/
│       │       ├── __init__.py
│       │       ├── quasi_dense_embed_tracker.py
│       │       └── tao_tracker.py
│       ├── track_heads.py
│       ├── tracker.py
│       └── util.py
├── knet_vis/
│   ├── __init__.py
│   ├── det/
│   │   ├── __init__.py
│   │   ├── kernel_head.py
│   │   ├── kernel_iter_head.py
│   │   ├── kernel_update_head.py
│   │   ├── knet.py
│   │   ├── mask_hungarian_assigner.py
│   │   ├── mask_pseudo_sampler.py
│   │   ├── semantic_fpn_wrapper.py
│   │   └── utils.py
│   ├── kernel_updator.py
│   └── tracker/
│       ├── __init__.py
│       ├── kernel_frame_head.py
│       ├── kernel_frame_iter_head.py
│       ├── kernel_head.py
│       ├── kernel_iter_head.py
│       ├── kernel_update_head.py
│       ├── mask_hungarian_assigner.py
│       ├── positional_encoding.py
│       ├── semantic_fpn_wrapper3D.py
│       └── track.py
├── mmtrack/
│   ├── datasets/
│   │   ├── coco_video_dataset.py
│   │   ├── parsers/
│   │   │   ├── __init__.py
│   │   │   └── coco_video_parser.py
│   │   └── youtube_vis_dataset.py
│   ├── pipelines/
│   │   ├── __init__.py
│   │   ├── formatting.py
│   │   ├── loading.py
│   │   ├── test_time_aug.py
│   │   └── transforms.py
│   └── transform.py
├── scripts/
│   ├── kitti_step_prepare.py
│   └── visualizer.py
├── swin/
│   ├── DetectRS.py
│   ├── ckpt_convert.py
│   ├── mix_transformer.py
│   ├── swin_checkpoint.py
│   ├── swin_transformer.py
│   ├── swin_transformer_rfp.py
│   └── transformer.py
├── tools/
│   ├── dataset/
│   │   ├── cityscapes_instance_idmap.py
│   │   └── youtubevis2coco.py
│   ├── dist_step_test.sh
│   ├── dist_test.sh
│   ├── dist_train.sh
│   ├── dist_train_new.sh
│   ├── dist_vps_test.sh
│   ├── docker.sh
│   ├── eval_dstq.py
│   ├── eval_dstq_step.py
│   ├── eval_dstq_vipseg.py
│   ├── eval_dvpq_step.py
│   ├── eval_dvpq_vipseg.py
│   ├── flops_counter.py
│   ├── get_flops.py
│   ├── inference_kitti_step.sh
│   ├── slurm_test.sh
│   ├── slurm_test_dvps.sh
│   ├── slurm_test_step.sh
│   ├── slurm_test_vis.sh
│   ├── slurm_test_vps.sh
│   ├── slurm_train.sh
│   ├── test.py
│   ├── test_dvps.py
│   ├── test_step.py
│   ├── test_vps.py
│   ├── train.py
│   ├── utils/
│   │   ├── DSTQ.py
│   │   ├── STQ.py
│   │   └── cityscapesvps_eval.py
│   └── visualization.py
├── tools_vis/
│   ├── apis/
│   │   ├── __init__.py
│   │   └── test.py
│   ├── dist_test_whole_video.sh
│   ├── docker.sh
│   ├── slurm_test_vis.sh
│   ├── test.py
│   └── test_whole_video.py
└── unitrack/
    ├── __init__.py
    ├── basetrack.py
    ├── box.py
    ├── core/
    │   ├── __init__.py
    │   ├── association/
    │   │   ├── __init__.py
    │   │   └── matching.py
    │   ├── motion/
    │   │   └── kalman_filter.py
    │   └── propagation/
    │       ├── __init__.py
    │       ├── propagate_box.py
    │       ├── propagate_mask.py
    │       └── propagate_pose.py
    ├── mask.py
    ├── mask_with_train_embs.py
    ├── model/
    │   ├── __init__.py
    │   ├── functional.py
    │   ├── hrnet.py
    │   ├── model.py
    │   ├── random_feat_generator.py
    │   └── resnet.py
    ├── multitracker.py
    └── utils/
        ├── __init__.py
        ├── box.py
        ├── io.py
        ├── log.py
        ├── mask.py
        ├── meter.py
        ├── palette.py
        └── visualize.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
work_dir/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

data/
data
.vscode
.idea
.DS_Store

# custom
*.pkl
*.pkl.json
*.log.json

# Pytorch
*.pth
*.py~
*.sh~

debug/*
vis/
analysis/*
pretrain/*


================================================
FILE: DATASET.md
================================================
Please prepare the data structure as the following instruction:

The final dataset folder should be like this. 
```
root 
├── data
│   ├──  kitti-step
│   ├──  coco
│   ├──  VIPSeg
│   ├──  youtube_vis_2019
│   ├──  cityscapes
```

### [VPS] KITTI-STEP

Download the KITTI-STEP from the official website. 

Then run the script `scripts/kitti_step_prepare.py`.
You will get the format shown below.
You can also download our pre-processed data from https://huggingface.co/LXT/VideoK-Net/tree/main

```
├── kitti-step
│   ├──  video_sequence
│   │   ├── train
            ├──000018_000331_leftImg8bit.png
            ├──000018_000331_panoptic.png
            ├──****
│   │   ├── val
│   │   ├── test 
```


### [VPS] VIPSeg

Download the original dataset from the official repo.\
Following the official repo, we use resized videos for training and evaluation (the short side of the input is set to 720 while the aspect ratio is kept).

```
├── VIPSeg
│   ├──  images
│   │   ├── 1241_qYvEuwrSiXc
        │      ├──*.jpg
│   ├──  panomasks 
│   │   ├── 1241_qYvEuwrSiXc
        │      ├──*.png
│   ├──  panomasksRGB 
```


### [VIS] Youtube-VIS-2019
We use pre-processed json files that follow the mmtracking codebase.
See "tools/dataset/youtubevis2coco.py".

```
├── youtube_vis_2019
│   ├── annotations
│   │   ├── train.json
│   │   ├── valid.json
│   │   ├── youtube_vis_2019_train.json
│   │   ├── youtube_vis_2019_valid.json
│   ├── train
│   │   ├──JPEGImages
│   │   │   ├──video folders
│   ├── valid
│   │   ├──JPEGImages
│   │   │   ├──video folders
```


### [VSS] VSPW

To do


### [VPS] Cityscapes 

For Cityscapes-VPS and Cityscapes-DVPS, the Video K-Net models will not be released due to patent issues and internal usage.

We suggest that followers refer to our related work instead: ECCV-2022, PolyphonicFormer: A Unified Framework For Panoptic Segmentation + Depth Estimation (winner of ICCV-2021 BMTT workshop)
(https://github.com/HarborYuan/PolyphonicFormer)



## Image DataSet For Pretraining K-Net

### COCO dataset

COCO is one of the most common datasets. Its panoptic annotations contain 80 thing classes and 53 stuff classes.

The dataset format is the same as in the original [Detectron2](https://github.com/facebookresearch/detectron2),
including the panoptic segmentation preparation [scripts](https://github.com/facebookresearch/detectron2/blob/master/datasets/prepare_panoptic_fpn.py).

Then the final folder is like this:
```
├── coco
│   ├── annotations
│   │   ├── panoptic_{train,val}2017.json
│   │   ├── instance_{train,val}2017.json
│   ├── train2017
│   ├── val2017
│   ├── panoptic_{train,val}2017/  # png annotations
```

### Cityscapes dataset

Cityscapes dataset is a high-resolution road-scene dataset which contains 19 classes. 
(8 thing classes and 11 stuff classes). 2975 images for training, 500 images for validation and 1525 images for testing.

Preparing the Cityscapes dataset takes three steps:

1, Convert segmentation id map(origin label id maps) to trainId maps (id ranges: 0-18 for training) using 
the official scripts [repo](https://github.com/mcordts/cityscapesScripts)

2, Then run `python dataset/prepare_cityscapes.py` to generate the COCO-like annotations. 
This annotations can be used for Instance Segmentation training.

using csCreateTrainIdLabelImgs.py

and put the instancesonly_filtered_gtFine_train.json into annotations folder


3, For the Panoptic Segmentation dataset, generate the json file 

using csCreatePanopticImgs.py 

or you can download our transformed .json and .png files via link: () and put the 
json file into the annotations folder. 

Then the final folder is like this:

```
├── cityscapes
│   ├── annotations
│   │   ├── instancesonly_filtered_gtFine_train.json # coco instance annotation file(COCO format)
│   │   ├── instancesonly_filtered_gtFine_val.json
│   │   ├── cityscapes_panoptic_train.json  # panoptic json file 
│   │   ├── cityscapes_panoptic_val.json  
│   ├── leftImg8bit
│   ├── gtFine
│   │   ├──cityscapes_panoptic_{train,val}/  # png annotations
│   │   
```


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2022 Xiangtai  Lee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# Video K-Net: A Simple, Strong, and Unified Baseline for Video Segmentation (CVPR-2022, oral) 
## [Paper](https://arxiv.org/abs/2204.04656), [Slides](./slides/Video-KNet-cvpr-slides-10-25-version.pptx), [Poster](./slides/cvpr22_poster_lxt_zww_pjm.pdf), [Video](https://www.youtube.com/watch?v=LIEyp_czu20&t=3s)

[Xiangtai Li](https://lxtgh.github.io/),
[Wenwei Zhang](https://zhangwenwei.cn/),
[Jiangmiao Pang](https://oceanpang.github.io/),
[Kai Chen](https://chenkai.site/), 
[Guangliang Cheng](https://scholar.google.com/citations?user=FToOC-wAAAAJ),
[Yunhai Tong](https://scholar.google.com/citations?user=T4gqdPkAAAAJ&hl=zh-CN),
[Chen Change Loy](https://www.mmlab-ntu.com/person/ccloy/).

We introduce Video K-Net, a simple, strong, and unified framework for fully end-to-end dense video segmentation. 

The method is built upon K-Net, a method of unifying image segmentation via a group of learnable kernels.

This project contains the training and testing code of Video K-Net for both VPS (Video Panoptic Segmentation), 
VSS(Video Semantic Segmentation), VIS(Video Instance Segmentation).

To the best of our knowledge, our Video K-Net is the first open-sourced method that supports three different video segmentation tasks (VIS, VPS, VSS) for Video Scene Understanding.

## News! Video K-Net is acknowledged as a strong baseline for CVPR-2023 workshop ["The 2nd Pixel-level Video Understanding in the Wild"](https://www.vspwdataset.com/Workshop%202023.html). 
## News! Video K-Net also supports [VIP-Seg](https://github.com/VIPSeg-Dataset/VIPSeg-Dataset) dataset(CVPR-2022). It also achieves the new state-of-the-art result.


### Environment and DataSet Preparation 
Our codebase is based on MMDetection and MMSegmentation. Parts of the code are borrowed from MMTracking and UniTrack.

- MIM >= 0.1.1
- MMCV-full >= v1.3.8
- MMDetection == v2.18.0
- timm
- scipy
- panopticapi

See the [DATASET.md](https://github.com/lxtGH/Video-K-Net/blob/main/DATASET.md)

knet folder contains the Video K-Net for VPS.

knet_vis folder contains the Video K-Net for VIS.



### Pretrained CKPTs and Trained Models

We provide the pretrained models for VPS and VIS.

Baidu Yun Link: [here](https://pan.baidu.com/s/12dIinkAF3o60fcAoggVhjQ)  Code:i034

One Drive Link: [here](https://1drv.ms/u/s!Ai4mxaXd6lVBgSCTUS0QWNim2zGx?e=uceSee)

The pretrained models are provided to train the Video K-Net.

The trained models are also provided for play and test.



### [VPS] KITTI-STEP

1. First pretrain K-Net on the Cityscapes-STEP dataset. As shown in the original STEP paper (Appendix) and in our own experiments, this step is very important for improving the segmentation performance.
You can also use our trained model for verification.

Cityscape-STEP follows the format of STEP: 17 stuff classes and 2 thing classes. 

```bash
# train cityscapes step panoptic segmentation models
sh ./tools/slurm_train.sh $PARTITION knet_step configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py $WORK_DIR --no-validate
```

2. Then train the Video K-Net on KITTI-STEP. We have provided the pretrained models from Cityscapes of Video K-Net.

For slurm users:

```bash
# train Video K-Net on KITTI-step using R-50
GPUS=8 sh ./tools/slurm_train.sh $PARTITION video_knet_step configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py $WORK_DIR --no-validate --load-from /path_to_knet_step_city_r50
```

```bash
# train Video K-Net on KITTI-step using Swin-base
GPUS=16 GPUS_PER_NODE=8 sh ./tools/slurm_train.sh $PARTITION video_knet_step configs/det/video_knet_kitti_step/video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py $WORK_DIR --no-validate --load-from /path_to_knet_step_city_r50
```

Our models are trained with two V100 machines. 

For Local machine:

```bash
# train Video K-Net on KITTI-step with 8 GPUs
sh ./tools/dist_train.sh video_knet_step configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py 8 $WORK_DIR --no-validate
```


3. Testing and Demo.

We provide both VPQ and STQ metrics to evaluate VPS models. 

```bash
# test locally 
sh ./tools/dist_step_test.sh configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py $MODEL_DIR 
```

We also dump the colored images for debug.

```bash
# eval STEP STQ
python tools/eval_dstq_step.py result_path gt_path
```

```bash
# eval STEP VPQ
python tools/eval_dvpq_step.py result_path gt_path
```

#### Toy Video K-Net 

As shown in the paper, we also provide a toy Video K-Net in knet/video/knet_quansi_dense_embed_fc_toy_exp.py. 
You can use the K-Net pre-trained on image-level KITTI-STEP without tracking.


### [VIS] YouTube-VIS-2019

1. First download the pre-trained image K-Net instance segmentation models. All the models are pretrained on COCO, which is
a common step. You can also pretrain them yourself. We also provide the config for pretraining.

For slurm users:

```bash
# train K-Net instance segmentation models on COCO using R-50
GPUS=8 sh ./tools/slurm_train.sh $PARTITION knet_instance configs/det/coco/knet_s3_r50_fpn_ms-3x_coco.py $WORK_DIR 
```

2. Then train the Video K-Net in a clip-wise manner. 

```bash
# train Video K-Net VIS models using R-50
GPUS=8 sh ./tools/slurm_train.sh $PARTITION video_knet_vis configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py $WORK_DIR --load-from /path_to_knet_instance_coco
```

3. To evaluate the results of Video K-Net on VIS, dump the prediction results for submission to the CodaLab server. 

```bash
# test Video K-Net VIS models using R-50
GPUS=8 sh tools_vis/dist_test_whole_video.sh $PARTITION video_knet_vis configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py $WORK_DIR --format-only
```
The result json is dumped into the root of this codebase. 

### [VPS] VIP-Seg

1. First Download the pre-trained Image K-Net panoptic segmentation models. All the models are pretrained on COCO which is
a common step following VIP-Seg. You can also pretrain it by yourself. We also provide the config for pretraining.
```bash
# train K-Net on COCO Panoptic Segmentation
GPUS=8 sh ./tools/slurm_train.sh $PARTITION knet_coco configs/det/coco/knet_s3_r50_fpn_ms-3x_coco-panoptic.py $WORK_DIR 
```

2. Train the Video K-Net on the VIP-Seg dataset. 
```bash
# train Video K-Net on VIP-Seg
GPUS=8 sh ./tools/slurm_train.sh $PARTITION video_knet_vis configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py $WORK_DIR --load-from /path/knet_coco_pretrained_r50
```

3. Test the Video K-Net on VIP-Seg val dataset.
```bash
# test locally on VIP-Seg
sh ./tools/dist_step_test.sh configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py $MODEL_DIR 
```

We also dump the colored images for debug.

```bash
# eval VIP-Seg STQ
python tools/eval_dstq_vipseg.py result_path gt_path
```

```bash
# eval VIP-Seg VPQ
python tools/eval_dvpq_vipseg.py result_path gt_path
```


## Visualization Results


### Results on KITTI-STEP DataSet



### Results on VIP-Seg DataSet



### Results on YouTube-VIS DataSet



### Short term segmentation and tracking results on Cityscapes VPS dataset.

Images (left), Video K-Net (middle), Ground Truth (right)
![Alt Text](./figs/cityscapes_vps_video_1_20220318131729.gif)

![Alt Text](./figs/cityscapes_vps_video_2_20220318132943.gif)

### Long term segmentation and tracking results on STEP dataset.

![Alt Text](./figs/step_video_1_20220318133227.gif)

![Alt Text](./figs/step_video_2_20220318133423.gif)


## Related Project and Acknowledgement
## Citing Video K-Net :pray:

If you use our codebase in your research or for the CVPR-2023 pixel-level video workshop, please use the following BibTeX entries.

NIPS-2021, K-Net: Unified Segmentation: Our Image baseline (https://github.com/ZwwWayne/K-Net)

ECCV-2022, PolyphonicFormer: A Unified Framework For Panoptic Segmentation + Depth Estimation (winner of ICCV-2021 BMTT workshop)
(https://github.com/HarborYuan/PolyphonicFormer)

```bibtex
@inproceedings{li2022videoknet,
  title={Video k-net: A simple, strong, and unified baseline for video segmentation},
  author={Li, Xiangtai and Zhang, Wenwei and Pang, Jiangmiao and Chen, Kai and Cheng, Guangliang and Tong, Yunhai and Loy, Chen Change},
  booktitle={CVPR},
  year={2022}
}

@article{zhang2021k,
  title={K-net: Towards unified image segmentation},
  author={Zhang, Wenwei and Pang, Jiangmiao and Chen, Kai and Loy, Chen Change},
  journal={NeurIPS},
  year={2021}
}
```



================================================
FILE: configs/det/_base_/datasets/cityscapes_panoptic.py
================================================
# Dataset settings: Cityscapes image-level panoptic segmentation.
dataset_type = 'CityscapesPanopticDataset'
data_root = 'data/cityscapes/'

# Normalisation arguments, unpacked into every `Normalize` step below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Training pipeline: load the image with box, mask and semantic annotations,
# resize using multiscale_mode='range' over the two listed scales, flip with
# ratio 0.5, normalise, pad to a multiple of 32 and collect the training keys.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(
        type='Resize',
        img_scale=[(2048, 800), (2048, 1024)],
        multiscale_mode='range',
        keep_ratio=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
        ],
    ),
]

# Test pipeline: a single (2048, 1024) scale with flipping disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 1024),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    # The training split is wrapped in a RepeatDataset with times=8.
    train=dict(
        type='RepeatDataset',
        times=8,
        dataset=dict(
            type=dataset_type,
            # Two annotation files: instance json plus panoptic json.
            ann_file=dict(
                ins_ann=data_root
                + 'annotations/instancesonly_filtered_gtFine_train.json',
                panoptic_ann=data_root
                + 'annotations/cityscapes_panoptic_train.json',
            ),
            img_prefix=data_root + 'leftImg8bit/train/',
            # NOTE(review): train points at 'gtFine/train' while val/test
            # point at 'gtFine/cityscapes_panoptic_val' -- confirm that both
            # layouts are intended.
            seg_prefix=data_root + 'gtFine/train',
            pipeline=train_pipeline,
        ),
    ),
    # val and test are configured identically (both read the val split).
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root
            + 'annotations/instancesonly_filtered_gtFine_val.json',
            panoptic_ann=data_root
            + 'annotations/cityscapes_panoptic_val.json',
        ),
        img_prefix=data_root + 'leftImg8bit/val/',
        seg_prefix=data_root + 'gtFine/cityscapes_panoptic_val',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root
            + 'annotations/instancesonly_filtered_gtFine_val.json',
            panoptic_ann=data_root
            + 'annotations/cityscapes_panoptic_val.json',
        ),
        img_prefix=data_root + 'leftImg8bit/val/',
        seg_prefix=data_root + 'gtFine/cityscapes_panoptic_val',
        pipeline=test_pipeline,
    ),
)

evaluation = dict(metric=['panoptic'])


================================================
FILE: configs/det/_base_/datasets/cityscapes_step.py
================================================
# Dataset settings: Cityscapes-STEP (dataset class `CityscapesSTEP`).
dataset_type = 'CityscapesSTEP'
data_root = 'data/cityscapes'

# Normalisation arguments, unpacked into every `Normalize` step below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Training pipeline.
# NOTE(review): `cherry=[11, 13]` / `stuff_nums=11` presumably select the
# class ids handled as things -- confirm against the
# LoadAnnotationsInstanceMasks / KNetInsAdapterCherryPick implementations.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotationsInstanceMasks', cherry=[11, 13]),
    dict(type='KNetInsAdapterCherryPick', stuff_nums=11, cherry=[11, 13]),
    dict(
        type='Resize',
        img_scale=(1024, 2048),
        ratio_range=[0.5, 2.0],
        keep_ratio=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='RandomCrop', crop_size=(1024, 2048)),
    dict(type='Normalize', **img_norm_cfg),
    # Images and masks are padded with 0, the semantic map with 255.
    dict(
        type='PadFutureMMDet',
        size_divisor=32,
        pad_val=dict(img=0, masks=0, seg=255),
    ),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_masks', 'gt_labels', 'gt_semantic_seg'],
        meta_keys=(
            'ori_shape',
            'img_shape',
            'pad_shape',
            'scale_factor',
            'flip',
            'flip_direction',
            'img_norm_cfg',
        ),
    ),
]

# Test pipeline: scale factor 1.0 only, flipping disabled, no Resize step.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        scale_factor=[1.0],
        flip=False,
        transforms=[
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                ],
            ),
        ],
    ),
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=2,
    # The training split is wrapped in a RepeatDataset with times=8.
    train=dict(
        type='RepeatDataset',
        times=8,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            split='train',
            test_mode=False,
            pipeline=train_pipeline,
        ),
    ),
    # val and test are configured identically (both use split='val').
    val=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        test_mode=True,
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        test_mode=True,
        pipeline=test_pipeline,
    ),
)

# No evaluation options are set here.
evaluation = dict()


================================================
FILE: configs/det/_base_/datasets/cityscapes_vps_clips.py
================================================
# Dataset settings: Cityscapes-VPS clips. Training uses the video dataset
# class; val/test use the image-level panoptic dataset class.
dataset_type = 'CityscapesVPSDataset'
data_root = 'data/cityscapes_vps/'
dataset_type_test = 'CityscapesPanopticDataset'

# Normalisation arguments, unpacked into the normalisation steps below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Training pipeline built from the sequence ("Seq*") transforms; flip and
# crop are configured with share_params=True.
train_pipeline = [
    dict(type='LoadMultiImagesFromFile'),
    dict(
        type='SeqLoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(
        type='SeqResize',
        img_scale=[(512, 1024), (2048, 4096)],
        multiscale_mode='range',
        keep_ratio=True,
    ),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
    dict(type='SeqRandomCrop', crop_size=(1024, 1024), share_params=True),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(type='SeqPad', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    ),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]

# Test pipeline: single (2048, 1024) scale, flipping disabled; both `img`
# and `ref_img` are converted to tensors and collected.
test_pipeline = [
    dict(type='LoadRefImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=[(2048, 1024)],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img', 'ref_img']),
            dict(type='Collect', keys=['img', 'ref_img']),
        ],
    ),
]

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    # The training split is wrapped in a RepeatDataset with times=8.
    train=dict(
        type='RepeatDataset',
        times=8,
        dataset=dict(
            type=dataset_type,
            ann_file=dict(
                ins_ann=data_root + 'instances_train_city_vps_rle.json',
                panoptic_ann=data_root + 'panoptic_im_train_city_vps.json',
            ),
            img_prefix=data_root + 'train/img/',
            seg_prefix=data_root + 'train/labelmap/',
            pipeline=train_pipeline,
            # NOTE(review): presumably relative frame offsets used to pick
            # reference frames -- confirm against CityscapesVPSDataset.
            offsets=[-1, +1],
        ),
    ),
    # NOTE(review): val shares test_pipeline (which loads a reference image)
    # but, unlike test, sets no `ref_prefix` -- confirm this is intended.
    val=dict(
        type=dataset_type_test,
        ann_file=dict(
            ins_ann=data_root + 'instances_val_city_vps_rle.json',
            panoptic_ann=data_root + 'panoptic_gt_val_city_vps.json',
            vps=True,
        ),
        img_prefix=data_root + 'val/img/',
        seg_prefix=data_root + 'val/panoptic_video/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type_test,
        ann_file=dict(
            ins_ann=data_root + 'instances_val_city_vps_rle.json',
            panoptic_ann=data_root + 'panoptic_gt_val_city_vps.json',
            vps=True,
        ),
        # Images for validation and reference images share one folder.
        img_prefix=data_root + 'val/img_all/',
        ref_prefix=data_root + 'val/img_all/',
        nframes_span_test=30,
        pipeline=test_pipeline,
    ),
)

evaluation = dict(metric=['panoptic'])

================================================
FILE: configs/det/_base_/datasets/cityscapes_vps_clips_trainval.py
================================================
dataset_type = 'CityscapesVPSDataset'
data_root = 'data/cityscapes_vps/'
# Not referenced in this file; retained so the config keeps exposing the same
# top-level names as before.
dataset_type_test = "CityscapesPanopticDataset"

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# Training pipeline built from the sequence-aware (Seq*) transforms; the flip
# and crop steps are configured with share_params=True.
train_pipeline = [
    dict(type='LoadMultiImagesFromFile'),
    dict(type='SeqLoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
    dict(type='SeqResize', img_scale=[(512, 1024), (2048, 4096)], multiscale_mode='range', keep_ratio=True),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
    dict(type='SeqRandomCrop', crop_size=(1024, 2048), share_params=True),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(type='SeqPad', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg', 'gt_instance_ids']),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]

# Inference pipeline: one (2048, 1024) scale, flip disabled; both 'img' and
# 'ref_img' are converted to tensors and collected.
test_pipeline = [
    dict(type='LoadRefImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=[(2048, 1024)],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img', 'ref_img']),
            dict(type='Collect', keys=['img', 'ref_img']),
        ])
]

data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    # Train on the concatenation of the train and val annotations, wrapped in
    # RepeatDataset with times=8.
    train=dict(
        type='RepeatDataset',
        times=8,
        dataset=dict(
            type='ConcatDataset',
            separate_eval=False,
            datasets=[
                dict(
                    type=dataset_type,
                    ann_file=dict(
                        ins_ann=data_root + 'instances_train_city_vps_rle.json',
                        panoptic_ann=data_root + 'panoptic_im_train_city_vps.json',
                    ),
                    img_prefix=data_root + 'train/img/',
                    seg_prefix=data_root + 'train/labelmap/',
                    pipeline=train_pipeline,
                    offsets=[-1, +1],
                ),
                dict(
                    type=dataset_type,
                    ann_file=dict(
                        ins_ann=data_root + 'instances_val_city_vps_rle.json',
                        panoptic_ann=data_root + 'panoptic_gt_val_city_vps.json',
                    ),
                    img_prefix=data_root + 'val/img/',
                    seg_prefix=data_root + 'val/labelmap/',
                    pipeline=train_pipeline,
                    offsets=[-1, +1],
                ),
            ],
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'instances_val_city_vps_rle.json',
            panoptic_ann=data_root + 'panoptic_gt_val_city_vps.json',
            vps=True,
        ),
        img_prefix=data_root + 'val/img_all/',  # img for validation
        ref_prefix=data_root + 'val/img_all/',  # ref_images
        nframes_span_test=30,
        pipeline=test_pipeline,
    ),
    # Mirrors `val` so that tools which read `data.test` can be run with this
    # config as well.
    # NOTE(review): `val`/`test` use `dataset_type` here, whereas the `test`
    # split of cityscapes_vps_clips.py uses `dataset_type_test` -- confirm
    # which dataset class is intended.
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'instances_val_city_vps_rle.json',
            panoptic_ann=data_root + 'panoptic_gt_val_city_vps.json',
            vps=True,
        ),
        img_prefix=data_root + 'val/img_all/',  # img for testing
        ref_prefix=data_root + 'val/img_all/',  # ref_images
        nframes_span_test=30,
        pipeline=test_pipeline,
    ),
)

evaluation = dict(metric=['panoptic'])

================================================
FILE: configs/det/_base_/datasets/coco_instance.py
================================================
dataset_type = 'CocoDataset'
data_root = 'data/coco/'

# Normalisation statistics unpacked into the Normalize steps below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Training pipeline: boxes and masks are loaded and collected.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
    ),
    dict(
        type='Resize',
        img_scale=(1333, 800),
        keep_ratio=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'],
    ),
]

# Inference pipeline: one (1333, 800) scale, flip disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# `val` and `test` both point at the val2017 annotations and images.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=f'{data_root}annotations/instances_train2017.json',
        img_prefix=f'{data_root}train2017/',
        pipeline=train_pipeline,
    ),
    val=dict(
        type=dataset_type,
        ann_file=f'{data_root}annotations/instances_val2017.json',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=f'{data_root}annotations/instances_val2017.json',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
)

# Only the 'segm' metric is evaluated: K-Net does not predict bounding boxes.
evaluation = dict(metric=['segm'])


================================================
FILE: configs/det/_base_/datasets/coco_panoptic.py
================================================
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
# Normalisation statistics unpacked into the Normalize steps below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# Training pipeline: boxes, masks and the semantic map are loaded and
# collected.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
]
# Inference pipeline: one (1333, 800) scale, flip disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            # No flip_ratio here: the enclosing MultiScaleFlipAug(flip=False)
            # already decides the flip state, so a probability on RandomFlip
            # would never be consulted at test time.
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
# `val` and `test` both point at the val2017 annotations and images.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_train2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_train2017.json'),
        img_prefix=data_root + 'train2017/',
        seg_prefix=data_root + 'panoptic_stuff_train2017/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_val2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_val2017.json'),
        seg_prefix=data_root + 'panoptic_val2017/',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_val2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_val2017.json'),
        seg_prefix=data_root + 'panoptic_val2017/',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline))

evaluation = dict(metric=['segm', 'panoptic'])


================================================
FILE: configs/det/_base_/datasets/coco_panoptic_instance_annotations.py
================================================
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
# Normalisation statistics unpacked into the Normalize steps below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# Training pipeline: boxes, masks and the semantic map are loaded and
# collected.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
]
# Inference pipeline: one (1333, 800) scale, flip disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            # No flip_ratio here: the enclosing MultiScaleFlipAug(flip=False)
            # already decides the flip state, so a probability on RandomFlip
            # would never be consulted at test time.
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
# `val` and `test` both point at the val2017 annotations and images.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_train2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_train2017.json'),
        img_prefix=data_root + 'train2017/',
        seg_prefix=data_root + 'panoptic_stuff_train2017/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_val2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_val2017.json'),
        seg_prefix=data_root + 'panoptic_val2017/',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/instances_val2017.json',
            panoptic_ann=data_root + 'annotations/panoptic_val2017.json'),
        seg_prefix=data_root + 'panoptic_val2017/',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline))

evaluation = dict(metric=['segm', 'panoptic'])


================================================
FILE: configs/det/_base_/datasets/kitti_step_dvps.py
================================================
dataset_type = 'KITTISTEPDVPSDataset'
data_root = 'data/kitti-step'

# Normalisation statistics unpacked into the normalize steps below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=False,
)

# KITTI frames come in two sizes: 1226 x 370 and 1241 x 376.
# Depth-aware training pipeline; the resize and random-crop steps are left
# disabled (kept below as comments).
train_pipeline = [
    dict(type='LoadMultiImagesDirect'),
    dict(
        type='LoadMultiAnnotationsDirect',
        with_depth=True,
        divisor=-1,
        cherry_pick=True,
        cherry=[11, 13],
    ),
    # dict(type='SeqResizeWithDepth', img_scale=(370, 1226), ratio_range=[1.0, 2.0], keep_ratio=True),
    dict(type='SeqFlipWithDepth', flip_ratio=0.5),
    # dict(type='SeqRandomCropWithDepth', crop_size=(352, 1024), share_params=True),
    dict(type='SeqNormalizeWithDepth', **img_norm_cfg),
    dict(type='SeqPadWithDepth', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_depth',
            'gt_instance_ids',
        ],
    ),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]

# Inference pipeline: scale factor 1.0, flip disabled, no resize step.
test_pipeline = [
    dict(type='LoadImgDirect'),
    dict(
        type='MultiScaleFlipAug',
        scale_factor=[1.0],
        flip=False,
        transforms=[
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                ],
            ),
        ],
    ),
]

# `val` and `test` are configured identically (split='val', test_mode=True).
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=4,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            split='train',
            ref_seq_index=None,
            test_mode=False,
            pipeline=train_pipeline,
            with_depth=True,
        ),
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=True,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=True,
    ),
)

# No evaluation options are set in this base config.
evaluation = dict()


================================================
FILE: configs/det/_base_/datasets/kitti_step_vps.py
================================================
dataset_type = 'KITTISTEPDVPSDataset'
data_root = 'data/kitti-step'

# Normalisation statistics unpacked into the normalize steps below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=False,
)

# KITTI frames come in two sizes: 1226 x 370 and 1241 x 376.
# 384 x 1248 is the smallest 32-divisible size that covers them.
train_pipeline = [
    dict(type='LoadMultiImagesDirect'),
    dict(
        type='LoadMultiAnnotationsDirect',
        with_depth=False,
        divisor=-1,
        cherry_pick=True,
        cherry=[11, 13],
    ),
    dict(
        type='SeqResizeWithDepth',
        img_scale=(384, 1248),
        ratio_range=[0.5, 2.0],
        keep_ratio=True,
    ),
    dict(type='SeqFlipWithDepth', flip_ratio=0.5),
    dict(
        type='SeqRandomCropWithDepth',
        crop_size=(384, 1248),
        share_params=True,
    ),
    dict(type='SeqNormalizeWithDepth', **img_norm_cfg),
    dict(type='SeqPadWithDepth', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    ),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]

# Inference pipeline: scale factor 1.0, flip disabled, no resize step.
test_pipeline = [
    dict(type='LoadImgDirect'),
    dict(
        type='MultiScaleFlipAug',
        scale_factor=[1.0],
        flip=False,
        transforms=[
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                    'filename',
                ],
            ),
        ],
    ),
]

# `val` and `test` are configured identically (split='val', test_mode=True).
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=4,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            split='train',
            ref_seq_index=None,
            test_mode=False,
            pipeline=train_pipeline,
            with_depth=False,
        ),
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=False,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=False,
    ),
)

# No evaluation options are set in this base config.
evaluation = dict()


================================================
FILE: configs/det/_base_/datasets/kitti_step_vps_trainval.py
================================================
dataset_type = 'KITTISTEPDVPSDataset'
data_root = 'data/kitti-step'

# Normalisation statistics unpacked into the normalize steps below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=False,
)

# KITTI frames come in two sizes: 1226 x 370 and 1241 x 376.
# 384 x 1248 is the smallest 32-divisible size that covers them.
train_pipeline = [
    dict(type='LoadMultiImagesDirect'),
    dict(
        type='LoadMultiAnnotationsDirect',
        with_depth=False,
        divisor=-1,
        cherry_pick=True,
        cherry=[11, 13],
    ),
    dict(
        type='SeqResizeWithDepth',
        img_scale=(384, 1248),
        ratio_range=[0.5, 2.0],
        keep_ratio=True,
    ),
    dict(type='SeqFlipWithDepth', flip_ratio=0.5),
    dict(
        type='SeqRandomCropWithDepth',
        crop_size=(384, 1248),
        share_params=True,
    ),
    dict(type='SeqNormalizeWithDepth', **img_norm_cfg),
    dict(type='SeqPadWithDepth', size_divisor=32),
    dict(
        type='VideoCollect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    ),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]

# Inference pipeline: scale factor 1.0, flip disabled, no resize step.
test_pipeline = [
    dict(type='LoadImgDirect'),
    dict(
        type='MultiScaleFlipAug',
        scale_factor=[1.0],
        flip=False,
        transforms=[
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                    'filename',
                ],
            ),
        ],
    ),
]

data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    # Training concatenates the 'train' and 'val' splits (both with
    # test_mode=False) and wraps the result in RepeatDataset with times=4.
    train=dict(
        type='RepeatDataset',
        times=4,
        dataset=dict(
            type='ConcatDataset',
            separate_eval=False,
            datasets=[
                dict(
                    type=dataset_type,
                    data_root=data_root,
                    split='train',
                    ref_seq_index=None,
                    test_mode=False,
                    pipeline=train_pipeline,
                    with_depth=False,
                ),
                dict(
                    type=dataset_type,
                    data_root=data_root,
                    split='val',
                    ref_seq_index=None,
                    test_mode=False,
                    pipeline=train_pipeline,
                    with_depth=False,
                ),
            ],
        ),
    ),
    # `val` and `test` are configured identically.
    val=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=False,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
        with_depth=False,
    ),
)

# No evaluation options are set in this base config.
evaluation = dict()


================================================
FILE: configs/det/_base_/datasets/mapillary_panoptic.py
================================================
dataset_type = 'MapillaryPanopticDataset'
data_root = 'data/mapillary/'
# Normalisation statistics unpacked into the Normalize steps below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# Training pipeline: multi-scale resize ('range' mode) followed by a
# 1024 x 1024 random crop and a random flip.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
    dict(type='Resize', img_scale=[(1024, 4096), (2048, 4096)], multiscale_mode='range', keep_ratio=True),
    dict(type='RandomCrop', crop_size=(1024, 1024)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
]

# Inference pipeline: one (2048, 4096) scale, flip disabled.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 4096),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            # No flip_ratio here: the enclosing MultiScaleFlipAug(flip=False)
            # already decides the flip state, so a probability on RandomFlip
            # would never be consulted at test time.
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]

# `val` and `test` both point at the validation annotations and images.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/coco/training.json',
            panoptic_ann=data_root + 'annotations/panoptic_train.json'
        ),
        img_prefix=data_root + 'training/images',
        seg_prefix=data_root + 'training/panoptic_stuff_train',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/coco/validation.json',
            panoptic_ann=data_root + 'annotations/panoptic_val.json'),
        seg_prefix=data_root + 'validation/panoptic',
        img_prefix=data_root + 'validation/images',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=data_root + 'annotations/coco/validation.json',
            panoptic_ann=data_root + 'annotations/panoptic_val.json'),
        seg_prefix=data_root + 'validation/panoptic',
        img_prefix=data_root + 'validation/images',
        pipeline=test_pipeline))

evaluation = dict(metric=['segm', 'panoptic'])


================================================
FILE: configs/det/_base_/datasets/vipseg_dvps.py
================================================
dataset_type = 'VIPSegDVPSDataset'
data_root = 'data/VIPSeg'

# Normalisation statistics unpacked into the normalize steps below.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=False,
)

# Random-crop size used by the training pipeline.
crop_size = (736, 736)

# NOTE(review): padding runs before normalisation in this pipeline, and the
# normalize step is 'SeqNormalize' although the neighbouring steps are the
# *WithDepth variants -- confirm both are intended.
train_pipeline = [
    dict(type='LoadMultiImagesDirect'),
    dict(
        type='LoadMultiAnnotationsDirect',
        with_depth=False,
        vipseg=True,
    ),
    dict(
        type='SeqResizeWithDepth',
        img_scale=(720, 100000),
        ratio_range=[1., 2.],
        keep_ratio=True,
    ),
    dict(type='SeqFlipWithDepth', flip_ratio=0.5),
    dict(
        type='SeqRandomCropWithDepth',
        crop_size=crop_size,
        share_params=True,
    ),
    dict(type='SeqPadWithDepth', size_divisor=32),
    dict(type='SeqNormalize', **img_norm_cfg),
    dict(
        type='VideoCollect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    ),
    dict(type='ConcatVideoReferences'),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
]

# Inference pipeline: scale factor 1.0, flip disabled; 'img_id' and 'seq_id'
# are collected alongside the image.
test_pipeline = [
    dict(type='LoadImgDirect'),
    dict(
        type='MultiScaleFlipAug',
        scale_factor=[1.0],
        flip=False,
        transforms=[
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(
                type='Collect',
                keys=['img', 'img_id', 'seq_id'],
                meta_keys=[
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                    'filename',
                ],
            ),
        ],
    ),
]

# `val` and `test` are configured identically (split='val', test_mode=True).
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=1,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            test_mode=False,
            split='train',
            ref_seq_index=[-2, -1, 1, 2],
            is_instance_only=True,
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        split='val',
        ref_seq_index=None,
        test_mode=True,
        pipeline=test_pipeline,
    ),
)

# No evaluation options are set in this base config.
evaluation = dict()


================================================
FILE: configs/det/_base_/default_runtime.py
================================================
# Checkpointing: interval of 1.
checkpoint_config = {'interval': 1}

# Logging: interval of 50, with the text logger as the only hook.
log_config = {
    'interval': 50,
    'hooks': [{'type': 'TextLoggerHook'}],
}

# Distributed backend and log verbosity.
dist_params = {'backend': 'nccl'}
log_level = 'INFO'

# No checkpoint is loaded or resumed by default.
load_from = None
resume_from = None

workflow = [('train', 1)]


================================================
FILE: configs/det/_base_/models/knet_citystep_s3_r50_fpn.py
================================================
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

model = dict(
    type='KNet',
    cityscapes=False,
    kitti_step=True,
    num_thing_classes=2,
    num_stuff_classes=17,
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4
    ),
    rpn_head=dict(
        type='ConvKernelHead',
        num_classes=19,
        num_thing_classes=2,
        num_stuff_classes=17,
        cat_stuff_mask=True,
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128, normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        feat_transform_cfg=None,
        loss_rank=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=0.1),
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    roi_head=dict(
        type='KernelIterHead',
        num_thing_classes=2,
        num_stuff_classes=17,
        do_panoptic=True,
        num_stages=num_stages,
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=19,
                num_thing_classes=2,
                num_stuff_classes=17,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'),
                    act_cfg=None
                ),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_rank=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=0.1
                ),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0
                ),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0
                ),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0))
            for _ in range(num_stages)
        ]
    ),
    # training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1),
        rcnn=[
            dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(type='MaskCost', weight=1.0,
                                   pred_act=True)),
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1)

            for _ in range(num_stages)
        ]),
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            stuff_score_thr=0.05,
            merge_stuff_thing=dict(
                iou_thr=0.5,
                stuff_max_area=4096,
                instance_score_thr=0.25
            )
        )
    )
)

custom_imports = dict(
    imports=[
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        'external.cityscapes_step',
        'external.dataset.pipelines.transforms',
        'external.dataset.pipelines.loading',
    ],
    allow_failed_imports=False
)


================================================
FILE: configs/det/_base_/models/knet_kitti_step_s3_r50_fpn.py
================================================
# Shared constants: reused below for the per-stage lists, the proposal count
# and the kernel size of both heads.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

# KNet model config with the `kitti_step` flag enabled
# (2 thing classes + 17 stuff classes, 19 classes in the heads).
model = {
    'type': 'KNet',
    'cityscapes': False,
    'kitti_step': True,
    'num_thing_classes': 2,
    'num_stuff_classes': 17,
    # ---- backbone ----
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    # ---- neck ----
    'neck': {
        'type': 'FPN',
        'in_channels': [256, 512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_input',
        'num_outs': 4,
    },
    # ---- kernel head (configured under the `rpn_head` key) ----
    'rpn_head': {
        'type': 'ConvKernelHead',
        'num_classes': 19,
        'num_thing_classes': 2,
        'num_stuff_classes': 17,
        'cat_stuff_mask': True,
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'feat_transform_cfg': None,
        'loss_rank': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 0.1,
        },
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    # ---- iterative kernel update head (configured under `roi_head`) ----
    'roi_head': {
        'type': 'KernelIterHead',
        'num_thing_classes': 2,
        'num_stuff_classes': 17,
        'do_panoptic': True,
        'num_stages': num_stages,
        'stage_loss_weights': [1] * num_stages,
        'proposal_feature_channel': 256,
        # One identically configured KernelUpdateHead entry per stage.
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': 19,
                'num_thing_classes': 2,
                'num_stuff_classes': 17,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_rank': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': False,
                    'loss_weight': 0.1,
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # ---- training settings ----
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        # Same assigner/sampler settings repeated for every stage.
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _ in range(num_stages)
        ],
    },
    # ---- testing settings ----
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.25,
            },
        },
    },
}

# Module paths handed to the `custom_imports` mechanism; import failures are
# not tolerated (`allow_failed_imports` is False).
custom_imports = {
    'imports': [
        # knet.* (image-level) modules
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        # swin.* modules
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        # external.* modules
        'external.cityscapes_step',
        'external.kitti_step_dvps',
        'external.dataset.dvps_pipelines.transforms',
        'external.dataset.dvps_pipelines.loading',
        'external.dataset.dvps_pipelines.tricks',
        'external.dataset.pipelines.formatting',
        # knet.video.* modules (entries behind `#` are disabled)
        # 'knet.video.knet_track',
        # 'knet.video.knet_track_head',
        'knet.video.track_heads',
        'knet.video.kernel_head',
        'knet.video.kernel_iter_head',
        'knet.video.kernel_update_head',
        'knet.video.knet_uni_track',
        'knet.video.knet_quansi_dense',
        # 'knet.video.knet_quansi_dense_roi',
        'knet.video.knet_quansi_dense_roi_gt_box',
        'knet.video.knet_quansi_dense_embed_fc',
        'knet.video.knet_quansi_dense_embed_fc_joint_train',
        # 'knet.video.knet_quansi_dense_embed_fc_with_appearance',
        'knet.video.knet_quansi_dense_roi_gt_box_joint_train',
        # 'knet.video.knet_quansi_dense_embed_fc_toy_exp',
        'knet.video.qdtrack.losses.l2_loss',
        'knet.video.qdtrack.losses.multipos_cross_entropy_loss',
        'knet.video.qdtrack.trackers.quasi_dense_embed_tracker',
    ],
    'allow_failed_imports': False,
}


================================================
FILE: configs/det/_base_/models/knet_s3_r50_deformable_fpn.py
================================================
# Shared constants: reused below for the per-stage lists, the proposal count
# and the kernel size of both heads.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

# KNet model config using an MSDeformAttnPixelDecoder neck and 80 classes.
model = {
    'type': 'KNet',
    # ---- backbone ----
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    # ---- neck ----
    'neck': {
        'type': 'MSDeformAttnPixelDecoder',
        'num_outs': 3,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
        'act_cfg': {'type': 'ReLU'},
        'return_one_list': True,
        'encoder': {
            'type': 'DetrTransformerEncoder',
            'num_layers': 6,
            'transformerlayers': {
                'type': 'BaseTransformerLayer',
                'attn_cfgs': {
                    'type': 'MultiScaleDeformableAttention',
                    'embed_dims': 256,
                    'num_heads': 8,
                    'num_levels': 3,
                    'num_points': 4,
                    'im2col_step': 64,
                    'dropout': 0.0,
                    'batch_first': False,
                    'norm_cfg': None,
                    'init_cfg': None,
                },
                'ffn_cfgs': {
                    'type': 'FFN',
                    'embed_dims': 256,
                    'feedforward_channels': 1024,
                    'num_fcs': 2,
                    'ffn_drop': 0.0,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                },
                'operation_order': ('self_attn', 'norm', 'ffn', 'norm'),
            },
            'init_cfg': None,
        },
        'positional_encoding': {
            'type': 'SinePositionalEncoding',
            'num_feats': 128,
            'normalize': True,
        },
        'init_cfg': None,
    },
    # ---- kernel head (configured under the `rpn_head` key) ----
    'rpn_head': {
        'type': 'ConvKernelHead',
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'num_classes': 80,
        'feat_transform_cfg': None,
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    # ---- iterative kernel update head (configured under `roi_head`) ----
    'roi_head': {
        'type': 'KernelIterHead',
        'num_stages': num_stages,
        'stage_loss_weights': [1] * num_stages,
        'proposal_feature_channel': 256,
        # One identically configured KernelUpdateHead entry per stage.
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': 80,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # ---- training settings ----
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        # Same assigner/sampler settings repeated for every stage.
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _ in range(num_stages)
        ],
    },
    # ---- testing settings ----
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'merge_stuff_thing': {
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Module paths handed to the `custom_imports` mechanism; import failures are
# not tolerated (`allow_failed_imports` is False).
custom_imports = {
    'imports': [
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.kernel_updator',
        'knet.det.msdeformattn_decoder',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'external.coco_panoptic',
        'swin.swin_transformer',
    ],
    'allow_failed_imports': False,
}


================================================
FILE: configs/det/_base_/models/knet_s3_r50_fpn.py
================================================
# Shared constants: reused below for the per-stage lists, the proposal count
# and the kernel size of both heads.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

# KNet model config using an FPN neck and 80 classes.
model = {
    'type': 'KNet',
    # ---- backbone ----
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    # ---- neck ----
    'neck': {
        'type': 'FPN',
        'in_channels': [256, 512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_input',
        'num_outs': 4,
    },
    # ---- kernel head (configured under the `rpn_head` key) ----
    'rpn_head': {
        'type': 'ConvKernelHead',
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'num_classes': 80,
        'feat_transform_cfg': None,
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    # ---- iterative kernel update head (configured under `roi_head`) ----
    'roi_head': {
        'type': 'KernelIterHead',
        'num_stages': num_stages,
        'stage_loss_weights': [1] * num_stages,
        'proposal_feature_channel': 256,
        # One identically configured KernelUpdateHead entry per stage.
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': 80,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # ---- training settings ----
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        # Same assigner/sampler settings repeated for every stage.
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _ in range(num_stages)
        ],
    },
    # ---- testing settings ----
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'merge_stuff_thing': {
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Module paths handed to the `custom_imports` mechanism; import failures are
# not tolerated (`allow_failed_imports` is False).
custom_imports = {
    'imports': [
        'knet.det.knet',
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.kernel_updator',
        'knet.det.msdeformattn_decoder',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'panoptic_fpn.coco_panoptic',
    ],
    'allow_failed_imports': False,
}


================================================
FILE: configs/det/_base_/models/knet_s3_r50_fpn_panoptic.py
================================================
# Shared constants: reused below for the per-stage lists, the proposal count
# and the kernel size of both heads.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

# KNet model config with panoptic output enabled (`do_panoptic`) and
# 133 classes in both heads.
model = {
    'type': 'KNet',
    # ---- backbone ----
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    # ---- neck ----
    'neck': {
        'type': 'FPN',
        'in_channels': [256, 512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_input',
        'num_outs': 4,
    },
    # ---- kernel head (configured under the `rpn_head` key) ----
    'rpn_head': {
        'type': 'ConvKernelHead',
        'num_classes': 133,  # modified for panoptic
        'cat_stuff_mask': True,  # modified for panoptic
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'feat_transform_cfg': None,
        'loss_rank': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 0.1,
        },
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    # ---- iterative kernel update head (configured under `roi_head`) ----
    'roi_head': {
        'type': 'KernelIterHead',
        'do_panoptic': True,
        'num_stages': num_stages,
        'stage_loss_weights': [1] * num_stages,
        'proposal_feature_channel': 256,
        # One identically configured KernelUpdateHead entry per stage.
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': 133,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_rank': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': False,
                    'loss_weight': 0.1,
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # ---- training settings ----
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        # Same assigner/sampler settings repeated for every stage.
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _ in range(num_stages)
        ],
    },
    # ---- testing settings ----
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Module paths handed to the `custom_imports` mechanism; import failures are
# not tolerated (`allow_failed_imports=False`).
# Every module is listed exactly once, in first-occurrence order.
custom_imports = dict(
    imports=[
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'swin.swin_transformer',
        'external.mot_step',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        'external.coco_panoptic',
        'external.mapillary_panoptic',
        'external.cityscape_panoptic',
        'external.kitti_step_dvps',
        'external.dataset.dvps_pipelines.transforms',
        'external.dataset.dvps_pipelines.loading',
        'external.dataset.dvps_pipelines.tricks',
        'external.dataset.pipelines.formatting',
    ],
    allow_failed_imports=False)


================================================
FILE: configs/det/_base_/models/knet_vipseg_s3_r50_fpn.py
================================================
# Shared constants: reused below for the per-stage lists, the proposal count
# and the kernel size of both heads.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

# Class counts; the heads use the combined total.
num_thing_classes = 58
num_stuff_classes = 66
num_classes = num_stuff_classes + num_thing_classes

# KNet model config with panoptic output enabled (`do_panoptic`); the
# `kitti_step` flag is set to True here as well.
model = {
    'type': 'KNet',
    'cityscapes': False,
    'kitti_step': True,
    'num_thing_classes': num_thing_classes,
    'num_stuff_classes': num_stuff_classes,
    # ---- backbone ----
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    # ---- neck ----
    'neck': {
        'type': 'FPN',
        'in_channels': [256, 512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_input',
        'num_outs': 4,
    },
    # ---- kernel head (configured under the `rpn_head` key) ----
    'rpn_head': {
        'type': 'ConvKernelHead',
        'num_classes': num_classes,
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        'cat_stuff_mask': True,
        'conv_kernel_size': conv_kernel_size,
        'feat_downsample_stride': 2,
        'feat_refine_stride': 1,
        'feat_refine': False,
        'use_binary': True,
        'num_loc_convs': 1,
        'num_seg_convs': 1,
        'conv_normal_init': True,
        'localization_fpn': {
            'type': 'SemanticFPNWrapper',
            'in_channels': 256,
            'feat_channels': 256,
            'out_channels': 256,
            'start_level': 0,
            'end_level': 3,
            'upsample_times': 2,
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'cat_coors': False,
            'cat_coors_level': 3,
            'fuse_by_cat': False,
            'return_list': False,
            'num_aux_convs': 1,
            'norm_cfg': {
                'type': 'GN',
                'num_groups': 32,
                'requires_grad': True,
            },
        },
        'num_proposals': num_proposals,
        'proposal_feats_with_obj': True,
        'xavier_init_kernel': False,
        'kernel_init_std': 1,
        'num_cls_fcs': 1,
        'in_channels': 256,
        'feat_transform_cfg': None,
        'loss_rank': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 0.1,
        },
        'loss_seg': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 1.0,
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'loss_weight': 1.0,
        },
        'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
    },
    # ---- iterative kernel update head (configured under `roi_head`) ----
    'roi_head': {
        'type': 'KernelIterHead',
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        'do_panoptic': True,
        'num_stages': num_stages,
        'stage_loss_weights': [1] * num_stages,
        'proposal_feature_channel': 256,
        # One identically configured KernelUpdateHead entry per stage.
        'mask_head': [
            {
                'type': 'KernelUpdateHead',
                'num_classes': num_classes,
                'num_thing_classes': num_thing_classes,
                'num_stuff_classes': num_stuff_classes,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 2,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_rank': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': False,
                    'loss_weight': 0.1,
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # ---- training settings ----
    'train_cfg': {
        'rpn': {
            'assigner': {
                'type': 'MaskHungarianAssigner',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'dice_cost': {
                    'type': 'DiceCost',
                    'weight': 4.0,
                    'pred_act': True,
                },
                'mask_cost': {
                    'type': 'MaskCost',
                    'weight': 1.0,
                    'pred_act': True,
                },
            },
            'sampler': {'type': 'MaskPseudoSampler'},
            'pos_weight': 1,
        },
        # Same assigner/sampler settings repeated for every stage.
        'rcnn': [
            {
                'assigner': {
                    'type': 'MaskHungarianAssigner',
                    'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                    'dice_cost': {
                        'type': 'DiceCost',
                        'weight': 4.0,
                        'pred_act': True,
                    },
                    'mask_cost': {
                        'type': 'MaskCost',
                        'weight': 1.0,
                        'pred_act': True,
                    },
                },
                'sampler': {'type': 'MaskPseudoSampler'},
                'pos_weight': 1,
            }
            for _ in range(num_stages)
        ],
    },
    # ---- testing settings ----
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.25,
            },
        },
    },
}

# Module paths handed to the `custom_imports` mechanism; import failures are
# not tolerated (`allow_failed_imports=False`).
# Every module is listed exactly once, in first-occurrence order.
custom_imports = dict(
    imports=[
        'knet.det.knet',
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        'knet.kernel_updator',
        'knet.cross_entropy_loss',
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        'external.cityscapes_step',
        'external.kitti_step_dvps',
        'external.vipseg_dvps',
        'external.dataset.dvps_pipelines.transforms',
        'external.dataset.dvps_pipelines.loading',
        'external.dataset.dvps_pipelines.tricks',
        'external.dataset.pipelines.formatting',
        'external.dataset.pipelines.transforms',
        'knet.video.knet',
        'knet.video.knet_quansi_dense',
        'knet.video.knet_quansi_dense_roi_gt_box',
        # 'knet.video.knet_track',
        # 'knet.video.knet_track_head',
        'knet.video.track_heads',
        'knet.video.kernel_head',
        'knet.video.kernel_iter_head',
        'knet.video.kernel_update_head',
        'knet.video.knet_uni_track',
        'knet.video.knet_quansi_dense_embed_fc',
        'knet.video.knet_quansi_dense_embed_fc_joint_train',
        'knet.video.qdtrack.losses.l2_loss',
        'knet.video.qdtrack.losses.multipos_cross_entropy_loss',
        'knet.video.qdtrack.trackers.quasi_dense_embed_tracker',
    ],
    allow_failed_imports=False
)


================================================
FILE: configs/det/_base_/models/video_knet_s3_r50_fpn_panoptic.py
================================================
# Shared hyper-parameters referenced several times in the model definition.
# num_stages: passed as roi_head.num_stages and used as the number of entries
# generated for roi_head.mask_head and train_cfg.rcnn.
num_stages = 3
# num_proposals: used for rpn_head.num_proposals and test_cfg.rcnn.max_per_img.
num_proposals = 100
# conv_kernel_size: used by rpn_head and by every mask_head entry.
conv_kernel_size = 1
# Model of type 'VideoKNet': ResNet-50 backbone, FPN neck,
# 'VideoConvKernelHead' as rpn_head and 'VideoKernelIterHead' as roi_head,
# both configured with 133 classes.
model = dict(
    type='VideoKNet',
    # ResNet with depth 50, initialised from the torchvision resnet50 weights.
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    # FPN over the four backbone outputs, 256 channels, 4 output levels.
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4),
    # Kernel head producing `num_proposals` proposals.
    rpn_head=dict(
        type='VideoConvKernelHead',
        num_classes=133,  # modified for panoptic
        cat_stuff_mask=True,  # modified for panoptic
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128, normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        feat_transform_cfg=None,
        loss_rank=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=0.1),
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    # Iterative head: `num_stages` identically configured
    # 'VideoKernelUpdateHead' entries, each stage weighted 1 in the loss.
    roi_head=dict(
        type='VideoKernelIterHead',
        do_panoptic=True,
        num_stages=num_stages,
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        mask_head=[
            dict(
                type='VideoKernelUpdateHead',
                num_classes=133,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_rank=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=0.1),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0)) for _ in range(num_stages)
        ]),
    # training and testing settings
    # The assigner cost weights (cls 2.0, dice 4.0, mask 1.0) are the same for
    # rpn and for every rcnn stage, and equal the mask_head loss weights above.
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1),
        rcnn=[
            dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(type='MaskCost', weight=1.0,
                                   pred_act=True)),
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1) for _ in range(num_stages)
        ]),
    # Test-time thresholds and the settings for merging stuff and thing masks.
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            stuff_score_thr=0.05,
            merge_stuff_thing=dict(
                overlap_thr=0.6,
                iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3))))

# Modules imported through the `custom_imports` config mechanism when this
# config is loaded. Every module is listed exactly once, in the order of its
# first appearance; listing a module a second time adds nothing because it is
# already imported by then.
custom_imports = dict(
    imports=[
        # `knet` image-level (det) modules
        'knet.det.kernel_head',
        'knet.det.kernel_iter_head',
        'knet.det.kernel_update_head',
        'knet.det.semantic_fpn_wrapper',
        'knet.det.dice_loss',
        'knet.cross_entropy_loss',
        'knet.kernel_updator',
        'knet.det.mask_hungarian_assigner',
        'knet.det.mask_pseudo_sampler',
        # `external` dataset modules
        'external.coco_panoptic',
        'external.youtubevis_clips',
        'external.cityscapes_vps',
        'external.cityscape_panoptic',
        'external.cityscapes_dvps',
        # `swin` package modules
        'swin.swin_transformer',
        'swin.mix_transformer',
        'swin.DetectRS',
        'swin.swin_transformer_rfp',
        # `knet.video` modules
        # 'knet.video.knet_track',
        # 'knet.video.knet_track_head',
        'knet.video.track_heads',
        'knet.video.kernel_head',
        'knet.video.kernel_iter_head',
        'knet.video.kernel_update_head',
        'knet.video.knet_uni_track',
        'knet.video.knet_quansi_dense',
        'knet.video.knet_quansi_dense_conv_mask',
        'knet.video.knet_quansi_dense_roi_gt_box',
        'knet.video.knet_quansi_dense_embed_fc',
        # 'knet.video.knet_quansi_dense_embed_fc_joint_train',
        'knet.video.knet_quansi_dense_roi_gt_box_joint_train',
        'knet.video.qdtrack.losses.l2_loss',
        'knet.video.qdtrack.losses.multipos_cross_entropy_loss',
        'knet.video.qdtrack.trackers.quasi_dense_embed_tracker',

        'knet.video.knet_quansi_dense_embed_fc_toy_exp',
        'external.ext.ytvos',
        'external.ext.mask',

        # `external.dataset` pipeline modules
        'external.dataset.pipelines.transforms',
        'external.dataset.pipelines.loading',
        'external.dataset.pipelines.formatting',

        'external.dataset.dvps_pipelines.transforms',
        'external.dataset.dvps_pipelines.loading',
        'external.dataset.dvps_pipelines.tricks',
    ],
    # A module that cannot be imported raises instead of being skipped.
    allow_failed_imports=False)


================================================
FILE: configs/det/_base_/schedules/schedule_10e.py
================================================
# Optimizer: AdamW is used here, unlike the original 1x schedule, which uses
# SGD. The 'backbone' custom key carries a learning-rate multiplier of 0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(
        custom_keys=dict(backbone=dict(lr_mult=0.25)),
    ),
)
# Gradient clipping with max_norm=1 on the 2-norm.
optimizer_config = dict(
    grad_clip=dict(max_norm=1, norm_type=2),
)
# Learning policy: step policy with a 1000-iteration linear warmup.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1e-3,
    step=[8],
)
runner = dict(
    type='EpochBasedRunner',
    max_epochs=10,
)


================================================
FILE: configs/det/_base_/schedules/schedule_1x.py
================================================
# Optimizer: AdamW is used here, unlike the original 1x schedule, which uses
# SGD. The 'backbone' custom key carries a learning-rate multiplier of 0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(
        custom_keys=dict(backbone=dict(lr_mult=0.25)),
    ),
)
# Gradient clipping with max_norm=1 on the 2-norm.
optimizer_config = dict(
    grad_clip=dict(max_norm=1, norm_type=2),
)
# Learning policy: step policy with a 1000-iteration linear warmup.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1e-3,
    step=[8, 11],
)
runner = dict(
    type='EpochBasedRunner',
    max_epochs=12,
)


================================================
FILE: configs/det/coco/knet_s3_r50_deformable_fpn_ms-3x_coco.py
================================================
# Base configs this file builds on: the deformable-FPN K-Net model and the
# multi-scale 3x COCO instance dataset/schedule.
_base_ = [
    '../_base_/models/knet_s3_r50_deformable_fpn.py',
    '../common/mstrain_3x_coco_instance.py',
]

# Backbone settings given in this file: ResNet depth 50 with SyncBN as the
# normalisation layer.
model = dict(
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(
            type='SyncBN',
            requires_grad=True,
        ),
        norm_eval=True,
    ),
)

================================================
FILE: configs/det/coco/knet_s3_r50_fpn_ms-3x_coco-panoptic.py
================================================
# Base configs this file builds on: the panoptic K-Net model definition and
# the multi-scale 3x COCO panoptic dataset/schedule.
# The second entry names `mstrain_3x_coco_panoptic_inst_anno.py`; the
# `configs/det/common/` directory contains no `mstrain_3x_coco_panoptic.py`.
_base_ = [
    '../_base_/models/knet_s3_r50_fpn_panoptic.py',
    '../common/mstrain_3x_coco_panoptic_inst_anno.py'
]
# Shared hyper-parameters referenced several times in the model definition.
# num_stages: passed as roi_head.num_stages and used as the number of
# mask_head entries generated below.
num_stages = 3
# num_proposals: used for rpn_head.num_proposals and test_cfg.rcnn.max_per_img.
num_proposals = 100
# conv_kernel_size: used by rpn_head and by every mask_head entry.
conv_kernel_size = 1
# Model of type 'KNet': ResNet-50 backbone, FPN neck, 'ConvKernelHead' as
# rpn_head and 'KernelIterHead' as roi_head, both configured with 133 classes.
# No train_cfg is set in this file.
model = dict(
    type='KNet',
    # ResNet with depth 50, initialised from the torchvision resnet50 weights.
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    # FPN over the four backbone outputs, 256 channels, 4 output levels.
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4),
    # Kernel head producing `num_proposals` proposals.
    rpn_head=dict(
        type='ConvKernelHead',
        num_classes=133,  # modified for panoptic
        cat_stuff_mask=True,  # modified for panoptic
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128, normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        feat_transform_cfg=None,
        loss_rank=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=0.1),
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    # Iterative head: `num_stages` identically configured 'KernelUpdateHead'
    # entries, each stage weighted 1 in the loss.
    roi_head=dict(
        type='KernelIterHead',
        do_panoptic=True,
        merge_joint=True,
        num_stages=num_stages,
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=133,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_rank=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=0.1),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0)) for _ in range(num_stages)
        ]),
    # Test-time thresholds and the settings for merging stuff and thing masks.
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=num_proposals,
            mask_thr=0.5,
            stuff_score_thr=0.05,
            merge_stuff_thing=dict(
                overlap_thr=0.6,
                iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3)))
)

================================================
FILE: configs/det/coco/knet_s3_r50_fpn_ms-3x_coco.py
================================================
# This config adds nothing of its own: it only combines the base K-Net model
# with the multi-scale 3x COCO instance dataset/schedule.
_base_ = [
    '../_base_/models/knet_s3_r50_fpn.py',
    '../common/mstrain_3x_coco_instance.py',
]


================================================
FILE: configs/det/coco/knet_s3_swin-b_deformable_fpn_ms-3x_coco.py
================================================
# Base configs this file builds on: the deformable-FPN K-Net model and the
# multi-scale 3x COCO instance dataset/schedule.
_base_ = [
    '../_base_/models/knet_s3_r50_deformable_fpn.py',
    '../common/mstrain_3x_coco_instance.py',
]

model = dict(
    # NOTE(review): absolute, machine-specific checkpoint path; it has to be
    # adapted to wherever the Swin-B weights live locally.
    pretrained='/mnt/lustre/lixiangtai/pretrained/swin/swin_base_patch4_window7_224_22k.pth',
    # `_delete_=True` is set so the backbone dict is given in full here.
    backbone=dict(
        _delete_=True,
        type='SwinTransformerDIY',
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.3,
        use_abs_pos_embed=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False,
    ),
    # Neck input channels given alongside the backbone above.
    neck=dict(
        in_channels=[128, 256, 512, 1024],
    ),
)


================================================
FILE: configs/det/common/lsj_coco_panoptic_50e.py
================================================
_base_ = '../_base_/default_runtime.py'

# Dataset settings.
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)
image_size = (1024, 1024)

# Training pipeline. For comparison, the mstrain 3x config uses
# img_scale=[(1333, 640), (1333, 800)] with multiscale_mode='range'; here the
# Resize works on `image_size` with ratio_range=(0.1, 2.0) and is followed by
# a RandomCrop configured with the same `image_size`.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(
        type='Resize',
        img_scale=image_size,
        ratio_range=(0.1, 2.0),
        multiscale_mode='range',
        keep_ratio=True,
    ),
    dict(
        type='RandomCrop',
        crop_type='absolute_range',
        crop_size=image_size,
        recompute_bbox=True,
        allow_negative_crop=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
        ],
    ),
]

# Test pipeline: a single (1333, 800) scale with flipping turned off.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# The training set is wrapped in a RepeatDataset (times=1); val and test use
# the same val2017 annotations and test pipeline.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=1,
        dataset=dict(
            type=dataset_type,
            ann_file=dict(
                ins_ann=f'{data_root}annotations/panoptic_train2017_thing_only_coco.json',
                panoptic_ann=f'{data_root}annotations/panoptic_train2017.json',
            ),
            img_prefix=f'{data_root}train2017/',
            seg_prefix=f'{data_root}panoptic_stuff_train2017/',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        seg_prefix=f'{data_root}panoptic_val2017/',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        seg_prefix=f'{data_root}panoptic_val2017/',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
)

# Evaluation and checkpointing both use an interval of 5.
evaluation = dict(metric=['segm', 'panoptic'], interval=5)

checkpoint_config = dict(interval=5)

# Optimizer: AdamW is used here, unlike the original 1x schedule, which uses
# SGD. The 'backbone' custom key carries a learning-rate multiplier of 0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(
        custom_keys=dict(backbone=dict(lr_mult=0.25)),
    ),
)
# Gradient clipping with max_norm=1 on the 2-norm.
optimizer_config = dict(
    grad_clip=dict(max_norm=1, norm_type=2),
)

# Learning policy: step policy with a 1000-iteration linear warmup and
# step=[42, 48] for the 50-epoch runner below.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1e-3,
    step=[42, 48],
)
runner = dict(
    type='EpochBasedRunner',
    max_epochs=50,
)


================================================
FILE: configs/det/common/mstrain_3x_coco_instance.py
================================================
_base_ = '../_base_/default_runtime.py'

# Dataset settings.
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Training pipeline for the mstrain 3x config: the Resize step uses
# img_scale=[(1333, 640), (1333, 800)] with multiscale_mode='range'.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
    ),
    dict(
        type='Resize',
        img_scale=[(1333, 640), (1333, 800)],
        multiscale_mode='range',
        keep_ratio=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'],
    ),
]
# Test pipeline: a single (1333, 800) scale with flipping turned off.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# Use RepeatDataset (times=3) to speed up training; val and test share the
# val2017 annotations and the test pipeline.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=3,
        dataset=dict(
            type=dataset_type,
            ann_file=f'{data_root}annotations/instances_train2017.json',
            img_prefix=f'{data_root}train2017/',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=f'{data_root}annotations/instances_val2017.json',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=f'{data_root}annotations/instances_val2017.json',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
)
evaluation = dict(interval=1, metric=['segm'])

# Optimizer: AdamW is used here, unlike the original 1x schedule, which uses
# SGD. The 'backbone' custom key carries a learning-rate multiplier of 0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(
        custom_keys=dict(backbone=dict(lr_mult=0.25)),
    ),
)
# Gradient clipping with max_norm=1 on the 2-norm.
optimizer_config = dict(
    grad_clip=dict(max_norm=1, norm_type=2),
)

# Learning policy: step policy with a 1000-iteration linear warmup.
# Experiments show that step=[9, 11] gives higher performance.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1e-3,
    step=[9, 11],
)
runner = dict(
    type='EpochBasedRunner',
    max_epochs=12,
)


================================================
FILE: configs/det/common/mstrain_3x_coco_panoptic_inst_anno.py
================================================
_base_ = '../_base_/default_runtime.py'

# Dataset settings.
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Training pipeline for the mstrain 3x config: the Resize step uses
# img_scale=[(1333, 640), (1333, 800)] with multiscale_mode='range'.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(
        type='Resize',
        img_scale=[(1333, 640), (1333, 800)],
        multiscale_mode='range',
        keep_ratio=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
        ],
    ),
]

# Test pipeline: a single (1333, 800) scale with flipping turned off.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# Use RepeatDataset (times=3) to speed up training; val and test share the
# val2017 annotations and the test pipeline.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=3,
        dataset=dict(
            type=dataset_type,
            ann_file=dict(
                ins_ann=f'{data_root}annotations/panoptic_train2017_thing_only_coco.json',
                panoptic_ann=f'{data_root}annotations/panoptic_train2017.json',
            ),
            img_prefix=f'{data_root}train2017/',
            seg_prefix=f'{data_root}panoptic_stuff_train2017/',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        seg_prefix=f'{data_root}panoptic_val2017/',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        seg_prefix=f'{data_root}panoptic_val2017/',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
)
evaluation = dict(metric=['segm', 'panoptic'])

# Optimizer: AdamW is used here, unlike the original 1x schedule, which uses
# SGD. The 'backbone' custom key carries a learning-rate multiplier of 0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(
        custom_keys=dict(backbone=dict(lr_mult=0.25)),
    ),
)
# Gradient clipping with max_norm=1 on the 2-norm.
optimizer_config = dict(
    grad_clip=dict(max_norm=1, norm_type=2),
)

# Learning policy: step policy with a 1000-iteration linear warmup.
# Experiments show that step=[9, 11] gives higher performance.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1e-3,
    step=[9, 11],
)
runner = dict(
    type='EpochBasedRunner',
    max_epochs=12,
)


================================================
FILE: configs/det/common/mstrain_3x_coco_panoptic_inst_anno_detr_aug.py
================================================
_base_ = '../_base_/default_runtime.py'

# Dataset settings.
dataset_type = 'CocoPanopticDatasetCustom'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Training pipeline. Instead of the mstrain 3x Resize
# (img_scale=[(1333, 640), (1333, 800)], multiscale_mode='range'), an
# AutoAugment step with two policies is used:
#   1. one Resize to a scale taken from (480..800 in steps of 32, 1333);
#   2. Resize to one of (400|500|600, 1333), a relative RandomCrop of
#      (0.7, 0.7), then a Resize (override=True) to the same scale list as
#      policy 1.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
    ),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[(short, 1333) for short in range(480, 801, 32)],
                    multiscale_mode='value',
                    keep_ratio=True,
                ),
            ],
            [
                dict(
                    type='Resize',
                    img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True,
                ),
                dict(
                    type='RandomCrop',
                    crop_type='relative',
                    crop_size=(0.7, 0.7),
                    allow_negative_crop=True,
                ),
                dict(
                    type='Resize',
                    img_scale=[(short, 1333) for short in range(480, 801, 32)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True,
                ),
            ],
        ],
    ),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
        ],
    ),
]

# Test pipeline: a single (1333, 800) scale with flipping turned off.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# Use RepeatDataset (times=3) to speed up training; val and test share the
# val2017 annotations and the test pipeline.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',
        times=3,
        dataset=dict(
            type=dataset_type,
            ann_file=dict(
                ins_ann=f'{data_root}annotations/panoptic_train2017_thing_only_coco.json',
                panoptic_ann=f'{data_root}annotations/panoptic_train2017.json',
            ),
            img_prefix=f'{data_root}train2017/',
            seg_prefix=f'{data_root}panoptic_stuff_train2017/',
            pipeline=train_pipeline,
        ),
    ),
    val=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        seg_prefix=f'{data_root}panoptic_val2017/',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=dict(
            ins_ann=f'{data_root}annotations/instances_val2017.json',
            panoptic_ann=f'{data_root}annotations/panoptic_val2017.json',
        ),
        seg_prefix=f'{data_root}panoptic_val2017/',
        img_prefix=f'{data_root}val2017/',
        pipeline=test_pipeline,
    ),
)
evaluation = dict(metric=['segm', 'panoptic'])

# Optimizer: AdamW is used here, unlike the original 1x schedule, which uses
# SGD. The 'backbone' custom key carries a learning-rate multiplier of 0.25.
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    weight_decay=0.05,
    paramwise_cfg=dict(
        custom_keys=dict(backbone=dict(lr_mult=0.25)),
    ),
)
# Gradient clipping with max_norm=1 on the 2-norm.
optimizer_config = dict(
    grad_clip=dict(max_norm=1, norm_type=2),
)

# Learning policy: step policy with a 1000-iteration linear warmup.
# Experiments show that step=[9, 11] gives higher performance.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=1e-3,
    step=[9, 11],
)
runner = dict(
    type='EpochBasedRunner',
    max_epochs=12,
)


================================================
FILE: configs/det/common/mstrain_64e_city_panoptic.py
================================================
_base_ = '../_base_/default_runtime.py'

# Dataset settings for Cityscapes panoptic segmentation.
dataset_type = 'CityscapesPanopticDataset'
data_root = 'data/cityscapes/'

# Normalisation statistics shared by the train and test pipelines.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training pipeline: range-mode multi-scale resize, random crop and flip.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'LoadAnnotations',
        'with_bbox': True,
        'with_mask': True,
        'with_seg': True,
    },
    {
        'type': 'Resize',
        'img_scale': [(512, 1024), (2048, 4096)],
        'multiscale_mode': 'range',
        'keep_ratio': True,
    },
    {'type': 'RandomCrop', 'crop_size': (1024, 2048)},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': [
            'img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg',
        ],
    },
]

# Test pipeline: a single scale with flipping disabled.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (2048, 1024),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# The training split is wrapped in RepeatDataset (times=8); val and test
# both point at the validation annotations and images.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 8,
        'dataset': {
            'type': dataset_type,
            'ann_file': {
                'ins_ann': data_root
                + 'annotations/instancesonly_filtered_gtFine_train.json',
                'panoptic_ann': data_root
                + 'annotations/cityscapes_panoptic_train.json',
            },
            'img_prefix': data_root + 'leftImg8bit/train/',
            'seg_prefix': data_root + 'gtFine/train',
            'pipeline': train_pipeline,
        },
    },
    'val': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': data_root
            + 'annotations/instancesonly_filtered_gtFine_val.json',
            'panoptic_ann': data_root
            + 'annotations/cityscapes_panoptic_val.json',
        },
        'img_prefix': data_root + 'leftImg8bit/val/',
        'seg_prefix': data_root + 'gtFine/cityscapes_panoptic_val',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': {
            'ins_ann': data_root
            + 'annotations/instancesonly_filtered_gtFine_val.json',
            'panoptic_ann': data_root
            + 'annotations/cityscapes_panoptic_val.json',
        },
        'img_prefix': data_root + 'leftImg8bit/val/',
        'seg_prefix': data_root + 'gtFine/cityscapes_panoptic_val',
        'pipeline': test_pipeline,
    },
}

# Evaluation: panoptic metric only.
evaluation = {'metric': ['panoptic']}

# Optimizer: AdamW is used here, which differs from the original 1x
# schedule that uses SGD. The 'backbone' custom key carries lr_mult=0.25.
optimizer = {
    'type': 'AdamW',
    'lr': 0.0001,
    'weight_decay': 0.05,
    'paramwise_cfg': {
        'custom_keys': {
            'backbone': {'lr_mult': 0.25},
        },
    },
}
# Gradient clipping with max_norm=1 and norm_type=2.
optimizer_config = {
    'grad_clip': {'max_norm': 1, 'norm_type': 2},
}

# Learning-rate policy: linear warm-up for 500 iterations, then one step.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.001,
    # step=[7] yields higher performance than step=[6]
    'step': [7],
}
# actual epoch = 8 * 8 = 64
runner = {
    'type': 'EpochBasedRunner',
    'max_epochs': 8,
}


================================================
FILE: configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_citystep_s3_r50_fpn.py',
    '../_base_/datasets/cityscapes_step.py',
]

# Used below as test_cfg.rcnn.max_per_img.
num_proposals = 100

# No checkpoint is loaded by default. Disabled alternative:
# load_from = "/mnt/lustre/lixiangtai/pretrained/video_knet_vis/knet_r50_city.pth"
load_from = None

work_dir = 'logger/blackhole'

runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

# K-Net with a ResNet-50 backbone initialised from torchvision weights.
model = {
    'type': 'KNet',
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'SyncBN', 'requires_grad': True},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    'roi_head': {
        'type': 'KernelIterHead',
        'merge_joint': True,
    },
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Step schedule with a 1000-iteration linear warm-up and one step at 7.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}

# Per-GPU loader settings.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
}


================================================
FILE: configs/det/knet_cityscapes_step/knet_s3_swin_b_fpn.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_citystep_s3_r50_fpn.py',
    '../_base_/datasets/cityscapes_step.py',
]

# Used below as test_cfg.rcnn.max_per_img.
num_proposals = 100

# No checkpoint is loaded by default. Disabled alternative:
# load_from = "/mnt/lustre/lixiangtai/pretrained/video_knet_vis/knet_swin_b_city.pth"
load_from = None

work_dir = 'logger/blackhole'

runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

# K-Net with a Swin backbone (embed_dims=128); `_delete_` is set on the
# backbone entry and the neck input channels are overridden to match.
model = {
    'type': 'KNet',
    'backbone': {
        '_delete_': True,
        'type': 'SwinTransformerDIY',
        'embed_dims': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 7,
        'mlp_ratio': 4.0,
        'qkv_bias': True,
        'qk_scale': None,
        'drop_rate': 0.0,
        'attn_drop_rate': 0.0,
        'drop_path_rate': 0.3,
        'use_abs_pos_embed': False,
        'patch_norm': True,
        'out_indices': (0, 1, 2, 3),
        'with_cp': False,
    },
    'neck': {'in_channels': [128, 256, 512, 1024]},
    'roi_head': {
        'type': 'KernelIterHead',
        'merge_joint': True,
    },
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Step schedule with a 1000-iteration linear warm-up and one step at 7.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}

# Per-GPU loader settings.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
}


================================================
FILE: configs/det/knet_cityscapes_step/knet_s3_swin_l_fpn.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_citystep_s3_r50_fpn.py',
    '../_base_/datasets/cityscapes_step.py',
]

# Used below as test_cfg.rcnn.max_per_img.
num_proposals = 100

# No checkpoint is loaded by default. Disabled alternative:
# load_from = "/mnt/lustre/lixiangtai/pretrained/video_knet_vis/knet_swin_l_city.pth"
load_from = None

work_dir = 'logger/blackhole'

runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

# K-Net with a Swin backbone (embed_dims=192); `_delete_` is set on the
# backbone entry and the neck input channels are overridden to match.
model = {
    'type': 'KNet',
    'backbone': {
        '_delete_': True,
        'type': 'SwinTransformerDIY',
        'embed_dims': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 7,
        'mlp_ratio': 4.0,
        'qkv_bias': True,
        'qk_scale': None,
        'drop_rate': 0.0,
        'attn_drop_rate': 0.0,
        'drop_path_rate': 0.2,
        'use_abs_pos_embed': False,
        'patch_norm': True,
        'out_indices': (0, 1, 2, 3),
        'with_cp': False,
    },
    'neck': {'in_channels': [192, 384, 768, 1536]},
    'roi_head': {
        'type': 'KernelIterHead',
        'merge_joint': True,
    },
    'test_cfg': {
        'rpn': None,
        'rcnn': {
            'max_per_img': num_proposals,
            'mask_thr': 0.5,
            'stuff_score_thr': 0.05,
            'merge_stuff_thing': {
                'overlap_thr': 0.6,
                'iou_thr': 0.5,
                'stuff_max_area': 4096,
                'instance_score_thr': 0.3,
            },
        },
    },
}

# Step schedule with a 1000-iteration linear warm-up and one step at 7.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}

# Per-GPU loader settings.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
}


================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_kitti_step_s3_r50_fpn.py',
    '../_base_/datasets/kitti_step_vps.py',
]

load_from = None

# Shared hyper-parameters referenced by the model definition below.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 2
num_stuff_classes = 17
num_classes = num_thing_classes + num_stuff_classes

model = {
    'type': 'VideoKNetQuansiEmbedFCJointTrain',
    'cityscapes': False,
    'kitti_step': True,
    'link_previous': True,
    'mask_assign_stride': 2,
    'num_thing_classes': 2,
    'num_stuff_classes': 17,
    'ignore_label': 255,
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'SyncBN', 'requires_grad': True},
    },
    # `_delete_` is set on loss_seg, which is given as a softmax-style
    # (use_sigmoid=False) cross-entropy loss.
    'rpn_head': {
        'loss_seg': {
            '_delete_': True,
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
        'feat_downsample_stride': 4,
    },
    # Tracking (embedding) head.
    'track_head': {
        'type': 'QuasiDenseMaskEmbedHeadGTMask',
        'num_convs': 0,
        'num_fcs': 2,
        'roi_feat_size': 1,
        'in_channels': 256,
        'fc_out_channels': 256,
        'embed_channels': 256,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
        'loss_track': {
            'type': 'MultiPosCrossEntropyLoss',
            'loss_weight': 0.25,
        },
        'loss_track_aux': {
            'type': 'L2Loss',
            'neg_pos_ub': 3,
            'pos_margin': 0,
            'neg_margin': 0.1,
            'hard_mining': True,
            'loss_weight': 1.0,
        },
    },
    # Tracker configuration.
    'tracker': {
        'type': 'QuasiDenseEmbedTracker',
        'init_score_thr': 0.35,
        'obj_score_thr': 0.3,
        'match_score_thr': 0.5,
        'memo_tracklet_frames': 5,
        'memo_backdrop_frames': 1,
        'memo_momentum': 0.8,
        'nms_conf_thr': 0.5,
        'nms_backdrop_iou_thr': 0.3,
        'nms_class_iou_thr': 0.7,
        'with_cats': True,
        'match_metric': 'bisoftmax',
    },
    # RoI head: one identically configured update head per stage.
    'roi_head': {
        'type': 'VideoKernelIterHead',
        'num_stages': num_stages,
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        'with_track': True,
        'merge_joint': True,
        'mask_head': [
            {
                'type': 'VideoKernelUpdateHead',
                'num_classes': 19,
                'previous': 'placeholder',
                'previous_type': 'ffn',
                'num_thing_classes': 2,
                'num_stuff_classes': 17,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 4,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # Assignment / sampling settings under the track_train_cfg key.
    'track_train_cfg': {
        'assigner': {
            'type': 'MaskHungarianAssigner',
            'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
            'dice_cost': {'type': 'DiceCost', 'weight': 4.0, 'pred_act': True},
            'mask_cost': {'type': 'MaskCost', 'weight': 1.0, 'pred_act': True},
        },
        'sampler': {'type': 'MaskPseudoSampler'},
    },
    'bbox_roi_extractor': None,
}

# Normalisation statistics shared by both pipelines (to_rgb is disabled).
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training pipeline operating on image sequences.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test pipeline: scale factor 1.0 only, flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                ],
            },
        ],
    },
]

runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

# Step schedule with a 1000-iteration linear warm-up.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}

# The training split is wrapped in RepeatDataset (times=2); the test entry
# uses the 'val' split.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

find_unused_parameters = True

================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train_8e.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_kitti_step_s3_r50_fpn.py',
    '../_base_/datasets/kitti_step_vps.py',
]

load_from = None

# Shared hyper-parameters referenced by the model definition below.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 2
num_stuff_classes = 17
num_classes = num_thing_classes + num_stuff_classes

model = {
    'type': 'VideoKNetQuansiEmbedFCJointTrain',
    'cityscapes': False,
    'kitti_step': True,
    'link_previous': True,
    'mask_assign_stride': 2,
    'num_thing_classes': 2,
    'num_stuff_classes': 17,
    'ignore_label': 255,
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'SyncBN', 'requires_grad': True},
    },
    # `_delete_` is set on loss_seg, which is given as a softmax-style
    # (use_sigmoid=False) cross-entropy loss.
    'rpn_head': {
        'loss_seg': {
            '_delete_': True,
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
        'feat_downsample_stride': 4,
    },
    # Tracking (embedding) head.
    'track_head': {
        'type': 'QuasiDenseMaskEmbedHeadGTMask',
        'num_convs': 0,
        'num_fcs': 2,
        'roi_feat_size': 1,
        'in_channels': 256,
        'fc_out_channels': 256,
        'embed_channels': 256,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
        'loss_track': {
            'type': 'MultiPosCrossEntropyLoss',
            'loss_weight': 0.25,
        },
        'loss_track_aux': {
            'type': 'L2Loss',
            'neg_pos_ub': 3,
            'pos_margin': 0,
            'neg_margin': 0.1,
            'hard_mining': True,
            'loss_weight': 1.0,
        },
    },
    # Tracker configuration.
    'tracker': {
        'type': 'QuasiDenseEmbedTracker',
        'init_score_thr': 0.35,
        'obj_score_thr': 0.3,
        'match_score_thr': 0.5,
        'memo_tracklet_frames': 5,
        'memo_backdrop_frames': 1,
        'memo_momentum': 0.8,
        'nms_conf_thr': 0.5,
        'nms_backdrop_iou_thr': 0.3,
        'nms_class_iou_thr': 0.7,
        'with_cats': True,
        'match_metric': 'bisoftmax',
    },
    # RoI head: one identically configured update head per stage.
    'roi_head': {
        'type': 'VideoKernelIterHead',
        'num_stages': num_stages,
        'num_thing_classes': num_thing_classes,
        'num_stuff_classes': num_stuff_classes,
        'with_track': True,
        'merge_joint': True,
        'mask_head': [
            {
                'type': 'VideoKernelUpdateHead',
                'num_classes': 19,
                'previous': 'placeholder',
                'previous_type': 'ffn',
                'num_thing_classes': 2,
                'num_stuff_classes': 17,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 4,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # Assignment / sampling settings under the track_train_cfg key.
    'track_train_cfg': {
        'assigner': {
            'type': 'MaskHungarianAssigner',
            'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
            'dice_cost': {'type': 'DiceCost', 'weight': 4.0, 'pred_act': True},
            'mask_cost': {'type': 'MaskCost', 'weight': 1.0, 'pred_act': True},
        },
        'sampler': {'type': 'MaskPseudoSampler'},
    },
    'bbox_roi_extractor': None,
}

# Normalisation statistics shared by both pipelines (to_rgb is disabled).
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training pipeline operating on image sequences.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test pipeline: scale factor 1.0 only, flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                ],
            },
        ],
    },
]

runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

# Step schedule with a 1000-iteration linear warm-up and one step at 7.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}

# The training split is wrapped in RepeatDataset (times=2); the test entry
# uses the 'val' split.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

find_unused_parameters = True

================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
================================================
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_kitti_step_s3_r50_fpn.py',
    '../_base_/datasets/kitti_step_vps.py',
]

load_from = None

# Shared hyper-parameters referenced by the model definition below.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 2
num_stuff_classes = 17
num_classes = num_thing_classes + num_stuff_classes

model = {
    'type': 'VideoKNetQuansiEmbedFCJointTrain',
    'cityscapes': False,
    'kitti_step': True,
    'link_previous': True,
    'mask_assign_stride': 2,
    'num_thing_classes': num_thing_classes,
    'num_stuff_classes': num_stuff_classes,
    'ignore_label': 255,
    # Swin backbone (embed_dims=128); `_delete_` is set on this entry.
    'backbone': {
        '_delete_': True,
        'type': 'SwinTransformerDIY',
        'embed_dims': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 7,
        'mlp_ratio': 4,
        'qkv_bias': True,
        'qk_scale': None,
        'drop_rate': 0.0,
        'attn_drop_rate': 0.0,
        'drop_path_rate': 0.3,
        'use_abs_pos_embed': False,
        'patch_norm': True,
        'out_indices': (0, 1, 2, 3),
        'with_cp': False,
    },
    'neck': {'in_channels': [128, 256, 512, 1024]},
    # `_delete_` is set on loss_seg, which is given as a softmax-style
    # (use_sigmoid=False) cross-entropy loss.
    'rpn_head': {
        'loss_seg': {
            '_delete_': True,
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
        'feat_downsample_stride': 4,
    },
    # Tracking (embedding) head.
    'track_head': {
        'type': 'QuasiDenseMaskEmbedHeadGTMask',
        'num_convs': 0,
        'num_fcs': 2,
        'roi_feat_size': 1,
        'in_channels': 256,
        'fc_out_channels': 256,
        'embed_channels': 256,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
        'loss_track': {
            'type': 'MultiPosCrossEntropyLoss',
            'loss_weight': 0.25,
        },
        'loss_track_aux': {
            'type': 'L2Loss',
            'neg_pos_ub': 3,
            'pos_margin': 0,
            'neg_margin': 0.1,
            'hard_mining': True,
            'loss_weight': 1.0,
        },
    },
    # Tracker configuration.
    'tracker': {
        'type': 'QuasiDenseEmbedTracker',
        'init_score_thr': 0.35,
        'obj_score_thr': 0.3,
        'match_score_thr': 0.5,
        'memo_tracklet_frames': 5,
        'memo_backdrop_frames': 1,
        'memo_momentum': 0.8,
        'nms_conf_thr': 0.5,
        'nms_backdrop_iou_thr': 0.3,
        'nms_class_iou_thr': 0.7,
        'with_cats': True,
        'match_metric': 'bisoftmax',
    },
    # RoI head: one identically configured update head per stage.
    'roi_head': {
        'type': 'VideoKernelIterHead',
        'num_stages': num_stages,
        'num_thing_classes': 2,
        'num_stuff_classes': 17,
        'with_track': True,
        'merge_joint': True,
        'mask_head': [
            {
                'type': 'VideoKernelUpdateHead',
                'num_classes': num_classes,
                'previous': 'placeholder',
                'previous_link': 'update_dynamic_cov',
                'previous_type': 'update',
                'num_thing_classes': num_thing_classes,
                'num_stuff_classes': num_stuff_classes,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 4,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # Assignment / sampling settings under the track_train_cfg key.
    'track_train_cfg': {
        'assigner': {
            'type': 'MaskHungarianAssigner',
            'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
            'dice_cost': {'type': 'DiceCost', 'weight': 4.0, 'pred_act': True},
            'mask_cost': {'type': 'MaskCost', 'weight': 1.0, 'pred_act': True},
        },
        'sampler': {'type': 'MaskPseudoSampler'},
    },
    'bbox_roi_extractor': None,
}



# Normalisation statistics shared by both pipelines (to_rgb is disabled).
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training pipeline operating on image sequences.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test pipeline: scale factor 1.0 only, flipping disabled.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                ],
            },
        ],
    },
]

runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

# Step schedule with a 1000-iteration linear warm-up.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}

# The training split is wrapped in RepeatDataset (times=2); the test entry
# uses the 'val' split.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

find_unused_parameters = True

================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
================================================
# Base configs inherited by this file. The paths are relative to this
# config's own directory (configs/det/video_knet_kitti_step/), and the shared
# `_base_` folder sits under configs/det/, i.e. one level up -- the same
# prefix used by the other configs in this folder.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_kitti_step_s3_r50_fpn.py',
    '../_base_/datasets/kitti_step_vps.py',
]

load_from = None

# Shared hyper-parameters referenced by the model definition below.
num_stages = 3
conv_kernel_size = 1
num_thing_classes = 2
num_stuff_classes = 17
num_classes = num_thing_classes + num_stuff_classes

model = {
    'type': 'VideoKNetQuansiEmbedFCJointTrain',
    'cityscapes': False,
    'kitti_step': True,
    'link_previous': True,
    'mask_assign_stride': 2,
    'num_thing_classes': 2,
    'num_stuff_classes': 17,
    'ignore_label': 255,
    # Swin backbone (embed_dims=192); `_delete_` is set on this entry.
    'backbone': {
        '_delete_': True,
        'type': 'SwinTransformerDIY',
        'embed_dims': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 7,
        'mlp_ratio': 4.0,
        'qkv_bias': True,
        'qk_scale': None,
        'drop_rate': 0.0,
        'attn_drop_rate': 0.0,
        'drop_path_rate': 0.2,
        'use_abs_pos_embed': False,
        'patch_norm': True,
        'out_indices': (0, 1, 2, 3),
        'with_cp': False,
    },
    'neck': {'in_channels': [192, 384, 768, 1536]},
    # `_delete_` is set on loss_seg, which is given as a softmax-style
    # (use_sigmoid=False) cross-entropy loss.
    'rpn_head': {
        'loss_seg': {
            '_delete_': True,
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 1.0,
        },
        'feat_downsample_stride': 4,
    },
    # Tracking (embedding) head.
    'track_head': {
        'type': 'QuasiDenseMaskEmbedHeadGTMask',
        'num_convs': 0,
        'num_fcs': 2,
        'roi_feat_size': 1,
        'in_channels': 256,
        'fc_out_channels': 256,
        'embed_channels': 256,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
        'loss_track': {
            'type': 'MultiPosCrossEntropyLoss',
            'loss_weight': 0.25,
        },
        'loss_track_aux': {
            'type': 'L2Loss',
            'neg_pos_ub': 3,
            'pos_margin': 0,
            'neg_margin': 0.1,
            'hard_mining': True,
            'loss_weight': 1.0,
        },
    },
    # Tracker configuration.
    'tracker': {
        'type': 'QuasiDenseEmbedTracker',
        'init_score_thr': 0.35,
        'obj_score_thr': 0.3,
        'match_score_thr': 0.5,
        'memo_tracklet_frames': 5,
        'memo_backdrop_frames': 1,
        'memo_momentum': 0.8,
        'nms_conf_thr': 0.5,
        'nms_backdrop_iou_thr': 0.3,
        'nms_class_iou_thr': 0.7,
        'with_cats': True,
        'match_metric': 'bisoftmax',
    },
    # RoI head: one identically configured update head per stage.
    'roi_head': {
        'type': 'VideoKernelIterHead',
        'num_stages': num_stages,
        'num_thing_classes': 2,
        'num_stuff_classes': 17,
        'with_track': True,
        'merge_joint': True,
        'mask_head': [
            {
                'type': 'VideoKernelUpdateHead',
                'num_classes': 19,
                'previous': 'placeholder',
                'previous_link': 'update_dynamic_cov',
                'previous_type': 'update',
                'num_thing_classes': 2,
                'num_stuff_classes': 17,
                'num_ffn_fcs': 2,
                'num_heads': 8,
                'num_cls_fcs': 1,
                'num_mask_fcs': 1,
                'feedforward_channels': 2048,
                'in_channels': 256,
                'out_channels': 256,
                'dropout': 0.0,
                'mask_thr': 0.5,
                'conv_kernel_size': conv_kernel_size,
                'mask_upsample_stride': 4,
                'ffn_act_cfg': {'type': 'ReLU', 'inplace': True},
                'with_ffn': True,
                'feat_transform_cfg': {
                    'conv_cfg': {'type': 'Conv2d'},
                    'act_cfg': None,
                },
                'kernel_updator_cfg': {
                    'type': 'KernelUpdator',
                    'in_channels': 256,
                    'feat_channels': 256,
                    'out_channels': 256,
                    'input_feat_shape': 3,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'norm_cfg': {'type': 'LN'},
                },
                'loss_mask': {
                    'type': 'CrossEntropyLoss',
                    'use_sigmoid': True,
                    'loss_weight': 1.0,
                },
                'loss_dice': {'type': 'DiceLoss', 'loss_weight': 4.0},
                'loss_cls': {
                    'type': 'FocalLoss',
                    'use_sigmoid': True,
                    'gamma': 2.0,
                    'alpha': 0.25,
                    'loss_weight': 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    # Assignment / sampling settings under the track_train_cfg key.
    'track_train_cfg': {
        'assigner': {
            'type': 'MaskHungarianAssigner',
            'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
            'dice_cost': {'type': 'DiceCost', 'weight': 4.0, 'pred_act': True},
            'mask_cost': {'type': 'MaskCost', 'weight': 1.0, 'pred_act': True},
        },
        'sampler': {'type': 'MaskPseudoSampler'},
    },
    'bbox_roi_extractor': None,
}

# Directory that receives this experiment's outputs.
work_dir = 'logger/ks_wodepth_4x8_step_stride2_nocrop_2_17'

# Normalisation statistics, unpacked into both pipelines below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training transforms, listed in the order they are declared to run.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test transforms: one scale factor (1.0) and flip switched off.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                ],
            },
        ],
    },
]

# Schedule: epoch-based runner with a stepped learning rate and linear warm-up.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}

# Dataset overrides: the training split is wrapped in a RepeatDataset (times=2);
# the test entry reads the 'val' split without reference frames.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

find_unused_parameters = True

================================================
FILE: configs/det/video_knet_kitti_step/video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_update_conv_short_track_fc.py
================================================
# Base config files this experiment builds on (schedule, runtime, model, dataset).
_base_ = [
    "../_base_/schedules/schedule_1x.py",
    "../_base_/default_runtime.py",
    "../_base_/models/knet_kitti_step_s3_r50_fpn.py",
    "../_base_/datasets/kitti_step_vps.py",
]

# No checkpoint is loaded by default. Commented-out example of a checkpoint path:
# load_from = "/mnt/lustre/lixiangtai/project/Knet/work_dirs/city_step/swin_l_joint_8e/latest.pth"
load_from = None

# Number of kernel-update stages and the kernel size used by the model below.
num_stages = 3
conv_kernel_size = 1

# Model settings for this experiment. This file also lists `_base_` configs, so
# the keys below are combined with the inherited model settings (mmcv config
# inheritance).
model = dict(
    type="VideoKNetQuansiEmbedFCJointTrain",
    cityscapes=False,
    kitti_step=True,
    link_previous=True,
    mask_assign_stride=2,
    # 2 thing + 17 stuff classes = the 19 classes given to the mask heads below.
    num_thing_classes=2,
    num_stuff_classes=17,
    ignore_label=255,
    backbone=dict(
        # `_delete_=True` is the mmcv config marker for replacing the inherited
        # dict wholesale instead of merging these keys into it.
        _delete_=True,
        type='SwinTransformerDIY',
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=7,
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        use_abs_pos_embed=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False),
    # embed_dims (192) doubled once per level: 192, 384, 768, 1536.
    # NOTE(review): presumably the backbone's per-level output widths -- confirm.
    neck=dict(in_channels=[192, 384, 768, 1536]),
    rpn_head=dict(
        # The inherited loss_seg is discarded and replaced by this softmax
        # (use_sigmoid=False) cross-entropy loss.
        loss_seg=dict(
                _delete_=True,
                type='CrossEntropyLoss',
                use_sigmoid=False,
                loss_weight=1.0),
        feat_downsample_stride=4,
    ),
    # add track roi head
    track_head=dict(
        type='QuasiDenseMaskEmbedHeadGTMask',
        num_convs=0,
        num_fcs=1,
        roi_feat_size=1,
        in_channels=256,
        fc_out_channels=256,
        embed_channels=256,
        norm_cfg=dict(type='GN', num_groups=32),
        loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
        loss_track_aux=dict(
            type='L2Loss',
            neg_pos_ub=3,
            pos_margin=0,
            neg_margin=0.1,
            hard_mining=True,
            loss_weight=1.0),
    ),
    # add tracker config
    tracker=dict(
        type='QuasiDenseEmbedTracker',
        init_score_thr=0.35,
        obj_score_thr=0.3,
        match_score_thr=0.5,
        memo_tracklet_frames=5,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=True,
        match_metric='bisoftmax'
    ),
    # roi head
    roi_head=dict(
        type='VideoKernelIterHead',
        num_stages=num_stages,
        num_thing_classes=2,
        num_stuff_classes=17,
        with_track=True,
        merge_joint=True,
        # One identically configured VideoKernelUpdateHead per stage
        # (the comprehension yields `num_stages` separate dicts).
        mask_head=[
            dict(
                type='VideoKernelUpdateHead',
                num_classes=19,
                previous='placeholder',
                previous_link="update_dynamic_cov",
                previous_type="ffn",
                num_thing_classes=2,
                num_stuff_classes=17,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=4,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0),
            ) for _ in range(num_stages)
        ]
    ),
    # Assigner cost weights (cls 2.0, dice 4.0, mask 1.0) equal the loss
    # weights of the corresponding mask-head losses above.
    track_train_cfg=dict(
        assigner=dict(
            type='MaskHungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
            mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
        sampler=dict(type='MaskPseudoSampler'),),
    bbox_roi_extractor=None
)


# Normalisation statistics, unpacked into both pipelines below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': False,
}

# Training transforms, listed in the order they are declared to run.
train_pipeline = [
    {'type': 'LoadMultiImagesDirect'},
    {
        'type': 'LoadMultiAnnotationsDirect',
        'with_depth': False,
        'divisor': -1,
        'cherry_pick': True,
        'cherry': [11, 13],
    },
    {
        'type': 'SeqResizeWithDepth',
        'img_scale': (384, 1248),
        'ratio_range': [0.5, 2.0],
        'keep_ratio': True,
    },
    {'type': 'SeqFlipWithDepth', 'flip_ratio': 0.5},
    {
        'type': 'SeqRandomCropWithDepth',
        'crop_size': (384, 1248),
        'share_params': True,
    },
    {'type': 'SeqNormalizeWithDepth', **img_norm_cfg},
    {'type': 'SeqPadWithDepth', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_semantic_seg',
            'gt_instance_ids',
        ],
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test transforms: one scale factor (1.0) and flip switched off.
test_pipeline = [
    {'type': 'LoadImgDirect'},
    {
        'type': 'MultiScaleFlipAug',
        'scale_factor': [1.0],
        'flip': False,
        'transforms': [
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {
                'type': 'Collect',
                'keys': ['img', 'img_id', 'seq_id'],
                'meta_keys': [
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'scale_factor',
                    'flip',
                    'flip_direction',
                    'img_norm_cfg',
                    'ori_filename',
                ],
            },
        ],
    },
]

# Schedule: epoch-based runner with a stepped learning rate and linear warm-up.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}

# Dataset overrides: the training split is wrapped in a RepeatDataset (times=2);
# the test entry reads the 'val' split without reference frames.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {
        'type': 'RepeatDataset',
        'times': 2,
        'dataset': {
            'split': 'train',
            'ref_seq_index': [-2, -1, 1, 2],
            'test_mode': False,
            'pipeline': train_pipeline,
        },
    },
    'test': {
        'ref_seq_index': None,
        'test_mode': True,
        'pipeline': test_pipeline,
        'split': 'val',
    },
}

find_unused_parameters = True

================================================
FILE: configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py
================================================
# Base config files this experiment builds on (schedule, runtime, model, dataset).
_base_ = [
    "../_base_/schedules/schedule_1x.py",
    "../_base_/default_runtime.py",
    "../_base_/models/knet_vipseg_s3_r50_fpn.py",
    "../_base_/datasets/vipseg_dvps.py",
]

# Stage count and kernel size referenced by the model definition below.
num_stages = 3
conv_kernel_size = 1

# Class bookkeeping: the total is the sum of thing and stuff classes.
num_thing_classes = 58
num_stuff_classes = 66
num_classes = num_thing_classes + num_stuff_classes

# Model settings for this experiment. This file also lists `_base_` configs, so
# the keys below are combined with the inherited model settings (mmcv config
# inheritance).
model = dict(
    type="VideoKNetQuansiEmbedFCJointTrain",
    # Use the Cityscapes-style label layout: thing classes first, stuff classes second.
    cityscapes=False,
    vipseg=True,
    kitti_step=False,
    link_previous=True,
    mask_assign_stride=2,
    num_thing_classes=num_thing_classes,
    num_stuff_classes=num_stuff_classes,
    ignore_label=255,
    # No `_delete_` key here, so these backbone keys are merged into the
    # inherited backbone dict rather than replacing it.
    backbone=dict(
            type='ResNet',
            depth=50,
            num_stages=4,
            out_indices=(0, 1, 2, 3),
            frozen_stages=1,
            norm_cfg=dict(type='BN', requires_grad=True),
            norm_eval=True
    ),
    rpn_head=dict(
        num_thing_classes=num_thing_classes,
        num_stuff_classes=num_stuff_classes,
            # `_delete_=True` discards the inherited loss_seg; it is replaced by
            # this softmax (use_sigmoid=False) cross-entropy loss.
            loss_seg=dict(
                    _delete_=True,
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
        feat_downsample_stride=4,
    ),
    # add track roi head
    track_head=dict(
        type='QuasiDenseMaskEmbedHeadGTMask',
        num_convs=0,
        num_fcs=2,
        roi_feat_size=1,
        in_channels=256,
        fc_out_channels=256,
        embed_channels=256,
        norm_cfg=dict(type='GN', num_groups=32),
        loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
        loss_track_aux=dict(
            type='L2Loss',
            neg_pos_ub=3,
            pos_margin=0,
            neg_margin=0.1,
            hard_mining=True,
            loss_weight=1.0),
    ),
    # add tracker config
    tracker=dict(
        type='QuasiDenseEmbedTracker',
        init_score_thr=0.35,
        obj_score_thr=0.3,
        match_score_thr=0.5,
        memo_tracklet_frames=5,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=True,
        match_metric='bisoftmax'
    ),
    # roi head
    roi_head=dict(
        type='VideoKernelIterHead',
        num_stages=num_stages,
        num_thing_classes=num_thing_classes,
        num_stuff_classes=num_stuff_classes,
        with_track=True,
        merge_joint=True,
        # One identically configured VideoKernelUpdateHead per stage
        # (the comprehension yields `num_stages` separate dicts).
        mask_head=[
            dict(
                type='VideoKernelUpdateHead',
                num_classes=num_classes,
                previous='placeholder',
                previous_type="ffn",
                num_thing_classes=num_thing_classes,
                num_stuff_classes=num_stuff_classes,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=4,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0),
            ) for _ in range(num_stages)
        ]
    ),
    # Assigner cost weights (cls 2.0, dice 4.0, mask 1.0) equal the loss
    # weights of the corresponding mask-head losses above.
    track_train_cfg=dict(
        assigner=dict(
            type='MaskHungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
            mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
        sampler=dict(type='MaskPseudoSampler'),),
    bbox_roi_extractor=None
)


# Schedule: epoch-based runner with a stepped learning rate and linear warm-up.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 12}

lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [9, 11],
}


find_unused_parameters = True

================================================
FILE: configs/det/video_knet_vipseg/video_knet_s3_swin_b_rpn_vipseg_mask_embed_link_ffn_joint_train_8e.py
================================================
# Base config files this experiment builds on (schedule, runtime, model, dataset).
_base_ = [
    "../_base_/schedules/schedule_1x.py",
    "../_base_/default_runtime.py",
    "../_base_/models/knet_vipseg_s3_r50_fpn.py",
    "../_base_/datasets/vipseg_dvps.py",
]


# Stage count and kernel size referenced by the model definition below.
num_stages = 3
conv_kernel_size = 1

# Class bookkeeping: the total is the sum of thing and stuff classes.
num_thing_classes = 58
num_stuff_classes = 66
num_classes = num_thing_classes + num_stuff_classes

# Model settings for this experiment. This file also lists `_base_` configs, so
# the keys below are combined with the inherited model settings (mmcv config
# inheritance).
model = dict(
    type="VideoKNetQuansiEmbedFCJointTrain",
    # Use the Cityscapes-style label layout: thing classes first, stuff classes second.
    cityscapes=False,
    vipseg=True,
    kitti_step=False,
    link_previous=True,
    mask_assign_stride=2,
    num_thing_classes=num_thing_classes,
    num_stuff_classes=num_stuff_classes,
    ignore_label=255,
    backbone=dict(
        # `_delete_=True` is the mmcv config marker for replacing the inherited
        # dict wholesale instead of merging these keys into it.
        _delete_=True,
        type='SwinTransformerDIY',
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=7,
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        use_abs_pos_embed=False,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False),
    # embed_dims (128) doubled once per level: 128, 256, 512, 1024.
    # NOTE(review): presumably the backbone's per-level output widths -- confirm.
    neck=dict(
        in_channels=[128, 256, 512, 1024],
    ),
    rpn_head=dict(
        num_thing_classes=num_thing_classes,
        num_stuff_classes=num_stuff_classes,
            # The inherited loss_seg is discarded and replaced by this softmax
            # (use_sigmoid=False) cross-entropy loss.
            loss_seg=dict(
                    _delete_=True,
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
        feat_downsample_stride=4,
    ),
    # add track roi head
    track_head=dict(
        type='QuasiDenseMaskEmbedHeadGTMask',
        num_convs=0,
        num_fcs=2,
        roi_feat_size=1,
        in_channels=256,
        fc_out_channels=256,
        embed_channels=256,
        norm_cfg=dict(type='GN', num_groups=32),
        loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
        loss_track_aux=dict(
            type='L2Loss',
            neg_pos_ub=3,
            pos_margin=0,
            neg_margin=0.1,
            hard_mining=True,
            loss_weight=1.0),
    ),
    # add tracker config
    tracker=dict(
        type='QuasiDenseEmbedTracker',
        init_score_thr=0.35,
        obj_score_thr=0.3,
        match_score_thr=0.5,
        memo_tracklet_frames=5,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=True,
        match_metric='bisoftmax'
    ),
    # roi head
    roi_head=dict(
        type='VideoKernelIterHead',
        num_stages=num_stages,
        num_thing_classes=num_thing_classes,
        num_stuff_classes=num_stuff_classes,
        with_track=True,
        merge_joint=True,
        # One identically configured VideoKernelUpdateHead per stage
        # (the comprehension yields `num_stages` separate dicts).
        mask_head=[
            dict(
                type='VideoKernelUpdateHead',
                num_classes=num_classes,
                previous='placeholder',
                previous_type="ffn",
                num_thing_classes=num_thing_classes,
                num_stuff_classes=num_stuff_classes,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=4,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0),
            ) for _ in range(num_stages)
        ]
    ),
    # Assigner cost weights (cls 2.0, dice 4.0, mask 1.0) equal the loss
    # weights of the corresponding mask-head losses above.
    track_train_cfg=dict(
        assigner=dict(
            type='MaskHungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
            mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)),
        sampler=dict(type='MaskPseudoSampler'),),
    bbox_roi_extractor=None
)


# Schedule: 8-epoch runner with a single learning-rate step and linear warm-up.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 8}

lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 1000,
    'warmup_ratio': 0.001,
    'step': [7],
}


find_unused_parameters = True

================================================
FILE: configs/video_knet_vis/_base_/datasets/coco_instance.py
================================================
# Dataset class name and root directory used by every split below.
dataset_type = 'CocoDataset'
data_root = 'data/coco/'

# Normalisation statistics, unpacked into both pipelines below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training transforms, listed in the order they are declared to run.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True, 'with_mask': True},
    {'type': 'Resize', 'img_scale': (1333, 800), 'keep_ratio': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': ['img', 'gt_bboxes', 'gt_labels', 'gt_masks'],
    },
]

# Test transforms: a single (1333, 800) scale with flip switched off.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (1333, 800),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# `val` and `test` point at the same val2017 annotations and images.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'ann_file': f'{data_root}annotations/instances_train2017.json',
        'img_prefix': f'{data_root}train2017/',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': f'{data_root}annotations/instances_val2017.json',
        'img_prefix': f'{data_root}val2017/',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': f'{data_root}annotations/instances_val2017.json',
        'img_prefix': f'{data_root}val2017/',
        'pipeline': test_pipeline,
    },
}

# we do not evaluate bbox because K-Net does not predict bounding boxes
evaluation = {'metric': ['segm']}


================================================
FILE: configs/video_knet_vis/_base_/datasets/youtubevis_2019.py
================================================
# Dataset settings.
# Normalisation statistics, unpacked into both pipelines below.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training transforms, listed in the order they are declared to run.
train_pipeline = [
    {'type': 'LoadMultiImagesFromFile', 'to_float32': True},
    {
        'type': 'SeqLoadAnnotations',
        'with_bbox': True,
        'with_mask': True,
        'with_track': True,
    },
    {
        'type': 'SeqResize',
        'multiscale_mode': 'value',
        'share_params': True,
        # Eight candidate scales: the first entry varies, the second is 1e6.
        'img_scale': [
            (edge, 1e6)
            for edge in (288, 320, 352, 392, 416, 448, 480, 512)
        ],
        'keep_ratio': True,
    },
    {'type': 'SeqRandomFlip', 'share_params': True, 'flip_ratio': 0.5},
    {'type': 'SeqNormalize', **img_norm_cfg},
    {'type': 'SeqPad', 'size_divisor': 32},
    {
        'type': 'VideoCollect',
        'keys': [
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_masks',
            'gt_instance_ids',
        ],
        'reject_empty': True,
        'num_ref_imgs': 5,
    },
    {'type': 'ConcatVideoReferences'},
    {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
]

# Test transforms: a single (640, 360) scale with flip switched off.
test_pipeline = [
    {'type': 'LoadMultiImagesFromFile', 'to_float32': True},
    {
        'type': 'MultiScaleFlipAugVideo',
        'img_scale': (640, 360),
        'flip': False,
        'transforms': [
            {'type': 'SeqResize'},
            {'type': 'SeqNormalize', **img_norm_cfg},
            {'type': 'SeqPad', 'size_divisor': 32},
            {
                'type': 'VideoCollect',
                'keys': ['img'],
                'reject_empty': False,
                'num_ref_imgs': 0,  # 0 means do not apply check
            },
            {'type': 'ConcatVideoReferences'},
            {'type': 'SeqDefaultFormatBundle', 'ref_prefix': 'ref'},
        ],
    },
]

# Dataset class name, root directory and version shared by all splits.
dataset_type = 'YouTubeVISDataset'
data_root = 'data/youtube_vis_2019/'
dataset_version = '2019'

# `val` and `test` both read the 'valid' annotations and images.
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'dataset_version': dataset_version,
        'ann_file': f'{data_root}annotations/youtube_vis_2019_train.json',
        'img_prefix': f'{data_root}train/JPEGImages',
        'ref_img_sampler': {
            'num_ref_imgs': 5,
            'frame_range': [-2, 2],
            'filter_key_img': False,
            'method': 'uniform',
        },
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'dataset_version': dataset_version,
        'ann_file': f'{data_root}annotations/youtube_vis_2019_valid.json',
        'img_prefix': f'{data_root}valid/JPEGImages',
        'ref_img_sampler': None,
        'load_all_frames': True,
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'dataset_version': dataset_version,
        'ann_file': f'{data_root}annotations/youtube_vis_2019_valid.json',
        'img_prefix': f'{data_root}valid/JPEGImages',
        'ref_img_sampler': None,
        'load_all_frames': True,
        'pipeline': test_pipeline,
    },
}


================================================
FILE: configs/video_knet_vis/_base_/default_runtime.py
================================================
# Checkpointing and logging intervals.
checkpoint_config = {'interval': 1}
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
    ],
}
# custom_hooks = [dict(type='NumClassCheckHook')]

# Distributed backend, log verbosity, checkpoint loading and workflow.
dist_params = {'backend': 'nccl'}
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

# Default output directory.
work_dir = 'logger/blackhole'


================================================
FILE: configs/video_knet_vis/_base_/models/knet_track_r50.py
================================================
# Shared constants referenced by the model definition below:
# number of update stages, number of proposals, and kernel size.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1
# 'KNetTrack' model definition: ResNet-50 backbone, FPN neck, a
# kernel-producing rpn_head, a multi-stage roi_head and a tracker head,
# followed by the train/test settings for each of them.
model = dict(
    type='KNetTrack',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4),
    rpn_head=dict(
        type='ConvKernelHeadVideo',
        conv_kernel_size=conv_kernel_size,
        feat_downsample_stride=2,
        feat_refine_stride=1,
        feat_refine=False,
        use_binary=True,
        num_loc_convs=1,
        num_seg_convs=1,
        conv_normal_init=True,
        localization_fpn=dict(
            type='SemanticFPNWrapper',
            in_channels=256,
            feat_channels=256,
            out_channels=256,
            start_level=0,
            end_level=3,
            upsample_times=2,
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128, normalize=True),
            cat_coors=False,
            cat_coors_level=3,
            fuse_by_cat=False,
            return_list=False,
            num_aux_convs=1,
            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)
        ),
        num_proposals=num_proposals,
        proposal_feats_with_obj=True,
        xavier_init_kernel=False,
        kernel_init_std=1,
        num_cls_fcs=1,
        in_channels=256,
        # NOTE(review): 40 presumably matches the category count of the
        # YouTube-VIS 2019 dataset this config family targets -- confirm.
        num_classes=40,
        feat_transform_cfg=None,
        loss_seg=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_mask=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_dice=dict(type='DiceLoss', loss_weight=4.0)),
    roi_head=dict(
        type='KernelIterHeadVideo',
        num_stages=num_stages,
        # Every stage's loss gets the same weight of 1.
        stage_loss_weights=[1] * num_stages,
        proposal_feature_channel=256,
        # All 40 classes are thing classes; there are no stuff classes.
        num_thing_classes=40,
        num_stuff_classes=0,
        # One identically configured KernelUpdateHead per stage
        # (the comprehension yields `num_stages` separate dicts).
        mask_head=[
            dict(
                type='KernelUpdateHead',
                num_classes=40,
                num_thing_classes=40,
                num_stuff_classes=0,
                num_ffn_fcs=2,
                num_heads=8,
                num_cls_fcs=1,
                num_mask_fcs=1,
                feedforward_channels=2048,
                in_channels=256,
                out_channels=256,
                dropout=0.0,
                mask_thr=0.5,
                conv_kernel_size=conv_kernel_size,
                mask_upsample_stride=2,
                ffn_act_cfg=dict(type='ReLU', inplace=True),
                with_ffn=True,
                feat_transform_cfg=dict(
                    conv_cfg=dict(type='Conv2d'), act_cfg=None),
                kernel_updator_cfg=dict(
                    type='KernelUpdator',
                    in_channels=256,
                    feat_channels=256,
                    out_channels=256,
                    input_feat_shape=3,
                    act_cfg=dict(type='ReLU', inplace=True),
                    norm_cfg=dict(type='LN')),
                loss_mask=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=True,
                    loss_weight=1.0),
                loss_dice=dict(
                    type='DiceLoss', loss_weight=4.0),
                loss_cls=dict(
                    type='FocalLoss',
                    use_sigmoid=True,
                    gamma=2.0,
                    alpha=0.25,
                    loss_weight=2.0)
            ) for _ in range(num_stages)
        ]),
    tracker=dict(
        type="KernelFrameIterHeadVideo",
        num_proposals=num_proposals,
        # NOTE(review): literal 3 (and a literal 3-tuple of loss weights below)
        # rather than the module-level `num_stages`; the two agree today.
        num_stages=3,
        assign_stages=2,
        proposal_feature_channel=256,
        stage_loss_weights=(1., 1., 1.),
        num_thing_classes=40,
        num_stuff_classes=0,
        # A single head config (a dict, not a per-stage list as in roi_head).
        mask_head=dict(
            type='KernelUpdateHeadVideo',
            num_proposals=num_proposals,
            num_classes=40,
            num_thing_classes=40,
            num_stuff_classes=0,
            num_ffn_fcs=2,
            num_heads=8,
            num_cls_fcs=1,
            num_mask_fcs=1,
            feedforward_channels=2048,
            in_channels=256,
            out_channels=256,
            dropout=0.0,
            mask_thr=0.5,
            conv_kernel_size=conv_kernel_size,
            mask_upsample_stride=2,
            ffn_act_cfg=dict(type='ReLU', inplace=True),
            with_ffn=True,
            feat_transform_cfg=dict(
                conv_cfg=dict(type='Conv2d'), act_cfg=None),
            kernel_updator_cfg=dict(
                type='KernelUpdator',
                in_channels=256,
                feat_channels=256,
                out_channels=256,
                input_feat_shape=3,
                act_cfg=dict(type='ReLU', inplace=True),
                norm_cfg=dict(type='LN')),
            loss_mask=dict(
                type='CrossEntropyLoss',
                use_sigmoid=True,
                loss_weight=1.0),
            loss_dice=dict(
                type='DiceLoss', loss_weight=4.0),
            loss_cls=dict(
                type='FocalLoss',
                use_sigmoid=True,
                gamma=2.0,
                alpha=0.25,
                loss_weight=2.0)
        ),

    ),
    # training and testing settings
    # The rpn, rcnn and tracker assigners all use the same cost weights
    # (cls 2.0, dice 4.0, mask 1.0), which equal the head loss weights above.
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaskHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0, pred_act=True)
            ),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1),
        # One assigner/sampler config per roi_head stage.
        rcnn=[
            dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='FocalLossCost', weight=2.0),
                    dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                    mask_cost=dict(type='MaskCost', weight=1.0,
                                   pred_act=True)
                ),
                sampler=dict(type='MaskPseudoSampler'),
                pos_weight=1) for _ in range(num_stages)
        ],
        tracker=dict(
            assigner=dict(
                type='MaskHungarianAssignerVideo',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                dice_cost=dict(type='DiceCost', weight=4.0, pred_act=True),
                mask_cost=dict(type='MaskCost', weight=1.0,
                               pred_act=True)
            ),
            sampler=dict(type='MaskPseudoSampler'),
            pos_weight=1)
    ),
    # rcnn and tracker share identical test-time settings.
    test_cfg=dict(
        rpn=None,
        rcnn=dict(
            max_per_img=10,
            mask_thr=0.5,
            merge_stuff_thing=dict(
                iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3
            )
        ),
        tracker=dict(
            max_per_img=10,
            mask_thr=0.5,
            merge_stuff_thing=dict(
                iou_thr=0.5, stuff_max_area=4096, instance_score_thr=0.3
            ),
        ),
    )
)

# Project modules listed for import together with this config (the
# `custom_imports` convention). The original ordering is kept as-is.
custom_imports = {
    "imports": [
        # knet_vis.det and the shared kernel updator
        "knet_vis.det.knet",
        "knet_vis.det.kernel_head",
        "knet_vis.det.kernel_iter_head",
        "knet_vis.det.kernel_update_head",
        "knet_vis.det.semantic_fpn_wrapper",
        "knet_vis.kernel_updator",
        "knet_vis.det.mask_hungarian_assigner",
        "knet_vis.det.mask_pseudo_sampler",
        # knet_vis.tracker
        "knet_vis.tracker.track",
        "knet_vis.tracker.kernel_head",
        "knet_vis.tracker.kernel_iter_head",
        "knet_vis.tracker.kernel_frame_iter_head",
        "knet_vis.tracker.mask_hungarian_assigner",
        "knet_vis.tracker.kernel_update_head",
        # swin and mmtrack packages
        "swin.swin_transformer",
        "mmtrack.datasets.youtube_vis_dataset",
        "mmtrack.pipelines",
    ],
    # A module that fails to import is not tolerated.
    "allow_failed_imports": False,
}


================================================
FILE: configs/video_knet_vis/_base_/models/knet_track_r50_deformablefpn.py
================================================
# Values reused by several of the sub-configs in `model` below.
num_stages = 3
num_proposals = 100
conv_kernel_size = 1

# KNetTrack model definition: ResNet-50 backbone, MSDeformAttnPixelDecoder
# neck, a kernel (rpn) head, an iterative roi head and a tracker head,
# followed by the matching train/test settings.
model = {
    "type": "KNetTrack",
    "backbone": {
        "type": "ResNet",
        "depth": 50,
        "num_stages": 4,
        "out_indices": (0, 1, 2, 3),
        "frozen_stages": 1,
        "norm_cfg": {"type": "BN", "requires_grad": True},
        "norm_eval": True,
        "style": "pytorch",
        "init_cfg": {
            "type": "Pretrained",
            "checkpoint": "torchvision://resnet50",
        },
    },
    "neck": {
        "type": "MSDeformAttnPixelDecoder",
        "num_outs": 3,
        "norm_cfg": {"type": "GN", "num_groups": 32},
        "act_cfg": {"type": "ReLU"},
        "return_one_list": True,
        "encoder": {
            "type": "DetrTransformerEncoder",
            "num_layers": 6,
            "transformerlayers": {
                "type": "BaseTransformerLayer",
                "attn_cfgs": {
                    "type": "MultiScaleDeformableAttention",
                    "embed_dims": 256,
                    "num_heads": 8,
                    "num_levels": 3,
                    "num_points": 4,
                    "im2col_step": 64,
                    "dropout": 0.0,
                    "batch_first": False,
                    "norm_cfg": None,
                    "init_cfg": None,
                },
                "ffn_cfgs": {
                    "type": "FFN",
                    "embed_dims": 256,
                    "feedforward_channels": 1024,
                    "num_fcs": 2,
                    "ffn_drop": 0.0,
                    "act_cfg": {"type": "ReLU", "inplace": True},
                },
                "operation_order": ("self_attn", "norm", "ffn", "norm"),
            },
            "init_cfg": None,
        },
        "positional_encoding": {
            "type": "SinePositionalEncoding",
            "num_feats": 128,
            "normalize": True,
        },
        "init_cfg": None,
    },
    "rpn_head": {
        "type": "ConvKernelHeadVideo",
        "conv_kernel_size": conv_kernel_size,
        "feat_downsample_stride": 2,
        "feat_refine_stride": 1,
        "feat_refine": False,
        "use_binary": True,
        "num_loc_convs": 1,
        "num_seg_convs": 1,
        "conv_normal_init": True,
        "localization_fpn": {
            "type": "SemanticFPNWrapper",
            "in_channels": 256,
            "feat_channels": 256,
            "out_channels": 256,
            "start_level": 0,
            "end_level": 3,
            "upsample_times": 2,
            "positional_encoding": {
                "type": "SinePositionalEncoding",
                "num_feats": 128,
                "normalize": True,
            },
            "cat_coors": False,
            "cat_coors_level": 3,
            "fuse_by_cat": False,
            "return_list": False,
            "num_aux_convs": 1,
            "norm_cfg": {
                "type": "GN",
                "num_groups": 32,
                "requires_grad": True,
            },
        },
        "num_proposals": num_proposals,
        "proposal_feats_with_obj": True,
        "xavier_init_kernel": False,
        "kernel_init_std": 1,
        "num_cls_fcs": 1,
        "in_channels": 256,
        "num_classes": 40,
        "feat_transform_cfg": None,
        "loss_seg": {
            "type": "FocalLoss",
            "use_sigmoid": True,
            "gamma": 2.0,
            "alpha": 0.25,
            "loss_weight": 1.0,
        },
        "loss_mask": {
            "type": "CrossEntropyLoss",
            "use_sigmoid": True,
            "loss_weight": 1.0,
        },
        "loss_dice": {"type": "DiceLoss", "loss_weight": 4.0},
    },
    "roi_head": {
        "type": "KernelIterHeadVideo",
        "num_stages": num_stages,
        "stage_loss_weights": [1] * num_stages,
        "proposal_feature_channel": 256,
        "num_thing_classes": 40,
        "num_stuff_classes": 0,
        # One separate (but identical) head config per stage.
        "mask_head": [
            {
                "type": "KernelUpdateHead",
                "num_classes": 40,
                "num_thing_classes": 40,
                "num_stuff_classes": 0,
                "num_ffn_fcs": 2,
                "num_heads": 8,
                "num_cls_fcs": 1,
                "num_mask_fcs": 1,
                "feedforward_channels": 2048,
                "in_channels": 256,
                "out_channels": 256,
                "dropout": 0.0,
                "mask_thr": 0.5,
                "conv_kernel_size": conv_kernel_size,
                "mask_upsample_stride": 2,
                "ffn_act_cfg": {"type": "ReLU", "inplace": True},
                "with_ffn": True,
                "feat_transform_cfg": {
                    "conv_cfg": {"type": "Conv2d"},
                    "act_cfg": None,
                },
                "kernel_updator_cfg": {
                    "type": "KernelUpdator",
                    "in_channels": 256,
                    "feat_channels": 256,
                    "out_channels": 256,
                    "input_feat_shape": 3,
                    "act_cfg": {"type": "ReLU", "inplace": True},
                    "norm_cfg": {"type": "LN"},
                },
                "loss_mask": {
                    "type": "CrossEntropyLoss",
                    "use_sigmoid": True,
                    "loss_weight": 1.0,
                },
                "loss_dice": {"type": "DiceLoss", "loss_weight": 4.0},
                "loss_cls": {
                    "type": "FocalLoss",
                    "use_sigmoid": True,
                    "gamma": 2.0,
                    "alpha": 0.25,
                    "loss_weight": 2.0,
                },
            }
            for _ in range(num_stages)
        ],
    },
    "tracker": {
        "type": "KernelFrameIterHeadVideo",
        "num_proposals": num_proposals,
        "num_stages": 3,
        "assign_stages": 2,
        "proposal_feature_channel": 256,
        "stage_loss_weights": (1.0, 1.0, 1.0),
        "num_thing_classes": 40,
        "num_stuff_classes": 0,
        # A single head config here (not a per-stage list).
        "mask_head": {
            "type": "KernelUpdateHeadVideo",
            "num_proposals": num_proposals,
            "num_classes": 40,
            "num_thing_classes": 40,
            "num_stuff_classes": 0,
            "num_ffn_fcs": 2,
            "num_heads": 8,
            "num_cls_fcs": 1,
            "num_mask_fcs": 1,
            "feedforward_channels": 2048,
            "in_channels": 256,
            "out_channels": 256,
            "dropout": 0.0,
            "mask_thr": 0.5,
            "conv_kernel_size": conv_kernel_size,
            "mask_upsample_stride": 2,
            "ffn_act_cfg": {"type": "ReLU", "inplace": True},
            "with_ffn": True,
            "feat_transform_cfg": {
                "conv_cfg": {"type": "Conv2d"},
                "act_cfg": None,
            },
            "kernel_updator_cfg": {
                "type": "KernelUpdator",
                "in_channels": 256,
                "feat_channels": 256,
                "out_channels": 256,
                "input_feat_shape": 3,
                "act_cfg": {"type": "ReLU", "inplace": True},
                "norm_cfg": {"type": "LN"},
            },
            "loss_mask": {
                "type": "CrossEntropyLoss",
                "use_sigmoid": True,
                "loss_weight": 1.0,
            },
            "loss_dice": {"type": "DiceLoss", "loss_weight": 4.0},
            "loss_cls": {
                "type": "FocalLoss",
                "use_sigmoid": True,
                "gamma": 2.0,
                "alpha": 0.25,
                "loss_weight": 2.0,
            },
        },
    },
    # training and testing settings
    "train_cfg": {
        "rpn": {
            "assigner": {
                "type": "MaskHungarianAssigner",
                "cls_cost": {"type": "FocalLossCost", "weight": 2.0},
                "dice_cost": {
                    "type": "DiceCost",
                    "weight": 4.0,
                    "pred_act": True,
                },
                "mask_cost": {
                    "type": "MaskCost",
                    "weight": 1.0,
                    "pred_act": True,
                },
            },
            "sampler": {"type": "MaskPseudoSampler"},
            "pos_weight": 1,
        },
        # One assigner/sampler entry per roi-head stage.
        "rcnn": [
            {
                "assigner": {
                    "type": "MaskHungarianAssigner",
                    "cls_cost": {"type": "FocalLossCost", "weight": 2.0},
                    "dice_cost": {
                        "type": "DiceCost",
                        "weight": 4.0,
                        "pred_act": True,
                    },
                    "mask_cost": {
                        "type": "MaskCost",
                        "weight": 1.0,
                        "pred_act": True,
                    },
                },
                "sampler": {"type": "MaskPseudoSampler"},
                "pos_weight": 1,
            }
            for _ in range(num_stages)
        ],
        "tracker": {
            "assigner": {
                "type": "MaskHungarianAssignerVideo",
                "cls_cost": {"type": "FocalLossCost", "weight": 2.0},
                "dice_cost": {
                    "type": "DiceCost",
                    "weight": 4.0,
                    "pred_act": True,
                },
                "mask_cost": {
                    "type": "MaskCost",
                    "weight": 1.0,
                    "pred_act": True,
                },
            },
            "sampler": {"type": "MaskPseudoSampler"},
            "pos_weight": 1,
        },
    },
    "test_cfg": {
        "rpn": None,
        "rcnn": {
            "max_per_img": 10,
            "mask_thr": 0.5,
            "merge_stuff_thing": {
                "iou_thr": 0.5,
                "stuff_max_area": 4096,
                "instance_score_thr": 0.3,
            },
        },
        "tracker": {
            "max_per_img": 10,
            "mask_thr": 0.5,
            "merge_stuff_thing": {
                "iou_thr": 0.5,
                "stuff_max_area": 4096,
                "instance_score_thr": 0.3,
            },
        },
    },
}

# Project modules listed for import together with this config (the
# `custom_imports` convention). The original ordering is kept as-is.
custom_imports = {
    "imports": [
        # knet_vis.det and the shared kernel updator
        "knet_vis.det.knet",
        "knet_vis.det.kernel_head",
        "knet_vis.det.kernel_iter_head",
        "knet_vis.det.kernel_update_head",
        "knet_vis.det.semantic_fpn_wrapper",
        "knet_vis.kernel_updator",
        # Note: this one comes from the `knet` package, not `knet_vis`.
        "knet.det.msdeformattn_decoder",
        "knet_vis.det.mask_hungarian_assigner",
        "knet_vis.det.mask_pseudo_sampler",
        # knet_vis.tracker
        "knet_vis.tracker.track",
        "knet_vis.tracker.kernel_head",
        "knet_vis.tracker.kernel_iter_head",
        "knet_vis.tracker.kernel_frame_iter_head",
        "knet_vis.tracker.mask_hungarian_assigner",
        "knet_vis.tracker.kernel_update_head",
        # swin and mmtrack packages
        "swin.swin_transformer",
        "mmtrack.datasets.youtube_vis_dataset",
        "mmtrack.pipelines",
    ],
    # A module that fails to import is not tolerated.
    "allow_failed_imports": False,
}


================================================
FILE: configs/video_knet_vis/_base_/schedules/schedule_0.75x.py
================================================
# Optimizer: AdamW, with lr_mult=0.25 attached to the 'backbone' key.
optimizer = {
    "type": "AdamW",
    "lr": 0.0001,
    "weight_decay": 0.05,
    "paramwise_cfg": {
        "custom_keys": {
            "backbone": {"lr_mult": 0.25},
        },
    },
}
# Gradient clipping: max_norm=1 measured with norm_type=2.
optimizer_config = {
    "grad_clip": {"max_norm": 1, "norm_type": 2},
}

# Learning policy: step schedule with a 1000-iteration linear warmup and
# steps at 5 and 7; the epoch-based runner stops after 8 epochs.
lr_config = {
    "policy": "step",
    "warmup": "linear",
    "warmup_iters": 1000,
    "warmup_ratio": 0.001,
    "step": [5, 7],
}
runner = {"type": "EpochBasedRunner", "max_epochs": 8}


================================================
FILE: configs/video_knet_vis/_base_/schedules/schedule_1x.py
================================================
# Optimizer: AdamW, with lr_mult=0.25 attached to the 'backbone' key.
optimizer = {
    "type": "AdamW",
    "lr": 0.0001,
    "weight_decay": 0.05,
    "paramwise_cfg": {
        "custom_keys": {
            "backbone": {"lr_mult": 0.25},
        },
    },
}
# Gradient clipping: max_norm=1 measured with norm_type=2.
optimizer_config = {
    "grad_clip": {"max_norm": 1, "norm_type": 2},
}

# Learning policy: step schedule with a 1000-iteration linear warmup and
# steps at 8 and 11; the epoch-based runner stops after 12 epochs.
lr_config = {
    "policy": "step",
    "warmup": "linear",
    "warmup_iters": 1000,
    "warmup_ratio": 0.001,
    "step": [8, 11],
}
runner = {"type": "EpochBasedRunner", "max_epochs": 12}


================================================
FILE: configs/video_knet_vis/_base_/schedules/schedule_8e.py
================================================
# Optimizer: AdamW, with lr_mult=0.25 attached to the 'backbone' key.
optimizer = {
    "type": "AdamW",
    "lr": 0.0001,
    "weight_decay": 0.05,
    "paramwise_cfg": {
        "custom_keys": {
            "backbone": {"lr_mult": 0.25},
        },
    },
}
# Gradient clipping: max_norm=1 measured with norm_type=2.
optimizer_config = {
    "grad_clip": {"max_norm": 1, "norm_type": 2},
}

# Learning policy: step schedule with a 1000-iteration linear warmup and a
# single step at 7; the epoch-based runner stops after 8 epochs.
lr_config = {
    "policy": "step",
    "warmup": "linear",
    "warmup_iters": 1000,
    "warmup_ratio": 0.001,
    "step": [7],
}
runner = {"type": "EpochBasedRunner", "max_epochs": 8}


================================================
FILE: configs/video_knet_vis/common/mstrain_3x_coco_instance.py
================================================
_base_ = "../_base_/default_runtime.py"

# Dataset settings.
dataset_type = "CocoDataset"
data_root = "data/coco/"
img_norm_cfg = {
    "mean": [123.675, 116.28, 103.53],
    "std": [58.395, 57.12, 57.375],
    "to_rgb": True,
}

# Multi-scale training ("mstrain 3x"): the Resize step below is configured
# with img_scale=[(1333, 640), (1333, 800)] and multiscale_mode='range'.
train_pipeline = [
    {"type": "LoadImageFromFile"},
    {"type": "LoadAnnotations", "with_bbox": True, "with_mask": True},
    {
        "type": "Resize",
        "img_scale": [(1333, 640), (1333, 800)],
        "multiscale_mode": "range",
        "keep_ratio": True,
    },
    {"type": "RandomFlip", "flip_ratio": 0.5},
    {"type": "Normalize", **img_norm_cfg},
    {"type": "Pad", "size_divisor": 32},
    {"type": "DefaultFormatBundle"},
    {
        "type": "Collect",
        "keys": ["img", "gt_bboxes", "gt_labels", "gt_masks"],
    },
]

# Test-time pipeline: a single (1333, 800) scale with flip disabled.
test_pipeline = [
    {"type": "LoadImageFromFile"},
    {
        "type": "MultiScaleFlipAug",
        "img_scale": (1333, 800),
        "flip": False,
        "transforms": [
            {"type": "Resize", "keep_ratio": True},
            {"type": "RandomFlip"},
            {"type": "Normalize", **img_norm_cfg},
            {"type": "Pad", "size_divisor": 32},
            {"type": "ImageToTensor", "keys": ["img"]},
            {"type": "Collect", "keys": ["img"]},
        ],
    },
]

# Use RepeatDataset to speed up training; val and test both read the
# val2017 annotations and images.
data = {
    "samples_per_gpu": 2,
    "workers_per_gpu": 2,
    "train": {
        "type": "RepeatDataset",
        "times": 3,
        "dataset": {
            "type": dataset_type,
            "ann_file": data_root + "annotations/instances_train2017.json",
            "img_prefix": data_root + "train2017/",
            "pipeline": train_pipeline,
        },
    },
    "val": {
        "type": dataset_type,
        "ann_file": data_root + "annotations/instances_val2017.json",
        "img_prefix": data_root + "val2017/",
        "pipeline": test_pipeline,
    },
    "test": {
        "type": dataset_type,
        "ann_file": data_root + "annotations/instances_val2017.json",
        "img_prefix": data_root + "val2017/",
        "pipeline": test_pipeline,
    },
}
evaluation = {"interval": 1, "metric": ["segm"]}

# Optimizer: AdamW is used here, unlike the original 1x schedule, which
# uses SGD. lr_mult=0.25 is attached to the 'backbone' key.
optimizer = {
    "type": "AdamW",
    "lr": 0.0001,
    "weight_decay": 0.05,
    "paramwise_cfg": {
        "custom_keys": {
            "backbone": {"lr_mult": 0.25},
        },
    },
}
optimizer_config = {
    "grad_clip": {"max_norm": 1, "norm_type": 2},
}

# Learning policy: experiments showed higher performance with
# step=[9, 11], so that is the setting used.
lr_config = {
    "policy": "step",
    "warmup": "linear",
    "warmup_iters": 1000,
    "warmup_ratio": 0.001,
    "step": [9, 11],
}
runner = {"type": "EpochBasedRunner", "max_epochs": 12}


================================================
FILE: configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py
================================================
# Base configs this experiment is composed from: the 1x schedule, the
# default runtime, the knet_track_r50 model and the YouTube-VIS 2019
# dataset. Nothing is overridden in this file.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_track_r50.py',
    '../_base_/datasets/youtubevis_2019.py',
]

================================================
FILE: configs/video_knet_vis/video_knet_vis/knet_track_r50_deformable_fpn_1x_youtubevis.py
================================================
# Base configs this experiment is composed from: the 1x schedule, the
# default runtime, the knet_track_r50_deformablefpn model and the
# YouTube-VIS 2019 dataset.
_base_ = [
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_track_r50_deformablefpn.py',
    '../_base_/datasets/youtubevis_2019.py',
]


# Per-GPU data loading settings set by this experiment file.
data = {
    "samples_per_gpu": 1,
    "workers_per_gpu": 2,
}


================================================
FILE: configs/video_knet_vis/video_knet_vis/knet_track_swinb_1x_youtubevis_8e.py
================================================
# Base configs this experiment is composed from: the 8-epoch schedule, the
# default runtime, the knet_track_r50 model and the YouTube-VIS 2019
# dataset.
_base_ = [
    '../_base_/schedules/schedule_8e.py',
    '../_base_/default_runtime.py',
    '../_base_/models/knet_track_r50.py',
    '../_base_/datasets/youtubevis_2019.py',
]

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformerDIY',
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        use_abs_pos_embed=False,
Download .txt
gitextract_0ef7ckct/

├── .gitignore
├── DATASET.md
├── LICENSE
├── README.md
├── configs/
│   ├── det/
│   │   ├── _base_/
│   │   │   ├── datasets/
│   │   │   │   ├── cityscapes_panoptic.py
│   │   │   │   ├── cityscapes_step.py
│   │   │   │   ├── cityscapes_vps_clips.py
│   │   │   │   ├── cityscapes_vps_clips_trainval.py
│   │   │   │   ├── coco_instance.py
│   │   │   │   ├── coco_panoptic.py
│   │   │   │   ├── coco_panoptic_instance_annotations.py
│   │   │   │   ├── kitti_step_dvps.py
│   │   │   │   ├── kitti_step_vps.py
│   │   │   │   ├── kitti_step_vps_trainval.py
│   │   │   │   ├── mapillary_panoptic.py
│   │   │   │   └── vipseg_dvps.py
│   │   │   ├── default_runtime.py
│   │   │   ├── models/
│   │   │   │   ├── knet_citystep_s3_r50_fpn.py
│   │   │   │   ├── knet_kitti_step_s3_r50_fpn.py
│   │   │   │   ├── knet_s3_r50_deformable_fpn.py
│   │   │   │   ├── knet_s3_r50_fpn.py
│   │   │   │   ├── knet_s3_r50_fpn_panoptic.py
│   │   │   │   ├── knet_vipseg_s3_r50_fpn.py
│   │   │   │   └── video_knet_s3_r50_fpn_panoptic.py
│   │   │   └── schedules/
│   │   │       ├── schedule_10e.py
│   │   │       └── schedule_1x.py
│   │   ├── coco/
│   │   │   ├── knet_s3_r50_deformable_fpn_ms-3x_coco.py
│   │   │   ├── knet_s3_r50_fpn_ms-3x_coco-panoptic.py
│   │   │   ├── knet_s3_r50_fpn_ms-3x_coco.py
│   │   │   └── knet_s3_swin-b_deformable_fpn_ms-3x_coco.py
│   │   ├── common/
│   │   │   ├── lsj_coco_panoptic_50e.py
│   │   │   ├── mstrain_3x_coco_instance.py
│   │   │   ├── mstrain_3x_coco_panoptic_inst_anno.py
│   │   │   ├── mstrain_3x_coco_panoptic_inst_anno_detr_aug.py
│   │   │   └── mstrain_64e_city_panoptic.py
│   │   ├── knet_cityscapes_step/
│   │   │   ├── knet_s3_r50_fpn.py
│   │   │   ├── knet_s3_swin_b_fpn.py
│   │   │   └── knet_s3_swin_l_fpn.py
│   │   ├── video_knet_kitti_step/
│   │   │   ├── video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py
│   │   │   ├── video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train_8e.py
│   │   │   ├── video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
│   │   │   ├── video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py
│   │   │   └── video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_update_conv_short_track_fc.py
│   │   └── video_knet_vipseg/
│   │       ├── video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py
│   │       └── video_knet_s3_swin_b_rpn_vipseg_mask_embed_link_ffn_joint_train_8e.py
│   └── video_knet_vis/
│       ├── _base_/
│       │   ├── datasets/
│       │   │   ├── coco_instance.py
│       │   │   └── youtubevis_2019.py
│       │   ├── default_runtime.py
│       │   ├── models/
│       │   │   ├── knet_track_r50.py
│       │   │   └── knet_track_r50_deformablefpn.py
│       │   └── schedules/
│       │       ├── schedule_0.75x.py
│       │       ├── schedule_1x.py
│       │       └── schedule_8e.py
│       ├── common/
│       │   └── mstrain_3x_coco_instance.py
│       └── video_knet_vis/
│           ├── knet_track_r50_1x_youtubevis.py
│           ├── knet_track_r50_deformable_fpn_1x_youtubevis.py
│           ├── knet_track_swinb_1x_youtubevis_8e.py
│           └── knet_track_swinb_deformable_1x_youtubevis.py
├── external/
│   ├── cityscape_panoptic.py
│   ├── cityscapes_step.py
│   ├── cityscapes_vps.py
│   ├── coco_panoptic.py
│   ├── dataset/
│   │   ├── dvps_pipelines/
│   │   │   ├── __init__.py
│   │   │   ├── loading.py
│   │   │   ├── transforms.py
│   │   │   └── tricks.py
│   │   ├── forecasting_pipelines/
│   │   │   ├── __init__.py
│   │   │   ├── loading.py
│   │   │   └── transforms.py
│   │   ├── mIoU.py
│   │   └── pipelines/
│   │       ├── __init__.py
│   │       ├── formatting.py
│   │       ├── loading.py
│   │       ├── test_time_aug.py
│   │       └── transforms.py
│   ├── evalhooks.py
│   ├── ext/
│   │   ├── mask.py
│   │   └── ytvos.py
│   ├── fcn_mask_head.py
│   ├── kitti_step_dvps.py
│   ├── panoptic_fpn.py
│   ├── panoptic_head.py
│   ├── semantic_seg_head.py
│   ├── semkitti_dvps.py
│   ├── test.py
│   ├── train.py
│   ├── utils.py
│   └── vipseg_dvps.py
├── knet/
│   ├── __init__.py
│   ├── cross_entropy_loss.py
│   ├── det/
│   │   ├── dice_loss.py
│   │   ├── kernel_head.py
│   │   ├── kernel_iter_head.py
│   │   ├── kernel_update_head.py
│   │   ├── knet.py
│   │   ├── mask_hungarian_assigner.py
│   │   ├── mask_pseudo_sampler.py
│   │   ├── msdeformattn_decoder.py
│   │   ├── semantic_fpn_wrapper.py
│   │   └── utils.py
│   ├── kernel_updator.py
│   └── video/
│       ├── __init__.py
│       ├── dice_loss.py
│       ├── kernel_head.py
│       ├── kernel_iter_head.py
│       ├── kernel_update_head.py
│       ├── knet.py
│       ├── knet_quansi_dense.py
│       ├── knet_quansi_dense_embed_fc.py
│       ├── knet_quansi_dense_embed_fc_joint_train.py
│       ├── knet_quansi_dense_embed_fc_toy_exp.py
│       ├── knet_quansi_dense_roi_gt_box.py
│       ├── knet_quansi_dense_roi_gt_box_joint_train.py
│       ├── knet_track_head.py
│       ├── knet_track_head_roi_align.py
│       ├── knet_uni_track.py
│       ├── mask_hungarian_assigner.py
│       ├── mask_pseudo_sampler.py
│       ├── qdtrack/
│       │   ├── builder.py
│       │   ├── losses/
│       │   │   ├── __init__.py
│       │   │   ├── l2_loss.py
│       │   │   └── multipos_cross_entropy_loss.py
│       │   ├── track/
│       │   │   ├── __init__.py
│       │   │   ├── similarity.py
│       │   │   └── transforms.py
│       │   └── trackers/
│       │       ├── __init__.py
│       │       ├── quasi_dense_embed_tracker.py
│       │       └── tao_tracker.py
│       ├── track_heads.py
│       ├── tracker.py
│       └── util.py
├── knet_vis/
│   ├── __init__.py
│   ├── det/
│   │   ├── __init__.py
│   │   ├── kernel_head.py
│   │   ├── kernel_iter_head.py
│   │   ├── kernel_update_head.py
│   │   ├── knet.py
│   │   ├── mask_hungarian_assigner.py
│   │   ├── mask_pseudo_sampler.py
│   │   ├── semantic_fpn_wrapper.py
│   │   └── utils.py
│   ├── kernel_updator.py
│   └── tracker/
│       ├── __init__.py
│       ├── kernel_frame_head.py
│       ├── kernel_frame_iter_head.py
│       ├── kernel_head.py
│       ├── kernel_iter_head.py
│       ├── kernel_update_head.py
│       ├── mask_hungarian_assigner.py
│       ├── positional_encoding.py
│       ├── semantic_fpn_wrapper3D.py
│       └── track.py
├── mmtrack/
│   ├── datasets/
│   │   ├── coco_video_dataset.py
│   │   ├── parsers/
│   │   │   ├── __init__.py
│   │   │   └── coco_video_parser.py
│   │   └── youtube_vis_dataset.py
│   ├── pipelines/
│   │   ├── __init__.py
│   │   ├── formatting.py
│   │   ├── loading.py
│   │   ├── test_time_aug.py
│   │   └── transforms.py
│   └── transform.py
├── scripts/
│   ├── kitti_step_prepare.py
│   └── visualizer.py
├── swin/
│   ├── DetectRS.py
│   ├── ckpt_convert.py
│   ├── mix_transformer.py
│   ├── swin_checkpoint.py
│   ├── swin_transformer.py
│   ├── swin_transformer_rfp.py
│   └── transformer.py
├── tools/
│   ├── dataset/
│   │   ├── cityscapes_instance_idmap.py
│   │   └── youtubevis2coco.py
│   ├── dist_step_test.sh
│   ├── dist_test.sh
│   ├── dist_train.sh
│   ├── dist_train_new.sh
│   ├── dist_vps_test.sh
│   ├── docker.sh
│   ├── eval_dstq.py
│   ├── eval_dstq_step.py
│   ├── eval_dstq_vipseg.py
│   ├── eval_dvpq_step.py
│   ├── eval_dvpq_vipseg.py
│   ├── flops_counter.py
│   ├── get_flops.py
│   ├── inference_kitti_step.sh
│   ├── slurm_test.sh
│   ├── slurm_test_dvps.sh
│   ├── slurm_test_step.sh
│   ├── slurm_test_vis.sh
│   ├── slurm_test_vps.sh
│   ├── slurm_train.sh
│   ├── test.py
│   ├── test_dvps.py
│   ├── test_step.py
│   ├── test_vps.py
│   ├── train.py
│   ├── utils/
│   │   ├── DSTQ.py
│   │   ├── STQ.py
│   │   └── cityscapesvps_eval.py
│   └── visualization.py
├── tools_vis/
│   ├── apis/
│   │   ├── __init__.py
│   │   └── test.py
│   ├── dist_test_whole_video.sh
│   ├── docker.sh
│   ├── slurm_test_vis.sh
│   ├── test.py
│   └── test_whole_video.py
└── unitrack/
    ├── __init__.py
    ├── basetrack.py
    ├── box.py
    ├── core/
    │   ├── __init__.py
    │   ├── association/
    │   │   ├── __init__.py
    │   │   └── matching.py
    │   ├── motion/
    │   │   └── kalman_filter.py
    │   └── propagation/
    │       ├── __init__.py
    │       ├── propagate_box.py
    │       ├── propagate_mask.py
    │       └── propagate_pose.py
    ├── mask.py
    ├── mask_with_train_embs.py
    ├── model/
    │   ├── __init__.py
    │   ├── functional.py
    │   ├── hrnet.py
    │   ├── model.py
    │   ├── random_feat_generator.py
    │   └── resnet.py
    ├── multitracker.py
    └── utils/
        ├── __init__.py
        ├── box.py
        ├── io.py
        ├── log.py
        ├── mask.py
        ├── meter.py
        ├── palette.py
        └── visualize.py
Download .txt
SYMBOL INDEX (1528 symbols across 144 files)

FILE: external/cityscape_panoptic.py
  class CityscapesPanopticDataset (line 24) | class CityscapesPanopticDataset(CocoDataset):
    method load_annotations (line 29) | def load_annotations(self, ann_file):
    method _filter_imgs (line 83) | def _filter_imgs(self, min_size=32):
    method _parse_ann_info (line 111) | def _parse_ann_info(self, img_info, ann_info):
    method _panoptic2json (line 164) | def _panoptic2json(self, results, outfile_prefix):
    method results2json (line 202) | def results2json(self, results, outfile_prefix):
    method results2txt (line 256) | def results2txt(self, results, outfile_prefix):
    method format_results (line 322) | def format_results(self, results, jsonfile_prefix="./test", **kwargs):
    method evaluate (line 350) | def evaluate(self,
    method _evaluate_cityscapes (line 574) | def _evaluate_cityscapes(self, results, txtfile_prefix, logger):

FILE: external/cityscapes_step.py
  class CityscapesSTEP (line 12) | class CityscapesSTEP:
    method __init__ (line 18) | def __init__(
    method pre_pipeline (line 70) | def pre_pipeline(self, results):
    method prepare_test_img (line 78) | def prepare_test_img(self, idx):
    method prepare_val_annotation (line 89) | def prepare_val_annotation(self, idx):
    method prepare_train_img (line 102) | def prepare_train_img(self, idx):
    method __getitem__ (line 120) | def __getitem__(self, idx):
    method _rand_another (line 141) | def _rand_another(self, idx):
    method __len__ (line 146) | def __len__(self):
    method _set_groups (line 149) | def _set_groups(self):
    method evaluate (line 153) | def evaluate(
  function vpq_eval (line 247) | def vpq_eval(element):

FILE: external/cityscapes_vps.py
  class CityscapesVPSDataset (line 24) | class CityscapesVPSDataset(CocoDataset):
    method __init__ (line 25) | def __init__(self,
    method load_ref_annotations (line 56) | def load_ref_annotations(self, ann_file):
    method load_annotations (line 71) | def load_annotations(self, ann_file):
    method _filter_imgs (line 124) | def _filter_imgs(self, min_size=32):
    method prepare_train_img (line 152) | def prepare_train_img(self, idx):
    method check_whether_has_correspondence (line 194) | def check_whether_has_correspondence(self, ref_iid, iid):
    method check_match (line 206) | def check_match(self, ref_ann_info, ann_info):
    method prepare_test_img (line 213) | def prepare_test_img(self, idx):
    method pre_pipeline (line 234) | def pre_pipeline(self, results):
    method pre_test_pipeline (line 248) | def pre_test_pipeline(self, results):
    method _parse_ann_info (line 258) | def _parse_ann_info(self, img_info, ann_info):
    method get_ref_ann_info_by_iid (line 319) | def get_ref_ann_info_by_iid(self, img_id, ref_img_info):
    method _panoptic2json (line 324) | def _panoptic2json(self, results, outfile_prefix):
    method results2json (line 358) | def results2json(self, results, outfile_prefix):
    method results2txt (line 412) | def results2txt(self, results, outfile_prefix):
    method format_results (line 478) | def format_results(self, results, jsonfile_prefix=None, **kwargs):
    method evaluate (line 506) | def evaluate(self,
    method _evaluate_cityscapes (line 730) | def _evaluate_cityscapes(self, results, txtfile_prefix, logger):

FILE: external/coco_panoptic.py
  class CocoPanopticDatasetCustom (line 19) | class CocoPanopticDatasetCustom(CocoDataset):
    method load_annotations (line 21) | def load_annotations(self, ann_file):
    method get_ann_info (line 72) | def get_ann_info(self, idx):
    method get_cat_ids (line 87) | def get_cat_ids(self, idx):
    method _parse_ann_info (line 102) | def _parse_ann_info(self, img_info, ann_info):
    method _panoptic2json (line 161) | def _panoptic2json(self, results, outfile_prefix):
    method results2json (line 188) | def results2json(self, results, outfile_prefix):
    method format_results (line 242) | def format_results(self, results, jsonfile_prefix=None, **kwargs):
    method evaluate (line 270) | def evaluate(self,
  function parse_pq_results (line 484) | def parse_pq_results(pq_res):
  function _print_panoptic_results (line 498) | def _print_panoptic_results(pq_res):

FILE: external/dataset/dvps_pipelines/loading.py
  function bitmasks2bboxes (line 7) | def bitmasks2bboxes(bitmasks):
  class LoadImgDirect (line 21) | class LoadImgDirect:
    method __init__ (line 25) | def __init__(self,
    method __call__ (line 31) | def __call__(self, results):
    method __repr__ (line 54) | def __repr__(self):
  class LoadMultiImagesDirect (line 62) | class LoadMultiImagesDirect(LoadImgDirect):
    method __init__ (line 68) | def __init__(self, *args, **kwargs):
    method __call__ (line 71) | def __call__(self, results):
  class LoadAnnotationsDirect (line 89) | class LoadAnnotationsDirect:
    method __init__ (line 93) | def __init__(self,
    method __call__ (line 110) | def __call__(self, results):
  class LoadMultiAnnotationsDirect (line 225) | class LoadMultiAnnotationsDirect(LoadAnnotationsDirect):
    method __init__ (line 226) | def __init__(self, *args, **kwargs):
    method __call__ (line 229) | def __call__(self, results):

FILE: external/dataset/dvps_pipelines/transforms.py
  class ResizeWithDepth (line 8) | class ResizeWithDepth(Resize):
    method __init__ (line 12) | def __init__(self, *args, **kwargs):
    method _resize_depth (line 16) | def _resize_depth(self, results):
    method __call__ (line 34) | def __call__(self, results):
  class SeqResizeWithDepth (line 41) | class SeqResizeWithDepth(ResizeWithDepth):
    method __init__ (line 50) | def __init__(self, share_params=True, *args, **kwargs):
    method __call__ (line 54) | def __call__(self, results):
  class RandomFlipWithDepth (line 78) | class RandomFlipWithDepth(RandomFlip):
    method __init__ (line 79) | def __init__(self, *args, **kwargs):
    method __call__ (line 82) | def __call__(self, results):
  class SeqFlipWithDepth (line 92) | class SeqFlipWithDepth(RandomFlipWithDepth):
    method __init__ (line 101) | def __init__(self, share_params=True, *args, **kwargs):
    method __call__ (line 105) | def __call__(self, results):
  class SeqRandomCropWithDepth (line 149) | class SeqRandomCropWithDepth(object):
    method __init__ (line 173) | def __init__(self,
    method get_offsets (line 196) | def get_offsets(self, img):
    method random_crop (line 204) | def random_crop(self, results, offsets=None):
    method __call__ (line 272) | def __call__(self, results):
    method check_match (line 305) | def check_match(self, ref_results, results):
  class PadWithDepth (line 314) | class PadWithDepth(Pad):
    method _pad_depth (line 316) | def _pad_depth(self, results):
    method _pad_seg (line 325) | def _pad_seg(self, results):
    method __call__ (line 335) | def __call__(self, results):
  class SeqPadWithDepth (line 352) | class SeqPadWithDepth(PadWithDepth):
    method __init__ (line 358) | def __init__(self, *args, **kwargs):
    method __call__ (line 361) | def __call__(self, results):
  class SeqNormalizeWithDepth (line 381) | class SeqNormalizeWithDepth(Normalize):
    method __init__ (line 387) | def __init__(self, *args, **kwargs):
    method __call__ (line 390) | def __call__(self, results):

FILE: external/dataset/dvps_pipelines/tricks.py
  class SeqAutoAug (line 7) | class SeqAutoAug(AutoAugment):
    method __init__ (line 11) | def __init__(self, policies):
    method __call__ (line 14) | def __call__(self, results):

FILE: external/dataset/forecasting_pipelines/loading.py
  function bitmasks2bboxes (line 8) | def bitmasks2bboxes(bitmasks):
  class LoadMultiImagesFromFile (line 22) | class LoadMultiImagesFromFile:
    method __init__ (line 39) | def __init__(self,
    method __call__ (line 48) | def __call__(self, results):
    method __repr__ (line 75) | def __repr__(self):
  class LoadAnnotationsInstanceMasks (line 84) | class LoadAnnotationsInstanceMasks:
    method __init__ (line 85) | def __init__(self,
    method _load_masks (line 96) | def _load_masks(self, results):
    method _load_semantic_seg (line 131) | def _load_semantic_seg(self, results):
    method __call__ (line 144) | def __call__(self, results):
    method __repr__ (line 162) | def __repr__(self):

FILE: external/dataset/forecasting_pipelines/transforms.py
  class NormalizeMultiple (line 8) | class NormalizeMultiple:
    method __init__ (line 20) | def __init__(self, mean, std, to_rgb=True):
    method __call__ (line 25) | def __call__(self, results):
    method __repr__ (line 51) | def __repr__(self):
  class PadFutureMMDet (line 58) | class PadFutureMMDet:
    method __init__ (line 72) | def __init__(self,
    method _pad_img (line 98) | def _pad_img(self, results):
    method _pad_masks (line 116) | def _pad_masks(self, results):
    method _pad_seg (line 123) | def _pad_seg(self, results):
    method __call__ (line 131) | def __call__(self, results):
    method __repr__ (line 143) | def __repr__(self):
  class KNetInsAdapter (line 153) | class KNetInsAdapter:
    method __init__ (line 158) | def __init__(self, stuff_nums=11):
    method __call__ (line 161) | def __call__(self, results):

FILE: external/dataset/mIoU.py
  function eval_miou (line 4) | def eval_miou(results, targets, num_classes, ignore_index=255):

FILE: external/dataset/pipelines/formatting.py
  class ConcatVideoReferences (line 9) | class ConcatVideoReferences(object):
    method __call__ (line 27) | def __call__(self, results):
  class ConcatVideos (line 91) | class ConcatVideos(object):
    method __call__ (line 109) | def __call__(self, results):
  class MultiImagesToTensor (line 165) | class MultiImagesToTensor(object):
    method __init__ (line 177) | def __init__(self, ref_prefix='ref'):
    method __call__ (line 180) | def __call__(self, results):
    method images_to_tensor (line 208) | def images_to_tensor(self, results):
  class SeqDefaultFormatBundle (line 227) | class SeqDefaultFormatBundle(object):
    method __init__ (line 252) | def __init__(self, ref_prefix='ref'):
    method __call__ (line 255) | def __call__(self, results):
    method default_format_bundle (line 289) | def default_format_bundle(self, results):
    method __repr__ (line 336) | def __repr__(self):
  class VideoCollect (line 341) | class VideoCollect(object):
    method __init__ (line 355) | def __init__(self,
    method __call__ (line 379) | def __call__(self, results):
    method _collect_meta_keys (line 416) | def _collect_meta_keys(self, results):
    method _add_default_meta_keys (line 430) | def _add_default_meta_keys(self, results):
  class ToList (line 457) | class ToList(object):
    method __call__ (line 467) | def __call__(self, results):
  class ReIDFormatBundle (line 475) | class ReIDFormatBundle(object):
    method __init__ (line 486) | def __init__(self, *args, **kwargs):
    method __call__ (line 489) | def __call__(self, results):
    method reid_format_bundle (line 517) | def reid_format_bundle(self, results):
  class ImageToTensorWithRef (line 544) | class ImageToTensorWithRef(object):
    method __init__ (line 546) | def __init__(self, keys):
    method __call__ (line 549) | def __call__(self, results):
    method __repr__ (line 567) | def __repr__(self):
  class LabelConsistentChecker (line 571) | class LabelConsistentChecker:
    method __init__ (line 574) | def __init__(self, num_frames=5):
    method __call__ (line 577) | def __call__(self, results):

FILE: external/dataset/pipelines/loading.py
  class LoadMultiImagesFromFile (line 12) | class LoadMultiImagesFromFile(LoadImageFromFile):
    method __init__ (line 18) | def __init__(self, *args, **kwargs):
    method __call__ (line 21) | def __call__(self, results):
  class SeqLoadAnnotations (line 39) | class SeqLoadAnnotations(LoadAnnotations):
    method __init__ (line 47) | def __init__(self, with_track=False, *args, **kwargs):
    method _load_track (line 51) | def _load_track(self, results):
    method __call__ (line 63) | def __call__(self, results):
  class LoadRefImageFromFile (line 85) | class LoadRefImageFromFile(object):
    method __init__ (line 91) | def __init__(self, sample=True, to_float32=False):
    method __call__ (line 95) | def __call__(self, results):
    method __repr__ (line 123) | def __repr__(self):
  function bitmasks2bboxes (line 128) | def bitmasks2bboxes(bitmasks):
  class LoadAnnotationsInstanceMasks (line 142) | class LoadAnnotationsInstanceMasks:
    method __init__ (line 143) | def __init__(self,
    method _load_masks (line 156) | def _load_masks(self, results):
    method _load_semantic_seg (line 193) | def _load_semantic_seg(self, results):
    method __call__ (line 206) | def __call__(self, results):
    method __repr__ (line 224) | def __repr__(self):

FILE: external/dataset/pipelines/test_time_aug.py
  class MultiScaleFlipAugVideo (line 11) | class MultiScaleFlipAugVideo:
    method __init__ (line 47) | def __init__(self,
    method __call__ (line 78) | def __call__(self, results):
    method __repr__ (line 110) | def __repr__(self):

FILE: external/dataset/pipelines/transforms.py
  class SeqColorAug (line 10) | class SeqColorAug(object):
    method __init__ (line 21) | def __init__(self,
    method __call__ (line 29) | def __call__(self, results):
  class SeqBlurAug (line 56) | class SeqBlurAug(object):
    method __init__ (line 63) | def __init__(self, prob=[0.0, 0.2]):
    method __call__ (line 66) | def __call__(self, results):
  class SeqResize (line 96) | class SeqResize(Resize):
    method __init__ (line 105) | def __init__(self, share_params=True, *args, **kwargs):
    method __call__ (line 109) | def __call__(self, results):
  class SeqNormalize (line 133) | class SeqNormalize(Normalize):
    method __init__ (line 139) | def __init__(self, *args, **kwargs):
    method __call__ (line 142) | def __call__(self, results):
  class SeqRandomFlip (line 161) | class SeqRandomFlip(RandomFlip):
    method __init__ (line 170) | def __init__(self, share_params, *args, **kwargs):
    method __call__ (line 174) | def __call__(self, results):
  class SeqPad (line 218) | class SeqPad(Pad):
    method __init__ (line 224) | def __init__(self, *args, **kwargs):
    method __call__ (line 227) | def __call__(self, results):
  class SeqRandomCrop (line 246) | class SeqRandomCrop(object):
    method __init__ (line 270) | def __init__(self,
    method get_offsets (line 293) | def get_offsets(self, img):
    method random_crop (line 301) | def random_crop(self, results, offsets=None):
    method __call__ (line 364) | def __call__(self, results):
    method check_match (line 391) | def check_match(self, ref_results, results):
  class SeqPhotoMetricDistortion (line 400) | class SeqPhotoMetricDistortion(object):
    method __init__ (line 419) | def __init__(self,
    method get_params (line 431) | def get_params(self):
    method photo_metric_distortion (line 467) | def photo_metric_distortion(self, results, params=None):
    method __call__ (line 524) | def __call__(self, results):
    method __repr__ (line 543) | def __repr__(self):
  class ResizeWithRef (line 555) | class ResizeWithRef(object):
    method __init__ (line 580) | def __init__(self,
    method random_select (line 606) | def random_select(img_scales):
    method random_sample (line 613) | def random_sample(img_scales):
    method random_sample_ratio (line 627) | def random_sample_ratio(img_scale, ratio_range):
    method _random_scale (line 635) | def _random_scale(self, results):
    method _resize_img (line 651) | def _resize_img(self, results):
    method _resize_bboxes (line 668) | def _resize_bboxes(self, results):
    method _resize_masks (line 680) | def _resize_masks(self, results):
    method __call__ (line 703) | def __call__(self, results):
    method __repr__ (line 712) | def __repr__(self):
  class RandomFlipWithRef (line 723) | class RandomFlipWithRef(object):
    method __init__ (line 734) | def __init__(self, flip_ratio=None):
    method bbox_flip (line 739) | def bbox_flip(self, bboxes, img_shape):
    method __call__ (line 753) | def __call__(self, results):
    method __repr__ (line 776) | def __repr__(self):
  class PadWithRef (line 782) | class PadWithRef(object):
    method __init__ (line 794) | def __init__(self, size=None, size_divisor=None, pad_val=0):
    method _pad_img (line 802) | def _pad_img(self, results):
    method _pad_masks (line 815) | def _pad_masks(self, results):
    method __call__ (line 826) | def __call__(self, results):
    method __repr__ (line 831) | def __repr__(self):
  class NormalizeWithRef (line 839) | class NormalizeWithRef(object):
    method __init__ (line 849) | def __init__(self, mean, std, to_rgb=True):
    method __call__ (line 854) | def __call__(self, results):
    method __repr__ (line 864) | def __repr__(self):
  class RandomCropWithRef (line 872) | class RandomCropWithRef(object):
    method __init__ (line 879) | def __init__(self, crop_size):
    method __call__ (line 882) | def __call__(self, results):
    method __repr__ (line 948) | def __repr__(self):
  class PadFutureMMDet (line 954) | class PadFutureMMDet:
    method __init__ (line 968) | def __init__(self,
    method _pad_img (line 994) | def _pad_img(self, results):
    method _pad_masks (line 1012) | def _pad_masks(self, results):
    method _pad_seg (line 1019) | def _pad_seg(self, results):
    method __call__ (line 1027) | def __call__(self, results):
    method __repr__ (line 1039) | def __repr__(self):
  class KNetInsAdapter (line 1049) | class KNetInsAdapter:
    method __init__ (line 1054) | def __init__(self, stuff_nums=11):
    method __call__ (line 1057) | def __call__(self, results):
  class KNetInsAdapterCherryPick (line 1069) | class KNetInsAdapterCherryPick:
    method __init__ (line 1074) | def __init__(self, stuff_nums=11, cherry=(11, 13)):
    method __call__ (line 1078) | def __call__(self, results):

FILE: external/evalhooks.py
  class EvalHook (line 15) | class EvalHook(Hook):
    method __init__ (line 50) | def __init__(self,
    method _init_rule (line 82) | def _init_rule(self, rule, key_indicator):
    method before_run (line 109) | def before_run(self, runner):
    method before_train_epoch (line 116) | def before_train_epoch(self, runner):
    method evaluation_flag (line 124) | def evaluation_flag(self, runner):
    method after_train_epoch (line 143) | def after_train_epoch(self, runner):
    method after_train_iter (line 151) | def after_train_iter(self, runner):
    method save_best_checkpoint (line 159) | def save_best_checkpoint(self, runner, key_score):
    method evaluate (line 174) | def evaluate(self, runner, results):
  class DistEvalHook (line 189) | class DistEvalHook(EvalHook):
    method __init__ (line 223) | def __init__(self,
    method _broadcast_bn_buffer (line 246) | def _broadcast_bn_buffer(self, runner):
    method after_train_epoch (line 260) | def after_train_epoch(self, runner):
    method after_train_iter (line 281) | def after_train_iter(self, runner):

FILE: external/ext/mask.py
  function encode (line 80) | def encode(bimask):
  function decode (line 87) | def decode(rleObjs):
  function area (line 93) | def area(rleObjs):
  function toBbox (line 99) | def toBbox(rleObjs):

FILE: external/ext/ytvos.py
  function _isArrayLike (line 37) | def _isArrayLike(obj):
  class YTVOS (line 41) | class YTVOS:
    method __init__ (line 42) | def __init__(self, annotation_file=None):
    method createIndex (line 61) | def createIndex(self):
    method info (line 92) | def info(self):
    method getAnnIds (line 100) | def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None):
    method getCatIds (line 128) | def getCatIds(self, catNms=[], supNms=[], catIds=[]):
    method getVidIds (line 150) | def     getVidIds(self, vidIds=[], catIds=[]):
    method loadAnns (line 171) | def loadAnns(self, ids=[]):
    method loadCats (line 182) | def loadCats(self, ids=[]):
    method loadVids (line 193) | def loadVids(self, ids=[]):
    method loadRes (line 205) | def loadRes(self, resFile):
    method annToRLE (line 255) | def annToRLE(self, ann, frameId):
    method annToMask (line 276) | def annToMask(self, ann, frameId):

FILE: external/fcn_mask_head.py
  class InstanceMaskHead (line 14) | class InstanceMaskHead(FCNMaskHead):
    method __init__ (line 16) | def __init__(self, **kwargs):
    method get_seg_masks (line 19) | def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,

FILE: external/kitti_step_dvps.py
  class SeqObj (line 19) | class SeqObj:
    method __init__ (line 23) | def __init__(self, the_dict: Dict):
    method __hash__ (line 27) | def __hash__(self):
    method __eq__ (line 30) | def __eq__(self, other):
    method __getitem__ (line 33) | def __getitem__(self, attr):
  class KITTISTEPDVPSDataset (line 38) | class KITTISTEPDVPSDataset:
    method __init__ (line 44) | def __init__(self,
    method pre_pipelines (line 124) | def pre_pipelines(self, results):
    method prepare_train_img (line 133) | def prepare_train_img(self, idx):
    method prepare_test_img (line 147) | def prepare_test_img(self, idx):
    method _rand_another (line 155) | def _rand_another(self, idx):
    method __getitem__ (line 161) | def __getitem__(self, idx):
    method __len__ (line 182) | def __len__(self):
    method _set_groups (line 186) | def _set_groups(self):
    method evaluate (line 190) | def evaluate(
  function vpq_eval (line 321) | def vpq_eval(element):

FILE: external/panoptic_fpn.py
  class PanopticFPN (line 6) | class PanopticFPN(TwoStageDetector):
    method __init__ (line 9) | def __init__(self,
    method with_semantic (line 27) | def with_semantic(self):

FILE: external/panoptic_head.py
  class PanopticTestMixin (line 7) | class PanopticTestMixin(object):
    method simple_test_semantic (line 9) | def simple_test_semantic(self, x, img_metas):
    method generate_panoptic (line 20) | def generate_panoptic(self, det_bboxes, det_labels, mask_preds, sem_seg,
  class PanopticHead (line 31) | class PanopticHead(StandardRoIHead, PanopticTestMixin):
    method __init__ (line 34) | def __init__(self, *args, semantic_head, **kwargs):
    method with_semantic (line 39) | def with_semantic(self):
    method init_weights (line 46) | def init_weights(self, pretrained):
    method forward_train (line 57) | def forward_train(self,
    method async_simple_test (line 131) | async def async_simple_test(self,
    method simple_test (line 140) | def simple_test(self,
  function mask2result (line 182) | def mask2result(mask_preds, labels, num_classes):
  function merge_stuff_thing (line 196) | def merge_stuff_thing(det_bboxes,

FILE: external/semantic_seg_head.py
  class SemanticHead (line 10) | class SemanticHead(FusedSemanticHead):
    method __init__ (line 29) | def __init__(self,
    method init_weights (line 54) | def init_weights(self):
    method forward (line 58) | def forward(self, feats):
    method loss (line 64) | def loss(self, mask_pred, labels):
    method get_semantic_seg (line 80) | def get_semantic_seg(self, seg_preds, ori_shape, img_shape_withoutpad):

FILE: external/semkitti_dvps.py
  class SeqObj (line 15) | class SeqObj:
    method __init__ (line 19) | def __init__(self, the_dict: Dict):
    method __hash__ (line 23) | def __hash__(self):
    method __eq__ (line 26) | def __eq__(self, other):
    method __getitem__ (line 29) | def __getitem__(self, attr):
  class KITTIDVPSDataset (line 34) | class KITTIDVPSDataset:
    method __init__ (line 39) | def __init__(self,
    method pre_pipelines (line 121) | def pre_pipelines(self, results):
    method prepare_train_img (line 130) | def prepare_train_img(self, idx):
    method prepare_test_img (line 144) | def prepare_test_img(self, idx):
    method _rand_another (line 152) | def _rand_another(self, idx):
    method __getitem__ (line 158) | def __getitem__(self, idx):
    method __len__ (line 179) | def __len__(self):
    method _set_groups (line 183) | def _set_groups(self):
    method evaluate (line 187) | def evaluate(
  function vpq_eval (line 299) | def vpq_eval(element):

FILE: external/test.py
  function single_gpu_test (line 13) | def single_gpu_test(model,
  function multi_gpu_test (line 78) | def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):

FILE: external/train.py
  function train_detector (line 16) | def train_detector(model,

FILE: external/utils.py
  function encode_panoptic (line 7) | def encode_panoptic(panoptic_results):

FILE: external/vipseg_dvps.py
  function vip2hb (line 282) | def vip2hb(pan_map):
  class SeqObj (line 303) | class SeqObj:
    method __init__ (line 307) | def __init__(self, the_dict: Dict):
    method __hash__ (line 311) | def __hash__(self):
    method __eq__ (line 314) | def __eq__(self, other):
    method __getitem__ (line 317) | def __getitem__(self, attr):
  class VIPSegDVPSDataset (line 322) | class VIPSegDVPSDataset:
    method __init__ (line 327) | def __init__(self,
    method pre_pipelines (line 427) | def pre_pipelines(self, results):
    method prepare_train_img (line 438) | def prepare_train_img(self, idx):
    method prepare_test_img (line 452) | def prepare_test_img(self, idx):
    method _rand_another (line 460) | def _rand_another(self, idx):
    method __getitem__ (line 466) | def __getitem__(self, idx):
    method __len__ (line 487) | def __len__(self):
    method _set_groups (line 491) | def _set_groups(self):
    method evaluate (line 495) | def evaluate(

FILE: knet/cross_entropy_loss.py
  function cross_entropy (line 8) | def cross_entropy(pred,
  function _expand_onehot_labels (line 45) | def _expand_onehot_labels(labels, label_weights, label_channels):
  function binary_cross_entropy (line 61) | def binary_cross_entropy(pred,
  function mask_cross_entropy (line 95) | def mask_cross_entropy(pred,
  class CrossEntropyLoss (line 140) | class CrossEntropyLoss(nn.Module):
    method __init__ (line 142) | def __init__(self,
    method forward (line 175) | def forward(self,

FILE: knet/det/dice_loss.py
  function dice_loss (line 9) | def dice_loss(input, target, eps=1e-3, numerator_eps=0):

FILE: knet/det/kernel_head.py
  class ConvKernelHead (line 12) | class ConvKernelHead(nn.Module):
    method __init__ (line 14) | def __init__(self,
    method _init_layers (line 122) | def _init_layers(self):
    method init_weights (line 169) | def init_weights(self):
    method _decode_init_proposals (line 196) | def _decode_init_proposals(self, img, img_metas):
    method forward_train (line 267) | def forward_train(self,
    method loss (line 337) | def loss(self,
    method _get_target_single (line 430) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
    method get_targets (line 469) | def get_targets(self,
    method simple_test_rpn (line 506) | def simple_test_rpn(self, img, img_metas):
    method forward_dummy (line 510) | def forward_dummy(self, img, img_metas):

FILE: knet/det/kernel_iter_head.py
  class KernelIterHead (line 12) | class KernelIterHead(BaseRoIHead):
    method __init__ (line 14) | def __init__(self,
    method init_bbox_head (line 76) | def init_bbox_head(self, mask_roi_extractor, mask_head):
    method init_assigner_sampler (line 85) | def init_assigner_sampler(self):
    method init_weights (line 97) | def init_weights(self):
    method init_mask_head (line 101) | def init_mask_head(self, mask_roi_extractor, mask_head):
    method _mask_forward (line 118) | def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas):
    method forward_train (line 139) | def forward_train(self,
    method simple_test (line 233) | def simple_test(self,
    method simple_test_mask_preds (line 285) | def simple_test_mask_preds(self,
    method aug_test (line 314) | def aug_test(self, features, proposal_list, img_metas, rescale=False):
    method forward_dummy (line 317) | def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
    method get_panoptic (line 332) | def get_panoptic(self, cls_scores, mask_preds, test_cfg, img_meta):
    method split_thing_stuff (line 372) | def split_thing_stuff(self, mask_preds, det_labels, cls_scores):
    method merge_stuff_thing (line 386) | def merge_stuff_thing(self,
    method merge_stuff_thing_stuff_joint (line 467) | def merge_stuff_thing_stuff_joint(self,

FILE: knet/det/kernel_update_head.py
  class KernelUpdateHead (line 17) | class KernelUpdateHead(nn.Module):
    method __init__ (line 19) | def __init__(self,
    method init_weights (line 151) | def init_weights(self):
    method forward (line 170) | def forward(self,
    method loss (line 280) | def loss(self,
    method _get_target_single (line 351) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
    method get_targets (line 406) | def get_targets(self,
    method rescale_masks (line 443) | def rescale_masks(self, masks_per_img, img_meta):
    method get_seg_masks (line 460) | def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
    method segm2result (line 469) | def segm2result(self, mask_preds, det_labels, cls_scores):

FILE: knet/det/knet.py
  class KNet (line 10) | class KNet(TwoStageDetector):
    method __init__ (line 12) | def __init__(self,
    method forward_train (line 32) | def forward_train(self,
    method simple_test (line 161) | def simple_test(self, img, img_metas, rescale=False):
    method forward_dummy (line 192) | def forward_dummy(self, img):

FILE: knet/det/mask_hungarian_assigner.py
  class DiceCost (line 15) | class DiceCost(object):
    method __init__ (line 34) | def __init__(self,
    method dice_loss (line 44) | def dice_loss(cls, input, target, eps=1e-3):
    method __call__ (line 56) | def __call__(self, mask_preds, gt_masks):
  class MaskCost (line 78) | class MaskCost(object):
    method __init__ (line 85) | def __init__(self, weight=1., pred_act=False, act_mode='sigmoid'):
    method __call__ (line 90) | def __call__(self, cls_pred, target):
  class MaskHungarianAssigner (line 118) | class MaskHungarianAssigner(BaseAssigner):
    method __init__ (line 146) | def __init__(self,
    method assign (line 161) | def assign(self,

FILE: knet/det/mask_pseudo_sampler.py
  class MaskSamplingResult (line 7) | class MaskSamplingResult(SamplingResult):
    method __init__ (line 26) | def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result,
    method masks (line 55) | def masks(self):
    method __nice__ (line 59) | def __nice__(self):
    method info (line 68) | def info(self):
  class MaskSamplingResultWithScore (line 81) | class MaskSamplingResultWithScore(SamplingResult):
    method __init__ (line 100) | def __init__(self, pos_inds, neg_inds, masks, scores, gt_masks, assign...
    method masks (line 133) | def masks(self):
    method __nice__ (line 137) | def __nice__(self):
    method info (line 146) | def info(self):
  class MaskPseudoSampler (line 159) | class MaskPseudoSampler(BaseSampler):
    method __init__ (line 162) | def __init__(self, **kwargs):
    method _sample_pos (line 165) | def _sample_pos(self, **kwargs):
    method _sample_neg (line 169) | def _sample_neg(self, **kwargs):
    method sample (line 173) | def sample(self, assign_result, masks, gt_masks, **kwargs):
  class MaskScorePseudoSampler (line 195) | class MaskScorePseudoSampler(BaseSampler):
    method __init__ (line 198) | def __init__(self, **kwargs):
    method _sample_pos (line 201) | def _sample_pos(self, **kwargs):
    method _sample_neg (line 205) | def _sample_neg(self, **kwargs):
    method sample (line 209) | def sample(self, assign_result, masks, score, gt_masks, **kwargs):

FILE: knet/det/msdeformattn_decoder.py
  class MSDeformAttnPixelDecoder (line 18) | class MSDeformAttnPixelDecoder(BaseModule):
    method __init__ (line 42) | def __init__(self,
    method init_weights (line 139) | def init_weights(self):
    method forward (line 165) | def forward(self, feats):

FILE: knet/det/semantic_fpn_wrapper.py
  class SemanticFPNWrapper (line 17) | class SemanticFPNWrapper(nn.Module):
    method __init__ (line 33) | def __init__(self,
    method init_weights (line 180) | def init_weights(self):
    method generate_coord (line 187) | def generate_coord(self, input_feat):
    method forward (line 198) | def forward(self, inputs):
  class UperNetAlignHead (line 239) | class UperNetAlignHead(BaseModule):
    method __init__ (line 241) | def __init__(self, in_channels=[256, 512, 1024, 2048], out_channels=25...
    method forward (line 287) | def forward(self, conv_out):
  class AlignedModule (line 321) | class AlignedModule(nn.Module):
    method __init__ (line 323) | def __init__(self, inplane, outplane, kernel_size=3):
    method forward (line 329) | def forward(self, x):
    method flow_warp (line 342) | def flow_warp(self, input, flow, size):
  class AlignedModulev2PoolingAtten (line 357) | class AlignedModulev2PoolingAtten(nn.Module):
    method __init__ (line 359) | def __init__(self, inplane, outplane, kernel_size=3):
    method forward (line 369) | def forward(self, x):
    method flow_warp (line 395) | def flow_warp(self, input, flow, size):
  class STDCNet1446 (line 413) | class STDCNet1446(nn.Module):
    method __init__ (line 414) | def __init__(self, base=64, layers=[4, 5, 3], block_num=4, type="cat",...
    method init_weight (line 453) | def init_weight(self, pretrain_model):
    method init_params (line 461) | def init_params(self):
    method _make_layers (line 475) | def _make_layers(self, base, layers, block_num, block, norm_layer):
    method forward (line 493) | def forward(self, x):
  class STDCNet813 (line 506) | class STDCNet813(nn.Module):
    method __init__ (line 507) | def __init__(self, base=64, layers=[2, 2, 2], block_num=4, type="cat",...
    method init_weight (line 546) | def init_weight(self, pretrain_model):
    method init_params (line 554) | def init_params(self):
    method _make_layers (line 568) | def _make_layers(self, base, layers, block_num, block, norm_layer):
    method forward (line 586) | def forward(self, x):
  class AddBottleneck (line 600) | class AddBottleneck(nn.Module):
    method __init__ (line 601) | def __init__(self, in_planes, out_planes, block_num=3, stride=1, norm_...
    method forward (line 633) | def forward(self, x):
  class CatBottleneck (line 650) | class CatBottleneck(nn.Module):
    method __init__ (line 651) | def __init__(self, in_planes, out_planes, block_num=3, stride=1, norm_...
    method forward (line 678) | def forward(self, x):
  class ConvX (line 700) | class ConvX(nn.Module):
    method __init__ (line 701) | def __init__(self, in_planes, out_planes, kernel=3, stride=1, norm_lay...
    method forward (line 707) | def forward(self, x):

FILE: knet/det/utils.py
  function sem2ins_masks (line 8) | def sem2ins_masks(gt_sem_seg,
  function sem2ins_masks_cityscapes (line 34) | def sem2ins_masks_cityscapes(gt_sem_seg,
  function sem2ins_masks_kitti_step (line 63) | def sem2ins_masks_kitti_step(gt_sem_seg,

FILE: knet/kernel_updator.py
  class KernelUpdator (line 8) | class KernelUpdator(nn.Module):
    method __init__ (line 10) | def __init__(self,
    method forward (line 56) | def forward(self, update_feature, input_feature):

FILE: knet/video/dice_loss.py
  function dice_loss (line 8) | def dice_loss(input, target, eps=1e-3, numerator_eps=0):

FILE: knet/video/kernel_head.py
  class VideoConvKernelHead (line 12) | class VideoConvKernelHead(nn.Module):
    method __init__ (line 16) | def __init__(self,
    method _init_layers (line 126) | def _init_layers(self):
    method init_weights (line 172) | def init_weights(self):
    method _decode_init_proposals (line 199) | def _decode_init_proposals(self, img, img_metas,
    method forward_train (line 272) | def forward_train(self,
    method loss (line 345) | def loss(self,
    method _get_target_single (line 436) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
    method get_targets (line 475) | def get_targets(self,
    method simple_test_rpn (line 512) | def simple_test_rpn(self, img, img_metas,
    method forward_dummy (line 517) | def forward_dummy(self, img, img_metas):

FILE: knet/video/kernel_iter_head.py
  class VideoKernelIterHead (line 11) | class VideoKernelIterHead(BaseRoIHead):
    method __init__ (line 13) | def __init__(self,
    method init_bbox_head (line 76) | def init_bbox_head(self, mask_roi_extractor, mask_head):
    method init_assigner_sampler (line 85) | def init_assigner_sampler(self):
    method init_weights (line 97) | def init_weights(self):
    method init_mask_head (line 101) | def init_mask_head(self, mask_roi_extractor, mask_head):
    method _mask_forward (line 118) | def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas,
    method forward_train (line 150) | def forward_train(self,
    method forward_train_with_previous (line 255) | def forward_train_with_previous(self,
    method simple_test (line 378) | def simple_test(self,
    method simple_test_with_previous (line 435) | def simple_test_with_previous(self,
    method simple_test_mask_preds (line 508) | def simple_test_mask_preds(self,
    method simple_test_mask_preds_plus_previous (line 531) | def simple_test_mask_preds_plus_previous(
    method get_masked_feature (line 566) | def get_masked_feature(self, x, mask_pred):
    method aug_test (line 573) | def aug_test(self, features, proposal_list, img_metas, rescale=False):
    method forward_dummy (line 576) | def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
    method get_panoptic (line 591) | def get_panoptic(self, cls_scores, mask_preds, test_cfg, img_meta, obj...
    method split_thing_stuff (line 642) | def split_thing_stuff(self, mask_preds, det_labels, cls_scores):
    method merge_stuff_thing_thing_first (line 656) | def merge_stuff_thing_thing_first(self,
    method merge_stuff_thing_stuff_first (line 743) | def merge_stuff_thing_stuff_first(self,
    method merge_stuff_thing_stuff_joint (line 832) | def merge_stuff_thing_stuff_joint(self,

FILE: knet/video/kernel_update_head.py
  class VideoKernelUpdateHead (line 18) | class VideoKernelUpdateHead(nn.Module):
    method __init__ (line 20) | def __init__(self,
    method init_weights (line 262) | def init_weights(self):
    method forward (line 281) | def forward(self,
    method loss (line 544) | def loss(self,
    method _get_target_single (line 615) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
    method get_targets (line 670) | def get_targets(self,
    method rescale_masks (line 708) | def rescale_masks(self, masks_per_img, img_meta):
    method get_seg_masks (line 725) | def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
    method segm2result (line 734) | def segm2result(self, mask_preds, det_labels, cls_scores):

FILE: knet/video/knet.py
  class VideoKNet (line 10) | class VideoKNet(TwoStageDetector):
    method __init__ (line 12) | def __init__(self,
    method forward_train (line 31) | def forward_train(self,
    method simple_test (line 182) | def simple_test(self, img, img_metas, rescale=False):
    method forward_dummy (line 210) | def forward_dummy(self, img):
    method extract_feat (line 230) | def extract_feat(self, img):

FILE: knet/video/knet_quansi_dense.py
  class VideoKNetQuansiTrack (line 15) | class VideoKNetQuansiTrack(BaseDetector):
    method __init__ (line 19) | def __init__(self,
    method init_tracker (line 113) | def init_tracker(self):
    method _freeze_detector (line 116) | def _freeze_detector(self):
    method init_track_assigner_sampler (line 126) | def init_track_assigner_sampler(self):
    method preprocess_gt_masks (line 137) | def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_seman...
    method forward_train (line 200) | def forward_train(self,
    method simple_test (line 402) | def simple_test(self, img, img_metas, rescale=False, ref_img=None, **k...
    method _track_forward (line 511) | def _track_forward(self, x, mask_pred):
    method forward_dummy (line 528) | def forward_dummy(self, img):
    method extract_feat (line 547) | def extract_feat(self, img):
    method with_rpn (line 555) | def with_rpn(self):
    method with_roi_head (line 560) | def with_roi_head(self):
    method aug_test (line 564) | def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    method get_things_id_for_tracking (line 572) | def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
    method pack_things_object (line 586) | def pack_things_object(self, object_feats, ref_object_feats):
    method pack_things_masks (line 592) | def pack_things_masks(self, mask_pred, ref_mask_pred):
    method get_semantic_seg (line 597) | def get_semantic_seg(self, panoptic_seg, segments_info):
    method generate_track_id_maps (line 625) | def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
  function log_masks_for_inference (line 642) | def log_masks_for_inference(masks_preds, names, output_dirs="work_dirs/v...

FILE: knet/video/knet_quansi_dense_embed_fc.py
  class VideoKNetQuansiEmbedFC (line 18) | class VideoKNetQuansiEmbedFC(BaseDetector):
    method __init__ (line 23) | def __init__(self,
    method init_tracker (line 126) | def init_tracker(self):
    method _freeze_detector (line 129) | def _freeze_detector(self):
    method init_track_assigner_sampler (line 139) | def init_track_assigner_sampler(self):
    method preprocess_gt_masks (line 150) | def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_seman...
    method forward_train (line 222) | def forward_train(self,
    method simple_test (line 468) | def simple_test(self, img, img_metas, rescale=False, ref_img=None, **k...
    method _track_forward (line 607) | def _track_forward(self, track_feats, x=None, mask_pred=None):
    method forward_dummy (line 625) | def forward_dummy(self, img):
    method extract_feat (line 644) | def extract_feat(self, img):
    method with_rpn (line 652) | def with_rpn(self):
    method with_roi_head (line 657) | def with_roi_head(self):
    method aug_test (line 661) | def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    method get_things_id_for_tracking (line 669) | def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
    method pack_things_object (line 683) | def pack_things_object(self, object_feats, ref_object_feats):
    method pack_things_masks (line 690) | def pack_things_masks(self, mask_pred, ref_mask_pred):
    method get_semantic_seg (line 696) | def get_semantic_seg(self, panoptic_seg, segments_info):
    method generate_track_id_maps (line 724) | def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):

FILE: knet/video/knet_quansi_dense_embed_fc_joint_train.py
  class VideoKNetQuansiEmbedFCJointTrain (line 18) | class VideoKNetQuansiEmbedFCJointTrain(BaseDetector):
    method __init__ (line 22) | def __init__(self,
    method init_tracker (line 128) | def init_tracker(self):
    method _freeze_detector (line 131) | def _freeze_detector(self):
    method init_track_assigner_sampler (line 141) | def init_track_assigner_sampler(self):
    method preprocess_gt_masks (line 152) | def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_seman...
    method forward_train (line 225) | def forward_train(self,
    method simple_test (line 472) | def simple_test(self, img, img_metas, rescale=False, ref_img=None, **k...
    method _track_forward (line 614) | def _track_forward(self, track_feats, x=None, mask_pred=None):
    method forward_dummy (line 625) | def forward_dummy(self, img, img_metas=None):
    method extract_feat (line 648) | def extract_feat(self, img):
    method with_rpn (line 656) | def with_rpn(self):
    method with_roi_head (line 661) | def with_roi_head(self):
    method aug_test (line 665) | def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    method get_things_id_for_tracking (line 673) | def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
    method pack_things_object (line 687) | def pack_things_object(self, object_feats, ref_object_feats):
    method pack_things_masks (line 693) | def pack_things_masks(self, mask_pred, ref_mask_pred):
    method get_semantic_seg (line 698) | def get_semantic_seg(self, panoptic_seg, segments_info):
    method generate_track_id_maps (line 724) | def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
    method add_ref_loss (line 738) | def add_ref_loss(self, loss_dict):
    method add_ref_rpn_loss (line 744) | def add_ref_rpn_loss(self, loss_dict):

FILE: knet/video/knet_quansi_dense_embed_fc_toy_exp.py
  class VideoKNetQuansiEmbedFCToy (line 15) | class VideoKNetQuansiEmbedFCToy(BaseDetector):
    method __init__ (line 19) | def __init__(self,
    method init_tracker (line 127) | def init_tracker(self):
    method _freeze_detector (line 130) | def _freeze_detector(self):
    method init_track_assigner_sampler (line 140) | def init_track_assigner_sampler(self):
    method preprocess_gt_masks (line 151) | def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_seman...
    method forward_train (line 223) | def forward_train(self,
    method simple_test (line 467) | def simple_test(self, img, img_metas, rescale=False, ref_img=None, **k...
    method _track_forward (line 587) | def _track_forward(self, track_feats, x=None, mask_pred=None):
    method forward_dummy (line 605) | def forward_dummy(self, img):
    method extract_feat (line 624) | def extract_feat(self, img):
    method with_rpn (line 632) | def with_rpn(self):
    method with_roi_head (line 637) | def with_roi_head(self):
    method aug_test (line 641) | def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    method get_things_id_for_tracking (line 649) | def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
    method pack_things_object (line 663) | def pack_things_object(self, object_feats, ref_object_feats):
    method pack_things_masks (line 669) | def pack_things_masks(self, mask_pred, ref_mask_pred):
    method get_semantic_seg (line 674) | def get_semantic_seg(self, panoptic_seg, segments_info):
    method generate_track_id_maps (line 702) | def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):

FILE: knet/video/knet_quansi_dense_roi_gt_box.py
  class VideoKNetQuansiTrackROIGTBox (line 16) | class VideoKNetQuansiTrackROIGTBox(BaseDetector):
    method __init__ (line 20) | def __init__(self,
    method init_tracker (line 111) | def init_tracker(self):
    method _freeze_detector (line 114) | def _freeze_detector(self):
    method init_track_assigner_sampler (line 124) | def init_track_assigner_sampler(self):
    method preprocess_gt_masks (line 135) | def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_seman...
    method forward_train (line 207) | def forward_train(self,
    method simple_test (line 436) | def simple_test(self, img, img_metas, rescale=False, ref_img=None, **k...
    method _track_forward (line 567) | def _track_forward(self, x, mask_pred):
    method forward_dummy (line 580) | def forward_dummy(self, img):
    method extract_feat (line 599) | def extract_feat(self, img):
    method with_rpn (line 607) | def with_rpn(self):
    method with_roi_head (line 612) | def with_roi_head(self):
    method aug_test (line 616) | def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    method get_things_id_for_tracking (line 624) | def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
    method pack_things_object (line 639) | def pack_things_object(self, object_feats, ref_object_feats):
    method pack_things_masks (line 645) | def pack_things_masks(self, mask_pred, ref_mask_pred):
    method get_semantic_seg (line 650) | def get_semantic_seg(self, panoptic_seg, segments_info):
    method generate_track_id_maps (line 678) | def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):

FILE: knet/video/knet_quansi_dense_roi_gt_box_joint_train.py
  class VideoKNetQuansiTrackROIGTBoxJointTrain (line 17) | class VideoKNetQuansiTrackROIGTBoxJointTrain(BaseDetector):
    method __init__ (line 21) | def __init__(self,
    method init_tracker (line 112) | def init_tracker(self):
    method _freeze_detector (line 115) | def _freeze_detector(self):
    method init_track_assigner_sampler (line 125) | def init_track_assigner_sampler(self):
    method preprocess_gt_masks (line 136) | def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_seman...
    method forward_train (line 205) | def forward_train(self,
    method simple_test (line 450) | def simple_test(self, img, img_metas, rescale=False, ref_img=None, **k...
    method _track_forward (line 575) | def _track_forward(self, x, mask_pred):
    method forward_dummy (line 588) | def forward_dummy(self, img):
    method extract_feat (line 607) | def extract_feat(self, img):
    method with_rpn (line 615) | def with_rpn(self):
    method with_roi_head (line 620) | def with_roi_head(self):
    method aug_test (line 624) | def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    method get_things_id_for_tracking (line 632) | def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
    method pack_things_object (line 647) | def pack_things_object(self, object_feats, ref_object_feats):
    method pack_things_masks (line 653) | def pack_things_masks(self, mask_pred, ref_mask_pred):
    method get_semantic_seg (line 658) | def get_semantic_seg(self, panoptic_seg, segments_info):
    method generate_track_id_maps (line 686) | def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
    method add_ref_loss (line 697) | def add_ref_loss(self, loss_dict):
    method add_ref_rpn_loss (line 703) | def add_ref_rpn_loss(self, loss_dict):

FILE: knet/video/knet_track_head.py
  class VideoKNetFuseTrack (line 12) | class VideoKNetFuseTrack(BaseDetector):
    method __init__ (line 16) | def __init__(self,
    method preprocess_gt_masks (line 76) | def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_seman...
    method forward_train (line 139) | def forward_train(self,
    method simple_test (line 312) | def simple_test(self, img, img_metas, rescale=False, ref_img=None):
    method forward_dummy (line 394) | def forward_dummy(self, img):
    method extract_feat (line 413) | def extract_feat(self, img):
    method with_rpn (line 421) | def with_rpn(self):
    method with_roi_head (line 426) | def with_roi_head(self):
    method aug_test (line 430) | def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    method pack_things_object (line 438) | def pack_things_object(self, object_feats, ref_object_feats):
    method add_track_loss (line 444) | def add_track_loss(self, loss_dict):
    method add_ref_rpn_loss (line 450) | def add_ref_rpn_loss(self, loss_dict):
    method pack_stuff_things_result (line 456) | def pack_stuff_things_result(self, panoptic_seg, segments_info):
    method generate_track_id_maps (line 478) | def generate_track_id_maps(self, track_results, panopitc_seg_maps):

FILE: knet/video/knet_track_head_roi_align.py
  class VideoKNetFuseROITrack (line 12) | class VideoKNetFuseROITrack(BaseDetector):
    method __init__ (line 16) | def __init__(self,
    method preprocess_gt_masks (line 76) | def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_seman...
    method forward_train (line 139) | def forward_train(self,
    method simple_test (line 313) | def simple_test(self, img, img_metas, rescale=False, ref_img=None):
    method forward_dummy (line 395) | def forward_dummy(self, img):
    method extract_feat (line 414) | def extract_feat(self, img):
    method with_rpn (line 422) | def with_rpn(self):
    method with_roi_head (line 427) | def with_roi_head(self):
    method aug_test (line 431) | def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    method pack_things_object (line 439) | def pack_things_object(self, object_feats, ref_object_feats):
    method pack_things_masks (line 445) | def pack_things_masks(self, mask_pred, ref_mask_pred):
    method add_track_loss (line 450) | def add_track_loss(self, loss_dict):
    method add_ref_rpn_loss (line 456) | def add_ref_rpn_loss(self, loss_dict):
    method pack_stuff_things_result (line 462) | def pack_stuff_things_result(self, panoptic_seg, segments_info):
    method generate_track_id_maps (line 484) | def generate_track_id_maps(self, track_results, panopitc_seg_maps):

FILE: knet/video/knet_uni_track.py
  class VideoKNetUniTrack (line 13) | class VideoKNetUniTrack(BaseDetector):
    method __init__ (line 14) | def __init__(self,
    method preprocess_gt_masks (line 72) | def preprocess_gt_masks(self, img_metas, gt_masks, gt_labels, gt_seman...
    method forward_train (line 135) | def forward_train(self,
    method simple_test (line 282) | def simple_test(self, img, img_metas, rescale=False, ref_img=None):
    method forward_dummy (line 348) | def forward_dummy(self, img):
    method extract_feat (line 368) | def extract_feat(self, img):
    method with_rpn (line 376) | def with_rpn(self):
    method with_roi_head (line 381) | def with_roi_head(self):
    method aug_test (line 385) | def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
    method add_track_loss (line 393) | def add_track_loss(self, loss_dict):
    method add_ref_rpn_loss (line 399) | def add_ref_rpn_loss(self, loss_dict):
    method pack_stuff_things_result (line 405) | def pack_stuff_things_result(self, panoptic_seg, segments_info):
    method generate_track_id_maps (line 427) | def generate_track_id_maps(self, track_results, panopitc_seg_maps):
    method get_semantic_seg (line 436) | def get_semantic_seg(self, panoptic_seg, segments_info):

FILE: knet/video/mask_hungarian_assigner.py
  class DiceCost (line 14) | class DiceCost(object):
    method __init__ (line 33) | def __init__(self,
    method dice_loss (line 43) | def dice_loss(cls, input, target, eps=1e-3):
    method __call__ (line 55) | def __call__(self, mask_preds, gt_masks):
  class MaskCost (line 76) | class MaskCost(object):
    method __init__ (line 83) | def __init__(self, weight=1., pred_act=False, act_mode='sigmoid'):
    method __call__ (line 88) | def __call__(self, cls_pred, target):
  class MaskHungarianAssigner (line 116) | class MaskHungarianAssigner(BaseAssigner):
    method __init__ (line 144) | def __init__(self,
    method assign (line 159) | def assign(self,
  class MaskHungarianAssignerWithEmbed (line 274) | class MaskHungarianAssignerWithEmbed(BaseAssigner):
    method __init__ (line 302) | def __init__(self,
    method assign (line 317) | def assign(self,

FILE: knet/video/mask_pseudo_sampler.py
  class MaskSamplingResult (line 7) | class MaskSamplingResult(SamplingResult):
    method __init__ (line 26) | def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result,
    method masks (line 50) | def masks(self):
    method __nice__ (line 54) | def __nice__(self):
    method info (line 63) | def info(self):

FILE: knet/video/qdtrack/builder.py
  function build_tracker (line 7) | def build_tracker(cfg):

FILE: knet/video/qdtrack/losses/l2_loss.py
  function l2_loss (line 8) | def l2_loss(pred, target):
  class L2Loss (line 24) | class L2Loss(nn.Module):
    method __init__ (line 33) | def __init__(self,
    method forward (line 48) | def forward(self,
    method update_weight (line 75) | def update_weight(self, pred, target, weight, avg_factor):
    method random_choice (line 113) | def random_choice(gallery, num):

FILE: knet/video/qdtrack/losses/multipos_cross_entropy_loss.py
  function multi_pos_cross_entropy (line 6) | def multi_pos_cross_entropy(pred,
  class MultiPosCrossEntropyLoss (line 44) | class MultiPosCrossEntropyLoss(nn.Module):
    method __init__ (line 46) | def __init__(self, reduction='mean', loss_weight=1.0):
    method forward (line 51) | def forward(self,

FILE: knet/video/qdtrack/track/similarity.py
  function cal_similarity (line 5) | def cal_similarity(key_embeds,

FILE: knet/video/qdtrack/track/transforms.py
  function track2result (line 5) | def track2result(bboxes, labels, ids, num_classes):
  function restore_result (line 24) | def restore_result(result, return_ids=False):

FILE: knet/video/qdtrack/trackers/quasi_dense_embed_tracker.py
  class QuasiDenseEmbedTracker (line 9) | class QuasiDenseEmbedTracker(object):
    method __init__ (line 11) | def __init__(self,
    method empty (line 44) | def empty(self):
    method update_memo (line 47) | def update_memo(self, ids, bboxes, embeds, labels, frame_id):
    method memo (line 105) | def memo(self):
    method match (line 137) | def match(self, bboxes, labels, track_feats, frame_id, asso_tau=-1):

FILE: knet/video/qdtrack/trackers/tao_tracker.py
  class TaoTracker (line 19) | class TaoTracker(object):
    method __init__ (line 21) | def __init__(self,
    method reset (line 49) | def reset(self):
    method valid_ids (line 57) | def valid_ids(self):
    method empty (line 64) | def empty(self):
    method update_memo (line 67) | def update_memo(self, ids, bboxes, labels, embeds, frame_id):
    method memo (line 99) | def memo(self):
    method init_tracklets (line 116) | def init_tracklets(self, ids, obj_scores):
    method match (line 126) | def match(self,
  function random_color (line 330) | def random_color(seed):
  function imshow_tracklets (line 337) | def imshow_tracklets(img,

FILE: knet/video/track_heads.py
  class QueryTrackHead (line 16) | class QueryTrackHead(nn.Module):
    method __init__ (line 24) | def __init__(self,
    method init_weights (line 54) | def init_weights(self):
    method compute_comp_scores (line 59) | def compute_comp_scores(self, match_ll, bbox_scores, bbox_ious, label_...
    method forward (line 77) | def forward(self, x, ref_x, x_n, ref_x_n):
    method loss (line 111) | def loss(self,
    method get_targets (line 141) | def get_targets(self,
    method _get_target_single (line 163) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask, p...
  class TrackHeadWithROIAlign (line 182) | class TrackHeadWithROIAlign(nn.Module):
    method __init__ (line 190) | def __init__(self,
    method init_weights (line 231) | def init_weights(self):
    method compute_comp_scores (line 236) | def compute_comp_scores(self, match_ll, bbox_scores, bbox_ious, label_...
    method forward (line 254) | def forward(self, x, ref_x, mask_pred, ref_mask_pred, x_n, ref_x_n):
    method loss (line 312) | def loss(self,
    method get_targets (line 341) | def get_targets(self,
    method _get_target_single (line 363) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask, p...
  class QuasiDenseMaskEmbedHead (line 382) | class QuasiDenseMaskEmbedHead(nn.Module):
    method __init__ (line 384) | def __init__(self,
    method _add_conv_fc_branch (line 425) | def _add_conv_fc_branch(self, num_convs, num_fcs, in_channels):
    method init_weights (line 453) | def init_weights(self):
    method forward (line 461) | def forward(self, x):
    method get_track_targets (line 473) | def get_track_targets(self, gt_match_indices, key_sampling_results,
    method match (line 492) | def match(self, key_embeds, ref_embeds, key_sampling_results,
    method loss (line 516) | def loss(self, dists, cos_dists, targets, weights):
    method random_choice (line 535) | def random_choice(gallery, num):
  class QuasiDenseMaskEmbedHeadGTMask (line 553) | class QuasiDenseMaskEmbedHeadGTMask(nn.Module):
    method __init__ (line 555) | def __init__(self,
    method _add_conv_fc_branch (line 596) | def _add_conv_fc_branch(self, num_convs, num_fcs, in_channels):
    method init_weights (line 624) | def init_weights(self):
    method forward (line 632) | def forward(self, x):
    method get_track_targets (line 644) | def get_track_targets(self, gt_match_indices, key_sampling_results,
    method match (line 663) | def match(self, key_embeds, ref_embeds, key_sampling_results,
    method loss (line 686) | def loss(self, dists, cos_dists, targets, weights):
    method random_choice (line 705) | def random_choice(gallery, num):

FILE: knet/video/tracker.py
  class SimpleMaskTracker (line 14) | class SimpleMaskTracker(object):
    method __init__ (line 15) | def __init__(self, score_thresh, max_age=32):
    method reset_all (line 24) | def reset_all(self):
    method init_track (line 30) | def init_track(self, results):
    method step (line 53) | def step(self, output_results, track_results):

FILE: knet/video/util.py
  function box_cxcywh_to_xyxy (line 9) | def box_cxcywh_to_xyxy(x):
  function box_xyxy_to_cxcywh (line 16) | def box_xyxy_to_cxcywh(x):
  function box_iou (line 24) | def box_iou(boxes1, boxes2):
  function generalized_box_iou (line 40) | def generalized_box_iou(boxes1, boxes2):
  function masks_to_boxes (line 64) | def masks_to_boxes(masks):

FILE: knet_vis/det/kernel_head.py
  class ConvKernelHead (line 12) | class ConvKernelHead(nn.Module):
    method __init__ (line 14) | def __init__(self,
    method _init_layers (line 122) | def _init_layers(self):
    method init_weights (line 173) | def init_weights(self):
    method _decode_init_proposals (line 200) | def _decode_init_proposals(self, img, img_metas):
    method forward_train (line 266) | def forward_train(self,
    method loss (line 336) | def loss(self,
    method _get_target_single (line 427) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
    method get_targets (line 466) | def get_targets(self,
    method simple_test_rpn (line 504) | def simple_test_rpn(self, img, img_metas):
    method forward_dummy (line 508) | def forward_dummy(self, img, img_metas):

FILE: knet_vis/det/kernel_iter_head.py
  class KernelIterHead (line 13) | class KernelIterHead(BaseRoIHead):
    method __init__ (line 15) | def __init__(self,
    method init_bbox_head (line 76) | def init_bbox_head(self, mask_roi_extractor, mask_head):
    method init_assigner_sampler (line 85) | def init_assigner_sampler(self):
    method init_weights (line 97) | def init_weights(self):
    method init_mask_head (line 101) | def init_mask_head(self, mask_roi_extractor, mask_head):
    method _mask_forward (line 118) | def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas):
    method forward_train (line 139) | def forward_train(self,
    method simple_test (line 231) | def simple_test(self,
    method aug_test (line 285) | def aug_test(self, features, proposal_list, img_metas, rescale=False):
    method forward_dummy (line 288) | def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
    method get_panoptic (line 303) | def get_panoptic(self, cls_scores, mask_preds, test_cfg, img_meta):
    method merge_stuff_thing (line 322) | def merge_stuff_thing(self,

FILE: knet_vis/det/kernel_update_head.py
  class KernelUpdateHead (line 20) | class KernelUpdateHead(nn.Module):
    method __init__ (line 22) | def __init__(self,
    method init_weights (line 154) | def init_weights(self):
    method forward (line 173) | def forward(self,
    method loss (line 281) | def loss(self,
    method _get_target_single (line 352) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
    method get_targets (line 407) | def get_targets(self,
    method rescale_masks (line 442) | def rescale_masks(self, masks_per_img, img_meta):
    method get_seg_masks (line 459) | def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
    method segm2result (line 468) | def segm2result(self, mask_preds, det_labels, cls_scores):
    method get_seg_masks_tracking (line 484) | def get_seg_masks_tracking(self, masks_per_img, labels_per_img, scores...

FILE: knet_vis/det/knet.py
  class KNet (line 11) | class KNet(TwoStageDetector):
    method __init__ (line 13) | def __init__(self,
    method forward_train (line 29) | def forward_train(self,
    method simple_test (line 118) | def simple_test(self, img, img_metas, rescale=False):
    method forward_dummy (line 133) | def forward_dummy(self, img):

FILE: knet_vis/det/mask_hungarian_assigner.py
  class DiceCost (line 15) | class DiceCost(object):
    method __init__ (line 34) | def __init__(self,
    method dice_loss (line 44) | def dice_loss(cls, input, target, eps=1e-3):
    method __call__ (line 56) | def __call__(self, mask_preds, gt_masks):
  class MaskCost (line 77) | class MaskCost(object):
    method __init__ (line 84) | def __init__(self, weight=1., pred_act=False, act_mode='sigmoid'):
    method __call__ (line 89) | def __call__(self, cls_pred, target):
  class MaskHungarianAssigner (line 114) | class MaskHungarianAssigner(BaseAssigner):
    method __init__ (line 142) | def __init__(self,
    method assign (line 157) | def assign(self,

FILE: knet_vis/det/mask_pseudo_sampler.py
  class MaskSamplingResult (line 7) | class MaskSamplingResult(SamplingResult):
    method __init__ (line 26) | def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result,
    method masks (line 50) | def masks(self):
    method __nice__ (line 54) | def __nice__(self):
    method info (line 63) | def info(self):
  class MaskPseudoSampler (line 77) | class MaskPseudoSampler(BaseSampler):
    method __init__ (line 80) | def __init__(self, **kwargs):
    method _sample_pos (line 83) | def _sample_pos(self, **kwargs):
    method _sample_neg (line 87) | def _sample_neg(self, **kwargs):
    method sample (line 91) | def sample(self, assign_result, masks, gt_masks, **kwargs):

FILE: knet_vis/det/semantic_fpn_wrapper.py
  class SemanticFPNWrapper (line 10) | class SemanticFPNWrapper(nn.Module):
    method __init__ (line 25) | def __init__(self,
    method init_weights (line 172) | def init_weights(self):
    method generate_coord (line 179) | def generate_coord(self, input_feat):
    method forward (line 190) | def forward(self, inputs):

FILE: knet_vis/det/utils.py
  function sem2ins_masks (line 4) | def sem2ins_masks(gt_sem_seg,

FILE: knet_vis/kernel_updator.py
  class KernelUpdator (line 8) | class KernelUpdator(nn.Module):
    method __init__ (line 10) | def __init__(self,
    method forward (line 56) | def forward(self, update_feature, input_feature):

FILE: knet_vis/tracker/kernel_frame_head.py
  class ConvKernelHeadVolume (line 12) | class ConvKernelHeadVolume(nn.Module):
    method __init__ (line 13) | def __init__(self,
    method _init_layers (line 121) | def _init_layers(self):
    method init_weights (line 172) | def init_weights(self):
    method _decode_init_proposals (line 199) | def _decode_init_proposals(self, img, img_metas, ref_img_metas):
    method forward_train (line 267) | def forward_train(self,
    method loss (line 349) | def loss(self,
    method _get_target_single (line 440) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
    method get_targets (line 479) | def get_targets(self,
    method simple_test_rpn (line 516) | def simple_test_rpn(self, img, img_metas, ref_img_metas):
    method forward_dummy (line 520) | def forward_dummy(self, img, img_metas, ref_img_metas):

FILE: knet_vis/tracker/kernel_frame_iter_head.py
  class KernelFrameIterHeadVideo (line 14) | class KernelFrameIterHeadVideo(BaseRoIHead):
    method __init__ (line 15) | def __init__(self,
    method init_mask_head (line 82) | def init_mask_head(self, bbox_roi_extractor=None, mask_head=None):
    method init_assigner_sampler (line 92) | def init_assigner_sampler(self):
    method init_bbox_head (line 104) | def init_bbox_head(self, mask_roi_extractor, mask_head):
    method _mask_forward (line 113) | def _mask_forward(self, stage, x, object_feats, mask_preds):
    method _query_fusion (line 139) | def _query_fusion(self, obj_feats, num_imgs, num_frames):
    method _mask_init (line 164) | def _mask_init(self, object_feats, x_feats, num_imgs):
    method forward_train (line 181) | def forward_train(self,
    method simple_test (line 313) | def simple_test(self,
    method init_weights (line 377) | def init_weights(self):

FILE: knet_vis/tracker/kernel_head.py
  class ConvKernelHeadVideo (line 12) | class ConvKernelHeadVideo(nn.Module):
    method __init__ (line 13) | def __init__(self,
    method _init_layers (line 121) | def _init_layers(self):
    method init_weights (line 172) | def init_weights(self):
    method _decode_init_proposals (line 199) | def _decode_init_proposals(self, img, img_metas, ref_img_metas):
    method forward_train (line 267) | def forward_train(self,
    method loss (line 336) | def loss(self,
    method _get_target_single (line 427) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
    method get_targets (line 466) | def get_targets(self,
    method simple_test_rpn (line 503) | def simple_test_rpn(self, img, img_metas, ref_img_metas):
    method forward_dummy (line 507) | def forward_dummy(self, img, img_metas, ref_img_metas):

FILE: knet_vis/tracker/kernel_iter_head.py
  class KernelIterHeadVideo (line 14) | class KernelIterHeadVideo(BaseRoIHead):
    method __init__ (line 15) | def __init__(self,
    method init_bbox_head (line 75) | def init_bbox_head(self, mask_roi_extractor, mask_head):
    method init_assigner_sampler (line 84) | def init_assigner_sampler(self):
    method init_weights (line 96) | def init_weights(self):
    method init_mask_head (line 100) | def init_mask_head(self, mask_roi_extractor, mask_head):
    method _mask_forward (line 117) | def _mask_forward(self, stage, x, object_feats, mask_preds, img_metas=...
    method forward_train (line 138) | def forward_train(self,
    method simple_test (line 243) | def simple_test(self,
    method aug_test (line 315) | def aug_test(self, features, proposal_list, img_metas, rescale=False):
    method forward_dummy (line 318) | def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
    method get_panoptic (line 333) | def get_panoptic(self, cls_scores, mask_preds, test_cfg, img_meta):
    method merge_stuff_thing (line 352) | def merge_stuff_thing(self,

FILE: knet_vis/tracker/kernel_update_head.py
  class KernelUpdateHeadVideo (line 20) | class KernelUpdateHeadVideo(nn.Module):
    method __init__ (line 22) | def __init__(self,
    method init_weights (line 190) | def init_weights(self):
    method forward (line 209) | def forward(self,
    method loss (line 377) | def loss(self,
    method _get_target_single (line 449) | def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
    method get_targets (line 504) | def get_targets(self,
    method rescale_masks (line 539) | def rescale_masks(self, masks_per_img, img_meta):
    method get_seg_masks (line 556) | def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
    method segm2result (line 565) | def segm2result(self, mask_preds, det_labels, cls_scores):
    method get_seg_masks_tracking (line 581) | def get_seg_masks_tracking(self, masks_per_img, labels_per_img, scores...

FILE: knet_vis/tracker/mask_hungarian_assigner.py
  class MaskHungarianAssignerVideo (line 17) | class MaskHungarianAssignerVideo(BaseAssigner):
    method __init__ (line 45) | def __init__(self,
    method assign (line 60) | def assign(self,

FILE: knet_vis/tracker/positional_encoding.py
  class PositionEmbeddingSine3D (line 15) | class PositionEmbeddingSine3D(BaseModule):
    method __init__ (line 21) | def __init__(self, num_feats=64, temperature=10000, normalize=False, s...
    method forward (line 32) | def forward(self, x, mask=None):

FILE: knet_vis/tracker/semantic_fpn_wrapper3D.py
  class SemanticFPNWrapper3D (line 10) | class SemanticFPNWrapper3D(nn.Module):
    method __init__ (line 25) | def __init__(self,
    method init_weights (line 172) | def init_weights(self):
    method generate_coord (line 179) | def generate_coord(self, input_feat):
    method forward (line 190) | def forward(self, inputs, num_imgs, num_frames):

FILE: knet_vis/tracker/track.py
  class KNetTrack (line 16) | class KNetTrack(TwoStageDetector):
    method __init__ (line 18) | def __init__(self,
    method gt_transform (line 52) | def gt_transform(self, img_metas, gt_masks, gt_labels, gt_semantic_seg):
    method ref_gt_transform (line 106) | def ref_gt_transform(self, ref_img_metas, ref_gt_masks, ref_gt_labels,...
    method forward_train (line 142) | def forward_train(self,
    method forward_test (line 243) | def forward_test(self, imgs, img_metas, **kwargs):
    method simple_test (line 289) | def simple_test(self, imgs, img_metas, **kwargs):
    method forward_dummy (line 350) | def forward_dummy(self, img):
    method init_weights (line 369) | def init_weights(self):

FILE: mmtrack/datasets/coco_video_dataset.py
  class CocoVideoDataset (line 14) | class CocoVideoDataset(CocoDataset):
    method __init__ (line 28) | def __init__(self,
    method load_annotations (line 51) | def load_annotations(self, ann_file):
    method load_video_anns (line 66) | def load_video_anns(self, ann_file):
    method key_img_sampling (line 101) | def key_img_sampling(self, img_ids, interval=1):
    method ref_img_sampling (line 105) | def ref_img_sampling(self,
    method get_ann_info (line 239) | def get_ann_info(self, img_info):
    method prepare_results (line 253) | def prepare_results(self, img_info):
    method prepare_data (line 266) | def prepare_data(self, idx):
    method prepare_train_img (line 292) | def prepare_train_img(self, idx):
    method prepare_test_img (line 304) | def prepare_test_img(self, idx):
    method _parse_ann_info (line 316) | def _parse_ann_info(self, img_info, ann_info):
    method evaluate (line 385) | def evaluate(self,
    method __repr__ (line 475) | def __repr__(self):

FILE: mmtrack/datasets/parsers/coco_video_parser.py
  class CocoVID (line 9) | class CocoVID(COCO):
    method __init__ (line 18) | def __init__(self, annotation_file=None, load_img_as_vid=False):
    method convert_img_to_vid (line 23) | def convert_img_to_vid(self, dataset):
    method createIndex (line 39) | def createIndex(self, use_ext=False):
    method get_vid_ids (line 91) | def get_vid_ids(self, vidIds=[]):
    method get_img_ids_from_vid (line 108) | def get_img_ids_from_vid(self, vidId):
    method get_ins_ids_from_vid (line 121) | def get_ins_ids_from_vid(self, vidId):
    method get_img_ids_from_ins_id (line 130) | def get_img_ids_from_ins_id(self, insId):
    method load_vids (line 139) | def load_vids(self, ids=[]):

FILE: mmtrack/datasets/youtube_vis_dataset.py
  function results2outs (line 15) | def results2outs(bbox_results=None,
  class YouTubeVISDataset (line 70) | class YouTubeVISDataset(CocoVideoDataset):
    method __init__ (line 92) | def __init__(self, dataset_version, *args, **kwargs):
    method set_dataset_classes (line 97) | def set_dataset_classes(cls, dataset_version):
    method format_results (line 106) | def format_results(self,

FILE: mmtrack/pipelines/formatting.py
  class ConcatVideoReferences (line 9) | class ConcatVideoReferences(object):
    method __call__ (line 27) | def __call__(self, results):
  class ConcatVideos (line 75) | class ConcatVideos(object):
    method __call__ (line 93) | def __call__(self, results):
  class MultiImagesToTensor (line 149) | class MultiImagesToTensor(object):
    method __init__ (line 161) | def __init__(self, ref_prefix='ref'):
    method __call__ (line 164) | def __call__(self, results):
    method images_to_tensor (line 192) | def images_to_tensor(self, results):
  class SeqDefaultFormatBundle (line 211) | class SeqDefaultFormatBundle(object):
    method __init__ (line 236) | def __init__(self, ref_prefix='ref'):
    method __call__ (line 239) | def __call__(self, results):
    method default_format_bundle (line 273) | def default_format_bundle(self, results):
    method __repr__ (line 311) | def __repr__(self):
  class VideoCollect (line 316) | class VideoCollect(object):
    method __init__ (line 330) | def __init__(self,
    method __call__ (line 354) | def __call__(self, results):
    method _collect_meta_keys (line 391) | def _collect_meta_keys(self, results):
    method _add_default_meta_keys (line 405) | def _add_default_meta_keys(self, results):
  class ToList (line 432) | class ToList(object):
    method __call__ (line 442) | def __call__(self, results):
  class ReIDFormatBundle (line 450) | class ReIDFormatBundle(object):
    method __init__ (line 461) | def __init__(self, *args, **kwargs):
    method __call__ (line 464) | def __call__(self, results):
    method reid_format_bundle (line 492) | def reid_format_bundle(self, results):
  class ImageToTensorWithRef (line 519) | class ImageToTensorWithRef(object):
    method __init__ (line 521) | def __init__(self, keys):
    method __call__ (line 524) | def __call__(self, results):
    method __repr__ (line 542) | def __repr__(self):
  class LabelConsistentChecker (line 546) | class LabelConsistentChecker:
    method __init__ (line 549) | def __init__(self, num_frames=5):
    method __call__ (line 552) | def __call__(self, results):
  class MM2CLIP (line 572) | class MM2CLIP:
    method __init__ (line 575) | def __init__(self, num_frames=5):
    method __call__ (line 578) | def __call__(self, results):

FILE: mmtrack/pipelines/loading.py
  class LoadMultiImagesFromFile (line 12) | class LoadMultiImagesFromFile(LoadImageFromFile):
    method __init__ (line 18) | def __init__(self, *args, **kwargs):
    method __call__ (line 21) | def __call__(self, results):
  class SeqLoadAnnotations (line 39) | class SeqLoadAnnotations(LoadAnnotations):
    method __init__ (line 47) | def __init__(self, with_track=False, *args, **kwargs):
    method _load_track (line 51) | def _load_track(self, results):
    method __call__ (line 63) | def __call__(self, results):
  class LoadRefImageFromFile (line 85) | class LoadRefImageFromFile(object):
    method __init__ (line 91) | def __init__(self, sample=True, to_float32=False):
    method __call__ (line 95) | def __call__(self, results):
    method __repr__ (line 123) | def __repr__(self):
  function bitmasks2bboxes (line 128) | def bitmasks2bboxes(bitmasks):
  class LoadAnnotationsInstanceMasks (line 142) | class LoadAnnotationsInstanceMasks:
    method __init__ (line 143) | def __init__(self,
    method _load_masks (line 156) | def _load_masks(self, results):
    method _load_semantic_seg (line 193) | def _load_semantic_seg(self, results):
    method __call__ (line 206) | def __call__(self, results):
    method __repr__ (line 224) | def __repr__(self):

FILE: mmtrack/pipelines/test_time_aug.py
  class MultiScaleFlipAugVideo (line 11) | class MultiScaleFlipAugVideo:
    method __init__ (line 47) | def __init__(self,
    method __call__ (line 78) | def __call__(self, results):
    method __repr__ (line 110) | def __repr__(self):

FILE: mmtrack/pipelines/transforms.py
  class SeqColorAug (line 10) | class SeqColorAug(object):
    method __init__ (line 21) | def __init__(self,
    method __call__ (line 29) | def __call__(self, results):
  class SeqBlurAug (line 56) | class SeqBlurAug(object):
    method __init__ (line 63) | def __init__(self, prob=[0.0, 0.2]):
    method __call__ (line 66) | def __call__(self, results):
  class SeqResize (line 96) | class SeqResize(Resize):
    method __init__ (line 105) | def __init__(self, share_params=True, *args, **kwargs):
    method __call__ (line 109) | def __call__(self, results):
  class SeqNormalize (line 133) | class SeqNormalize(Normalize):
    method __init__ (line 139) | def __init__(self, *args, **kwargs):
    method __call__ (line 142) | def __call__(self, results):
  class SeqRandomFlip (line 161) | class SeqRandomFlip(RandomFlip):
    method __init__ (line 170) | def __init__(self, share_params, *args, **kwargs):
    method __call__ (line 174) | def __call__(self, results):
  class SeqPad (line 218) | class SeqPad(Pad):
    method __init__ (line 224) | def __init__(self, *args, **kwargs):
    method __call__ (line 227) | def __call__(self, results):
  class SeqRandomCrop (line 246) | class SeqRandomCrop(object):
    method __init__ (line 270) | def __init__(self,
    method get_offsets (line 293) | def get_offsets(self, img):
    method random_crop (line 301) | def random_crop(self, results, offsets=None):
    method __call__ (line 364) | def __call__(self, results):
    method check_match (line 391) | def check_match(self, ref_results, results):
  class SeqPhotoMetricDistortion (line 400) | class SeqPhotoMetricDistortion(object):
    method __init__ (line 419) | def __init__(self,
    method get_params (line 431) | def get_params(self):
    method photo_metric_distortion (line 467) | def photo_metric_distortion(self, results, params=None):
    method __call__ (line 524) | def __call__(self, results):
    method __repr__ (line 543) | def __repr__(self):
  class ResizeWithRef (line 555) | class ResizeWithRef(object):
    method __init__ (line 580) | def __init__(self,
    method random_select (line 606) | def random_select(img_scales):
    method random_sample (line 613) | def random_sample(img_scales):
    method random_sample_ratio (line 627) | def random_sample_ratio(img_scale, ratio_range):
    method _random_scale (line 635) | def _random_scale(self, results):
    method _resize_img (line 651) | def _resize_img(self, results):
    method _resize_bboxes (line 668) | def _resize_bboxes(self, results):
    method _resize_masks (line 680) | def _resize_masks(self, results):
    method __call__ (line 703) | def __call__(self, results):
    method __repr__ (line 712) | def __repr__(self):
  class RandomFlipWithRef (line 723) | class RandomFlipWithRef(object):
    method __init__ (line 734) | def __init__(self, flip_ratio=None):
    method bbox_flip (line 739) | def bbox_flip(self, bboxes, img_shape):
    method __call__ (line 753) | def __call__(self, results):
    method __repr__ (line 776) | def __repr__(self):
  class PadWithRef (line 782) | class PadWithRef(object):
    method __init__ (line 794) | def __init__(self, size=None, size_divisor=None, pad_val=0):
    method _pad_img (line 802) | def _pad_img(self, results):
    method _pad_masks (line 815) | def _pad_masks(self, results):
    method __call__ (line 826) | def __call__(self, results):
    method __repr__ (line 831) | def __repr__(self):
  class NormalizeWithRef (line 839) | class NormalizeWithRef(object):
    method __init__ (line 849) | def __init__(self, mean, std, to_rgb=True):
    method __call__ (line 854) | def __call__(self, results):
    method __repr__ (line 864) | def __repr__(self):
  class RandomCropWithRef (line 872) | class RandomCropWithRef(object):
    method __init__ (line 879) | def __init__(self, crop_size):
    method __call__ (line 882) | def __call__(self, results):
    method __repr__ (line 948) | def __repr__(self):
  class PadFutureMMDet (line 954) | class PadFutureMMDet:
    method __init__ (line 968) | def __init__(self,
    method _pad_img (line 994) | def _pad_img(self, results):
    method _pad_masks (line 1012) | def _pad_masks(self, results):
    method _pad_seg (line 1019) | def _pad_seg(self, results):
    method __call__ (line 1027) | def __call__(self, results):
    method __repr__ (line 1039) | def __repr__(self):
  class KNetInsAdapter (line 1049) | class KNetInsAdapter:
    method __init__ (line 1054) | def __init__(self, stuff_nums=11):
    method __call__ (line 1057) | def __call__(self, results):
  class KNetInsAdapterCherryPick (line 1069) | class KNetInsAdapterCherryPick:
    method __init__ (line 1074) | def __init__(self, stuff_nums=11, cherry=(11, 13)):
    method __call__ (line 1078) | def __call__(self, results):

FILE: mmtrack/transform.py
  function outs2results (line 6) | def outs2results(bboxes=None,

FILE: scripts/kitti_step_prepare.py
  function build_panoptic (line 14) | def build_panoptic(seq_id, input_dir, output_dir):
  function build_img (line 29) | def build_img(seq_id, input_dir, output_dir):

FILE: scripts/visualizer.py
  function sha256num (line 30) | def sha256num(num):
  function id2rgb (line 36) | def id2rgb(id_map):
  function cityscapes_cat2rgb (line 52) | def cityscapes_cat2rgb(cat_map):
  function trackmap2rgb (line 62) | def trackmap2rgb(track_map):
  function draw_bbox_on_img (line 72) | def draw_bbox_on_img(vis_img, bboxes):

FILE: swin/DetectRS.py
  class Bottleneck (line 16) | class Bottleneck(_Bottleneck):
    method __init__ (line 34) | def __init__(self,
    method rfp_forward (line 72) | def rfp_forward(self, x, rfp_feat):
  class ResLayer (line 119) | class ResLayer(Sequential):
    method __init__ (line 143) | def __init__(self,
  class DetectoRS_ResNet_Custom (line 209) | class DetectoRS_ResNet_Custom(ResNet):
    method __init__ (line 230) | def __init__(self,
    method init_weights (line 283) | def init_weights(self):
    method make_res_layer (line 310) | def make_res_layer(self, **kwargs):
    method forward (line 314) | def forward(self, x):
    method rfp_forward (line 321) | def rfp_forward(self, x, rfp_feats):

FILE: swin/ckpt_convert.py
  function pvt_convert (line 12) | def pvt_convert(ckpt):
  function swin_converter (line 85) | def swin_converter(ckpt):

FILE: swin/mix_transformer.py
  class Mlp (line 21) | class Mlp(nn.Module):
    method __init__ (line 22) | def __init__(self, in_features, hidden_features=None, out_features=Non...
    method _init_weights (line 34) | def _init_weights(self, m):
    method forward (line 49) | def forward(self, x, H, W):
  class Attention (line 59) | class Attention(nn.Module):
    method __init__ (line 60) | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, at...
    method _init_weights (line 82) | def _init_weights(self, m):
    method forward (line 97) | def forward(self, x, H, W):
  class Block (line 121) | class Block(nn.Module):
    method __init__ (line 123) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
    method _init_weights (line 139) | def _init_weights(self, m):
    method forward (line 154) | def forward(self, x, H, W):
  class OverlapPatchEmbed (line 161) | class OverlapPatchEmbed(nn.Module):
    method __init__ (line 165) | def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, e...
    method _init_weights (line 180) | def _init_weights(self, m):
    method forward (line 195) | def forward(self, x):
  class MixVisionTransformer (line 204) | class MixVisionTransformer(BaseModule):
    method __init__ (line 205) | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classe...
    method _init_weights (line 262) | def _init_weights(self, m):
    method reset_drop_path (line 282) | def reset_drop_path(self, drop_path_rate):
    method freeze_patch_emb (line 300) | def freeze_patch_emb(self):
    method no_weight_decay (line 304) | def no_weight_decay(self):
    method get_classifier (line 307) | def get_classifier(self):
    method reset_classifier (line 310) | def reset_classifier(self, num_classes, global_pool=''):
    method forward_features (line 314) | def forward_features(self, x):
    method forward (line 352) | def forward(self, x):
  class DWConv (line 359) | class DWConv(nn.Module):
    method __init__ (line 360) | def __init__(self, dim=768):
    method forward (line 364) | def forward(self, x, H, W):
  class mit_b0 (line 374) | class mit_b0(MixVisionTransformer):
    method __init__ (line 375) | def __init__(self, **kwargs):
  class mit_b1 (line 383) | class mit_b1(MixVisionTransformer):
    method __init__ (line 384) | def __init__(self, **kwargs):
  class mit_b2 (line 392) | class mit_b2(MixVisionTransformer):
    method __init__ (line 393) | def __init__(self, **kwargs):
  class mit_b3 (line 401) | class mit_b3(MixVisionTransformer):
    method __init__ (line 402) | def __init__(self, **kwargs):
  class mit_b4 (line 410) | class mit_b4(MixVisionTransformer):
    method __init__ (line 411) | def __init__(self, **kwargs):
  class mit_b5 (line 419) | class mit_b5(MixVisionTransformer):
    method __init__ (line 420) | def __init__(self, **kwargs):
  class ResNetV1c (line 428) | class ResNetV1c(ResNet):
    method __init__ (line 437) | def __init__(self, **kwargs):

FILE: swin/swin_checkpoint.py
  function _get_mmcv_home (line 29) | def _get_mmcv_home():
  function load_state_dict (line 40) | def load_state_dict(module, state_dict, strict=False, logger=None):
  function load_url_dist (line 107) | def load_url_dist(url, model_dir=None):
  function load_pavimodel_dist (line 121) | def load_pavimodel_dist(model_path, map_location=None):
  function load_fileclient_dist (line 149) | def load_fileclient_dist(filename, backend, map_location):
  function get_torchvision_models (line 170) | def get_torchvision_models():
  function get_external_models (line 182) | def get_external_models():
  function get_mmcls_models (line 196) | def get_mmcls_models():
  function get_deprecated_model_names (line 203) | def get_deprecated_model_names():
  function _process_mmcls_checkpoint (line 212) | def _process_mmcls_checkpoint(checkpoint):
  function _load_checkpoint (line 223) | def _load_checkpoint(filename, map_location=None):
  function load_checkpoint (line 283) | def load_checkpoint(model,
  function weights_to_cpu (line 356) | def weights_to_cpu(state_dict):
  function _save_to_state_dict (line 370) | def _save_to_state_dict(module, destination, prefix, keep_vars):
  function get_state_dict (line 389) | def get_state_dict(module, destination=None, prefix='', keep_vars=False):
  function save_checkpoint (line 430) | def save_checkpoint(model, filename, optimizer=None, meta=None):

FILE: swin/swin_transformer.py
  class Mlp (line 20) | class Mlp(nn.Module):
    method __init__ (line 23) | def __init__(self,
    method forward (line 37) | def forward(self, x):
  function window_partition (line 46) | def window_partition(x, window_size):
  function window_reverse (line 62) | def window_reverse(windows, window_size, H, W):
  class WindowAttention (line 79) | class WindowAttention(nn.Module):
    method __init__ (line 94) | def __init__(self,
    method forward (line 141) | def forward(self, x, mask=None):
  class SwinTransformerBlock (line 183) | class SwinTransformerBlock(nn.Module):
    method __init__ (line 201) | def __init__(self,
    method forward (line 245) | def forward(self, x, mask_matrix):
  class PatchMerging (line 314) | class PatchMerging(nn.Module):
    method __init__ (line 321) | def __init__(self, dim, norm_layer=nn.LayerNorm):
    method forward (line 327) | def forward(self, x, H, W):
  class BasicLayer (line 357) | class BasicLayer(nn.Module):
    method __init__ (line 376) | def __init__(self,
    method forward (line 419) | def forward(self, x, H, W):
  class PatchEmbed (line 466) | class PatchEmbed(nn.Module):
    method __init__ (line 475) | def __init__(self,
    method forward (line 494) | def forward(self, x):
  class SwinTransformerDIY (line 516) | class SwinTransformerDIY(nn.Module):
    method __init__ (line 544) | def __init__(self,
    method _freeze_stages (line 637) | def _freeze_stages(self):
    method init_weights (line 654) | def init_weights(self, pretrained=None):
    method forward (line 682) | def forward(self, x):
    method train (line 716) | def train(self, mode=True):

FILE: swin/swin_transformer_rfp.py
  class WindowMSA (line 20) | class WindowMSA(BaseModule):
    method __init__ (line 38) | def __init__(self,
    method init_weights (line 74) | def init_weights(self):
    method forward (line 77) | def forward(self, x, mask=None):
    method double_step_seq (line 117) | def double_step_seq(step1, len1, step2, len2):
  class ShiftWindowMSA (line 123) | class ShiftWindowMSA(BaseModule):
    method __init__ (line 145) | def __init__(self,
    method forward (line 174) | def forward(self, query, hw_shape):
    method window_reverse (line 250) | def window_reverse(self, windows, H, W):
    method window_partition (line 266) | def window_partition(self, x):
  class SwinBlock (line 282) | class SwinBlock(BaseModule):
    method __init__ (line 307) | def __init__(self,
    method forward (line 352) | def forward(self, x, hw_shape):
  class SwinBlockSequence (line 375) | class SwinBlockSequence(BaseModule):
    method __init__ (line 403) | def __init__(self,
    method forward (line 448) | def forward(self, x, hw_shape):
  class SwinTransformer (line 459) | class SwinTransformer(BaseModule):
    method __init__ (line 514) | def __init__(self,
    method train (line 633) | def train(self, mode=True):
    method _freeze_stages (line 638) | def _freeze_stages(self):
    method init_weights (line 660) | def init_weights(self):
    method forward (line 739) | def forward(self, x):
  class SwinRFPLayer (line 760) | class SwinRFPLayer(BaseModule):
    method __init__ (line 788) | def __init__(self,
    method forward (line 846) | def forward(self, x, hw_shape):
    method rfp_forward (line 856) | def rfp_forward(self, x, hw_shape, rfp_feat):
  class SwinTransformerRFP (line 874) | class SwinTransformerRFP(SwinTransformer):
    method __init__ (line 875) | def __init__(
    method forward (line 972) | def forward(self, x):
    method rfp_forward (line 979) | def rfp_forward(self, x, rfp_feats):

FILE: swin/transformer.py
  function nlc_to_nchw (line 32) | def nlc_to_nchw(x, hw_shape):
  function nchw_to_nlc (line 47) | def nchw_to_nlc(x):
  class AdaptivePadding (line 58) | class AdaptivePadding(nn.Module):
    method __init__ (line 88) | def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corne...
    method get_pad_shape (line 104) | def get_pad_shape(self, input_shape):
    method forward (line 116) | def forward(self, x):
  class PatchEmbed (line 129) | class PatchEmbed(BaseModule):
    method __init__ (line 155) | def __init__(
    method forward (line 229) | def forward(self, x):
  class PatchMerging (line 251) | class PatchMerging(BaseModule):
    method __init__ (line 280) | def __init__(self,
    method forward (line 329) | def forward(self, x, input_size):
  function inverse_sigmoid (line 375) | def inverse_sigmoid(x, eps=1e-5):

FILE: tools/dataset/cityscapes_instance_idmap.py
  function convert_json_to_label (line 9) | def convert_json_to_label(json_file):
  function parse_args (line 14) | def parse_args():
  function main (line 26) | def main():

FILE: tools/dataset/youtubevis2coco.py
  function parse_args (line 11) | def parse_args():
  function convert_vis (line 32) | def convert_vis(ann_dir, save_dir, dataset_version, mode='train'):
  function main (line 143) | def main():

FILE: tools/eval_dstq.py
  function parse_args (line 15) | def parse_args():
  function updater (line 29) | def updater(pred_ins_name,
  function eval_dstq (line 63) | def eval_dstq(result_dir, gt_dir, seq_ids, with_depth=True):

FILE: tools/eval_dstq_step.py
  function parse_args (line 15) | def parse_args():
  function updater (line 29) | def updater(pred_ins_name,
  function eval_dstq (line 63) | def eval_dstq(result_dir, gt_dir, seq_ids, with_depth=True):

FILE: tools/eval_dstq_vipseg.py
  function vip2hb (line 280) | def vip2hb(pan_map):
  function parse_args (line 301) | def parse_args():
  function updater (line 315) | def updater(pred_ins_name,
  function eval_dstq (line 351) | def eval_dstq(result_dir, gt_dir, with_depth=True):

FILE: tools/eval_dvpq_step.py
  function vpq_eval (line 21) | def vpq_eval(element):
  function eval (line 100) | def eval(element):
  function main (line 144) | def main():

FILE: tools/eval_dvpq_vipseg.py
  function vip2hb (line 275) | def vip2hb(pan_map):
  function parse_args (line 296) | def parse_args():
  function vpq_eval (line 310) | def vpq_eval(element):
  function read_to_eval (line 389) | def read_to_eval(element):
  function eval_dvpq (line 412) | def eval_dvpq(result_dir, gt_dir, split='val', k=1, with_depth=True):

FILE: tools/flops_counter.py
  function get_model_complexity_info (line 19) | def get_model_complexity_info(model, input_res,
  function flops_to_string (line 59) | def flops_to_string(flops, units='GMac', precision=2):
  function params_to_string (line 80) | def params_to_string(params_num, units=None, precision=2):
  function accumulate_flops (line 97) | def accumulate_flops(self):
  function print_model_with_flops (line 107) | def print_model_with_flops(model, total_flops, total_params, units='GMac',
  function get_model_parameters_number (line 153) | def get_model_parameters_number(model):
  function add_flops_counting_methods (line 158) | def add_flops_counting_methods(net_main_module):
  function compute_average_flops_cost (line 172) | def compute_average_flops_cost(self):
  function start_flops_count (line 192) | def start_flops_count(self, **kwargs):
  function stop_flops_count (line 228) | def stop_flops_count(self):
  function reset_flops_count (line 239) | def reset_flops_count(self):
  function empty_flops_counter_hook (line 250) | def empty_flops_counter_hook(module, input, output):
  function upsample_flops_counter_hook (line 254) | def upsample_flops_counter_hook(module, input, output):
  function relu_flops_counter_hook (line 263) | def relu_flops_counter_hook(module, input, output):
  function linear_flops_counter_hook (line 268) | def linear_flops_counter_hook(module, input, output):
  function pool_flops_counter_hook (line 276) | def pool_flops_counter_hook(module, input, output):
  function bn_flops_counter_hook (line 281) | def bn_flops_counter_hook(module, input, output):
  function conv_flops_counter_hook (line 290) | def conv_flops_counter_hook(conv_module, input, output):
  function batch_counter_hook (line 321) | def batch_counter_hook(module, input, output):
  function rnn_flops (line 334) | def rnn_flops(flops, rnn_module, w_ih, w_hh, input_size):
  function rnn_flops_counter_hook (line 359) | def rnn_flops_counter_hook(rnn_module, input, output):
  function rnn_cell_flops_counter_hook (line 392) | def rnn_cell_flops_counter_hook(rnn_cell_module, input, output):
  function ffn_hook (line 408) | def ffn_hook(module, input, output):
  function multihead_attention_counter_hook (line 422) | def multihead_attention_counter_hook(multihead_attention_module, input, ...
  function add_batch_counter_variables_or_reset (line 467) | def add_batch_counter_variables_or_reset(module):
  function add_batch_counter_hook_function (line 472) | def add_batch_counter_hook_function(module):
  function remove_batch_counter_hook_function (line 480) | def remove_batch_counter_hook_function(module):
  function add_flops_counter_variable_or_reset (line 486) | def add_flops_counter_variable_or_reset(module):
  function norm_flops_counter_hook (line 498) | def norm_flops_counter_hook(module, input, output):
  function is_supported_instance (line 573) | def is_supported_instance(module):
  function remove_flops_counter_hook_function (line 579) | def remove_flops_counter_hook_function(module):

FILE: tools/get_flops.py
  function parse_args (line 17) | def parse_args():
  function main (line 46) | def main():

FILE: tools/test.py
  function parse_args (line 19) | def parse_args():
  function main (line 101) | def main():

FILE: tools/test_dvps.py
  function single_gpu_test (line 22) | def single_gpu_test(model,
  function parse_args (line 79) | def parse_args():
  function main (line 181) | def main():

FILE: tools/test_step.py
  function single_gpu_test (line 24) | def single_gpu_test(model,
  function parse_args (line 78) | def parse_args():
  function main (line 170) | def main():

FILE: tools/test_vps.py
  function single_gpu_test (line 22) | def single_gpu_test(model,
  function parse_args (line 69) | def parse_args():
  function main (line 171) | def main():

FILE: tools/train.py
  function parse_args (line 23) | def parse_args():
  function main (line 94) | def main():

FILE: tools/utils/DSTQ.py
  class DSTQuality (line 9) | class DSTQuality(STQuality):
    method __init__ (line 10) | def __init__(
    method update_state (line 38) | def update_state(
    method result (line 78) | def result(self):
    method reset_states (line 145) | def reset_states(self):

FILE: tools/utils/STQ.py
  function _update_dict_stats (line 31) | def _update_dict_stats(stat_dict: MutableMapping[int, np.ndarray],
  class STQuality (line 42) | class STQuality(object):
    method __init__ (line 62) | def __init__(self, num_classes: int, things_list: Sequence[int],
    method get_semantic (line 102) | def get_semantic(self, y: np.ndarray) -> np.ndarray:
    method update_state (line 106) | def update_state(self, y_true: np.ndarray, y_pred: np.ndarray, sequenc...
    method result (line 190) | def result(self) -> Mapping[Text, Any]:
    method reset_states (line 285) | def reset_states(self):

FILE: tools/utils/cityscapesvps_eval.py
  class CityscapesVps (line 14) | class CityscapesVps(Dataset):
    method __init__ (line 16) | def __init__(self):
    method _save_image_single_core (line 24) | def _save_image_single_core(self, proc_id, images_set, names_set, colo...
    method inference_panoptic_video (line 40) | def inference_panoptic_video(self, pred_pans_2ch, output_dir,
    method converter_2ch_track_core (line 111) | def converter_2ch_track_core(self, proc_id, pan_2ch_set, color_generat...

FILE: tools/visualization.py
  function single_gpu_test (line 22) | def single_gpu_test(model,
  function parse_args (line 44) | def parse_args():
  function main (line 126) | def main():

FILE: tools_vis/apis/test.py
  function single_gpu_test (line 18) | def single_gpu_test(model,
  function multi_gpu_test (line 48) | def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
  function collect_results_cpu (line 99) | def collect_results_cpu(result_part, size, tmpdir=None):
  function collect_results_gpu (line 142) | def collect_results_gpu(result_part, size):

FILE: tools_vis/test.py
  function parse_args (line 23) | def parse_args():
  function main (line 114) | def main():

FILE: tools_vis/test_whole_video.py
  function parse_args (line 23) | def parse_args():
  function main (line 114) | def main():

FILE: unitrack/basetrack.py
  class TrackState (line 10) | class TrackState(object):
  class BaseTrack (line 17) | class BaseTrack(object):
    method end_frame (line 36) | def end_frame(self):
    method next_id (line 40) | def next_id():
    method activate (line 44) | def activate(self, *args):
    method predict (line 47) | def predict(self):
    method update (line 50) | def update(self, *args, **kwargs):
    method mark_lost (line 53) | def mark_lost(self):
    method mark_removed (line 56) | def mark_removed(self):
  class STrack (line 60) | class STrack(BaseTrack):
    method __init__ (line 63) | def __init__(self, tlwh, score, temp_feat, buffer_size=30,
    method update_features (line 85) | def update_features(self, feat):
    method predict (line 95) | def predict(self):
    method multi_predict (line 102) | def multi_predict(stracks):
    method activate (line 115) | def activate(self, kalman_filter, frame_id):
    method re_activate (line 129) | def re_activate(self, new_track, frame_id, new_id=False, update_featur...
    method update (line 148) | def update(self, new_track, frame_id, update_feature=True):
    method tlwh (line 182) | def tlwh(self):
    method tlbr (line 194) | def tlbr(self):
    method to_xyah (line 203) | def to_xyah(self):
    method __repr__ (line 207) | def __repr__(self):
  function joint_stracks (line 211) | def joint_stracks(tlista, tlistb):
  function sub_stracks (line 225) | def sub_stracks(tlista, tlistb):
  function remove_duplicate_stracks (line 236) | def remove_duplicate_stracks(stracksa, stracksb, ioudist=0.15):

FILE: unitrack/box.py
  class BoxAssociationTracker (line 16) | class BoxAssociationTracker(AssociationTracker):
    method __init__ (line 17) | def __init__(self, opt):
    method extract_emb (line 20) | def extract_emb(self, img, obs):
    method prepare_obs (line 29) | def prepare_obs(self, img, img0, obs, embs=None):

FILE: unitrack/core/association/matching.py
  function merge_matches (line 12) | def merge_matches(m1, m2, shape):
  function linear_assignment (line 29) | def linear_assignment(cost_matrix, thresh):
  function ious (line 43) | def ious(atlbrs, btlbrs):
  function iou_distance (line 63) | def iou_distance(atracks, btracks):
  function embedding_distance (line 83) | def embedding_distance(tracks, detections, metric='cosine'):
  function fuse_motion (line 100) | def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False...
  function center_emb_distance (line 115) | def center_emb_distance(tracks, detections, metric='cosine'):
  function recons_distance (line 134) | def recons_distance(tracks, detections, tmp=100):
  function get_track_feat (line 174) | def get_track_feat(tracks, feat_flag='curr'):
  function reconsdot_distance (line 194) | def reconsdot_distance(tracks, detections, tmp=100):
  function category_gate (line 241) | def category_gate(cost_matrix, tracks, detections):

FILE: unitrack/core/motion/kalman_filter.py
  class KalmanFilter (line 23) | class KalmanFilter(object):
    method __init__ (line 40) | def __init__(self):
    method initiate (line 55) | def initiate(self, measurement):
    method predict (line 88) | def predict(self, mean, covariance):
    method project (line 125) | def project(self, mean, covariance):
    method multi_predict (line 154) | def multi_predict(self, mean, covariance):
    method update (line 196) | def update(self, mean, covariance, measurement):
    method gating_distance (line 230) | def gating_distance(self, mean, covariance, measurements,

FILE: unitrack/core/propagation/__init__.py
  function propagate (line 16) | def propagate(temp_feats, obs, img, model, format='box'):

FILE: unitrack/core/propagation/propagate_box.py
  function propagate_box (line 12) | def propagate_box(temp_feats, box, img, model):

FILE: unitrack/core/propagation/propagate_mask.py
  function propagate_mask (line 12) | def propagate_mask(temp_feats, mask, img, model):

FILE: unitrack/core/propagation/propagate_pose.py
  function propagate_pose (line 12) | def propagate_pose(temp_feats, pose, img, model):

FILE: unitrack/mask.py
  class MaskAssociationTracker (line 18) | class MaskAssociationTracker(AssociationTracker):
    method __init__ (line 19) | def __init__(self, opt):
    method extract_emb (line 22) | def extract_emb(self, img, obs):
    method prepare_obs (line 48) | def prepare_obs(self, img, img0, obs, embs=None):

FILE: unitrack/mask_with_train_embs.py
  class AssociationTrackerWithTrainedEmbed (line 19) | class AssociationTrackerWithTrainedEmbed(object):
    method __init__ (line 20) | def __init__(self, opt):
    method extract_emb (line 40) | def extract_emb(self, img, obs):
    method prepare_obs (line 43) | def prepare_obs(self, img, img0, obs, embs=None):
    method update (line 46) | def update(self, img, img0, obs, embs=None):
    method reset_all (line 153) | def reset_all(self, ):
  class MaskAssociationTracker (line 160) | class MaskAssociationTracker(AssociationTrackerWithTrainedEmbed):
    method __init__ (line 161) | def __init__(self, opt):
    method extract_emb (line 164) | def extract_emb(self, img, obs, embs):
    method prepare_obs (line 192) | def prepare_obs(self, img, img0, obs, embs=None):

FILE: unitrack/model/functional.py
  function hard_prop (line 17) | def hard_prop(pred):
  function context_index_bank (line 24) | def context_index_bank(n_context, long_mem, N):
  function mem_efficient_batched_affinity (line 42) | def mem_efficient_batched_affinity(
  function batched_affinity (line 75) | def batched_affinity(query, keys, mask, temperature, topk, long_mem, dev...
  function process_pose (line 103) | def process_pose(pred, lbl_set, topk=3):
  class MaskedAttention (line 127) | class MaskedAttention(nn.Module):
    method __init__ (line 132) | def __init__(self, radius, flat=True):
    method mask (line 139) | def mask(self, H, W):
    method index (line 144) | def index(self, H, W):
    method make (line 149) | def make(self, H, W):
    method flatten (line 164) | def flatten(self, D):
    method make_index (line 167) | def make_index(self, H, W, pad=False):
    method forward (line 175) | def forward(self, x):

FILE: unitrack/model/hrnet.py
  function conv3x3 (line 29) | def conv3x3(in_planes, out_planes, stride=1):
  class BasicBlock (line 35) | class BasicBlock(nn.Module):
    method __init__ (line 38) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 48) | def forward(self, x):
  class Bottleneck (line 67) | class Bottleneck(nn.Module):
    method __init__ (line 70) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 85) | def forward(self, x):
  class HighResolutionModule (line 108) | class HighResolutionModule(nn.Module):
    method __init__ (line 109) | def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
    method _check_branches (line 126) | def _check_branches(self, num_branches, blocks, num_blocks,
    method _make_one_branch (line 146) | def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
    method _make_branches (line 170) | def _make_branches(self, num_branches, block, num_blocks, num_channels):
    method _make_fuse_layers (line 179) | def _make_fuse_layers(self):
    method get_num_inchannels (line 227) | def get_num_inchannels(self):
    method forward (line 230) | def forward(self, x):
  class HighResolutionNet (line 263) | class HighResolutionNet(nn.Module):
    method __init__ (line 265) | def __init__(self, cfg, **kwargs):
    method _make_head (line 320) | def _make_head(self, pre_stage_channels):
    method _make_transition_layer (line 369) | def _make_transition_layer(
    method _make_layer (line 405) | def _make_layer(self, block, inplanes, planes, blocks, stride=1):
    method _make_stage (line 422) | def _make_stage(self, layer_config, num_inchannels,
    method forward (line 452) | def forward(self, x):
    method init_weights (line 500) | def init_weights(self, pretrained='',):
  function get_cls_net (line 601) | def get_cls_net(c, **kwargs):

FILE: unitrack/model/model.py
  class AppearanceModel (line 11) | class AppearanceModel(nn.Module):
    method __init__ (line 12) | def __init__(self, args):
    method forward (line 17) | def forward(self, x):
  function partial_load (line 21) | def partial_load(pretrained_dict, model, skip_keys=[], log=False):
  function load_vince_model (line 40) | def load_vince_model(path):
  function load_uvc_model (line 46) | def load_uvc_model(ckpt_path):
  function load_tc_model (line 57) | def load_tc_model(ckpt_path):
  class From3D (line 74) | class From3D(nn.Module):
    method __init__ (line 76) | def __init__(self, resnet):
    method forward (line 80) | def forward(self, x):
  function make_encoder (line 88) | def make_encoder(args):

FILE: unitrack/model/random_feat_generator.py
  class RandomFeatGenerator (line 15) | class RandomFeatGenerator(nn.Module):
    method __init__ (line 16) | def __init__(self, args):
    method forward (line 21) | def forward(self, x):
    method __str__ (line 36) | def __str__(self):

FILE: unitrack/model/resnet.py
  class ResNet (line 23) | class ResNet(torch_resnet.ResNet):
    method __init__ (line 24) | def __init__(self, *args, **kwargs):
    method modify (line 27) | def modify(self, remove_layers=[], padding=''):
    method forward (line 50) | def forward(self, x):
  function _resnet (line 64) | def _resnet(arch, block, layers, pretrained, progress, **kwargs):
  function resnet18 (line 72) | def resnet18(pretrained=False, progress=True, **kwargs):
  function resnet50 (line 76) | def resnet50(pretrained=False, progress=True, **kwargs) -> ResNet:
  function resnet101 (line 80) | def resnet101(pretrained=False, progress=True, **kwargs):
  function resnet152 (line 84) | def resnet152(pretrained=False, progress=True, **kwargs):
  function resnext50_32x4d (line 96) | def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
  function resnext101_32x8d (line 110) | def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
  function wide_resnet50_2 (line 124) | def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
  function wide_resnet101_2 (line 142) | def wide_resnet101_2(pretrained=False, progress=True, **kwargs):

FILE: unitrack/multitracker.py
  class AssociationTracker (line 25) | class AssociationTracker(object):
    method __init__ (line 26) | def __init__(self, opt):
    method extract_emb (line 46) | def extract_emb(self, img, obs):
    method prepare_obs (line 49) | def prepare_obs(self, img, img0, obs, embs=None):
    method update (line 52) | def update(self, img, img0, obs, embs=None):
    method reset_all (line 159) | def reset_all(self, ):

FILE: unitrack/utils/__init__.py
  function to_numpy (line 21) | def to_numpy(tensor):
  function to_torch (line 29) | def to_torch(ndarray):
  function im_to_numpy (line 37) | def im_to_numpy(img):
  function im_to_torch (line 42) | def im_to_torch(img):

FILE: unitrack/utils/box.py
  function xyxy2xywh (line 13) | def xyxy2xywh(x):
  function xywh2xyxy (line 23) | def xywh2xyxy(x):
  function tlwh2xyxy (line 33) | def tlwh2xyxy(x):
  function tlwh_to_xywh (line 41) | def tlwh_to_xywh(tlwh):
  function tlwh_to_xyah (line 47) | def tlwh_to_xyah(tlwh):
  function tlbr_to_tlwh (line 57) | def tlbr_to_tlwh(tlbr):
  function tlwh_to_tlbr (line 63) | def tlwh_to_tlbr(tlwh):
  function scale_box (line 69) | def scale_box(scale, coords):
  function scale_box_letterbox_size (line 76) | def scale_box_letterbox_size(img_size, coords, img0_shape):
  function scale_box_input_size (line 88) | def scale_box_input_size(img_size, coords, img0_shape):
  function clip_boxes (line 101) | def clip_boxes(boxes, im_shape):
  function clip_box (line 120) | def clip_box(bbox, im_shape):
  function int_box (line 131) | def int_box(box):
  function remove_duplicated_box (line 137) | def remove_duplicated_box(boxes, iou_th=0.5):
  function skltn2box (line 153) | def skltn2box(skltn):

FILE: unitrack/utils/io.py
  function mkdir_if_missing (line 8) | def mkdir_if_missing(d):
  function write_mots_results (line 12) | def write_mots_results(filename, results, data_type='mot'):
  function write_mot_results (line 35) | def write_mot_results(filename, results, data_type='mot'):
  function read_mot_results (line 63) | def read_mot_results(filename, data_type='mot', is_gt=False, is_ignore=F...
  function _read_mot_results (line 90) | def _read_mot_results(filename, is_gt, is_ignore):
  function unzip_objs (line 132) | def unzip_objs(objs):

FILE: unitrack/utils/log.py
  function get_logger (line 4) | def get_logger(name='root'):

FILE: unitrack/utils/mask.py
  function coords2bbox (line 18) | def coords2bbox(coords, extend=2):
  function coords2bbox_all (line 40) | def coords2bbox_all(coords):
  function coords2bboxTensor (line 48) | def coords2bboxTensor(coords, extend=2):
  function mask2box (line 69) | def mask2box(masks):
  function tensor_mask2box (line 80) | def tensor_mask2box(masks):
  function batch_mask2boxlist (line 92) | def batch_mask2boxlist(masks):
  function bboxlist2roi (line 116) | def bboxlist2roi(bbox_list):
  function bbox2roi (line 137) | def bbox2roi(bbox_list):
  function temp_interp_mask (line 158) | def temp_interp_mask(maskseq, T):
  function mask_seq_jac (line 172) | def mask_seq_jac(sa, sb):
  function skltn2mask (line 182) | def skltn2mask(skltn, size):
  function pts2array (line 226) | def pts2array(pts):

FILE: unitrack/utils/meter.py
  class Timer (line 14) | class Timer(object):
    method __init__ (line 16) | def __init__(self):
    method tic (line 25) | def tic(self):
    method toc (line 30) | def toc(self, average=True):
    method clear (line 41) | def clear(self):

FILE: unitrack/utils/visualize.py
  function dump_predictions (line 14) | def dump_predictions(pred, lbl_set, img, prefix):
  function make_gif (line 57) | def make_gif(video, outname='/tmp/test.gif', sz=256):
  function get_color (line 73) | def get_color(idx):
  function plot_tracking (line 78) | def plot_tracking(image, obs, obj_ids, scores=None, frame_id=0, fps=0.):
  function vis_pose (line 109) | def vis_pose(oriImg, points):
  function draw_skeleton (line 149) | def draw_skeleton(aa, kp, color, show_skeleton_labels=False, dataset= "P...
Condensed preview — 237 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,001K chars).
[
  {
    "path": ".gitignore",
    "chars": 1352,
    "preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
  },
  {
    "path": "DATASET.md",
    "chars": 3988,
    "preview": "Please prepare the data structure as the following instruction:\n\nThe final dataset folder should be like this. \n```\nroot"
  },
  {
    "path": "LICENSE",
    "chars": 1070,
    "preview": "MIT License\n\nCopyright (c) 2022 Xiangtai  Lee\n\nPermission is hereby granted, free of charge, to any person obtaining a c"
  },
  {
    "path": "README.md",
    "chars": 8528,
    "preview": "# Video K-Net: A Simple, Strong, and Unified Baseline for Video Segmentation (CVPR-2022, oral) \n## [Paper](https://arxiv"
  },
  {
    "path": "configs/det/_base_/datasets/cityscapes_panoptic.py",
    "chars": 2547,
    "preview": "# dataset settings\ndataset_type = 'CityscapesPanopticDataset'\ndata_root = 'data/cityscapes/'\n\nimg_norm_cfg = dict(\n    m"
  },
  {
    "path": "configs/det/_base_/datasets/cityscapes_step.py",
    "chars": 2224,
    "preview": "dataset_type = 'CityscapesSTEP'\ndata_root = 'data/cityscapes'\n\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103.53],\n"
  },
  {
    "path": "configs/det/_base_/datasets/cityscapes_vps_clips.py",
    "chars": 2877,
    "preview": "dataset_type = 'CityscapesVPSDataset'\ndata_root = 'data/cityscapes_vps/'\ndataset_type_test = \"CityscapesPanopticDataset\""
  },
  {
    "path": "configs/det/_base_/datasets/cityscapes_vps_clips_trainval.py",
    "chars": 3147,
    "preview": "dataset_type = 'CityscapesVPSDataset'\ndata_root = 'data/cityscapes_vps/'\ndataset_type_test = \"CityscapesPanopticDataset\""
  },
  {
    "path": "configs/det/_base_/datasets/coco_instance.py",
    "chars": 1782,
    "preview": "dataset_type = 'CocoDataset'\ndata_root = 'data/coco/'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103.53], std=[58.3"
  },
  {
    "path": "configs/det/_base_/datasets/coco_panoptic.py",
    "chars": 2282,
    "preview": "dataset_type = 'CocoPanopticDatasetCustom'\ndata_root = 'data/coco/'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103."
  },
  {
    "path": "configs/det/_base_/datasets/coco_panoptic_instance_annotations.py",
    "chars": 2282,
    "preview": "dataset_type = 'CocoPanopticDatasetCustom'\ndata_root = 'data/coco/'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103."
  },
  {
    "path": "configs/det/_base_/datasets/kitti_step_dvps.py",
    "chars": 2439,
    "preview": "dataset_type = 'KITTISTEPDVPSDataset'\ndata_root = 'data/kitti-step'\n\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103"
  },
  {
    "path": "configs/det/_base_/datasets/kitti_step_vps.py",
    "chars": 2491,
    "preview": "dataset_type = 'KITTISTEPDVPSDataset'\ndata_root = 'data/kitti-step'\n\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103"
  },
  {
    "path": "configs/det/_base_/datasets/kitti_step_vps_trainval.py",
    "chars": 3011,
    "preview": "dataset_type = 'KITTISTEPDVPSDataset'\ndata_root = 'data/kitti-step'\n\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103"
  },
  {
    "path": "configs/det/_base_/datasets/mapillary_panoptic.py",
    "chars": 2403,
    "preview": "dataset_type = 'MapillaryPanopticDataset'\ndata_root = 'data/mapillary/'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, "
  },
  {
    "path": "configs/det/_base_/datasets/vipseg_dvps.py",
    "chars": 2339,
    "preview": "dataset_type = 'VIPSegDVPSDataset'\ndata_root = 'data/VIPSeg'\n\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103.53], s"
  },
  {
    "path": "configs/det/_base_/default_runtime.py",
    "chars": 275,
    "preview": "checkpoint_config = dict(interval=1)\n# yapf:disable\nlog_config = dict(\n    interval=50,\n    hooks=[\n        dict(type='T"
  },
  {
    "path": "configs/det/_base_/models/knet_citystep_s3_r50_fpn.py",
    "chars": 6399,
    "preview": "num_stages = 3\nnum_proposals = 100\nconv_kernel_size = 1\n\nmodel = dict(\n    type='KNet',\n    cityscapes=False,\n    kitti_"
  },
  {
    "path": "configs/det/_base_/models/knet_kitti_step_s3_r50_fpn.py",
    "chars": 7448,
    "preview": "num_stages = 3\nnum_proposals = 100\nconv_kernel_size = 1\n\nmodel = dict(\n    type='KNet',\n    cityscapes=False,\n    kitti_"
  },
  {
    "path": "configs/det/_base_/models/knet_s3_r50_deformable_fpn.py",
    "chars": 6463,
    "preview": "num_stages = 3\nnum_proposals = 100\nconv_kernel_size = 1\nmodel = dict(\n    type='KNet',\n    backbone=dict(\n        type='"
  },
  {
    "path": "configs/det/_base_/models/knet_s3_r50_fpn.py",
    "chars": 5372,
    "preview": "num_stages = 3\nnum_proposals = 100\nconv_kernel_size = 1\nmodel = dict(\n    type='KNet',\n    backbone=dict(\n        type='"
  },
  {
    "path": "configs/det/_base_/models/knet_s3_r50_fpn_panoptic.py",
    "chars": 6290,
    "preview": "num_stages = 3\nnum_proposals = 100\nconv_kernel_size = 1\nmodel = dict(\n    type='KNet',\n    backbone=dict(\n        type='"
  },
  {
    "path": "configs/det/_base_/models/knet_vipseg_s3_r50_fpn.py",
    "chars": 7679,
    "preview": "num_stages = 3\nnum_proposals = 100\nconv_kernel_size = 1\n\nnum_thing_classes = 58\nnum_stuff_classes = 66\nnum_classes = num"
  },
  {
    "path": "configs/det/_base_/models/video_knet_s3_r50_fpn_panoptic.py",
    "chars": 7299,
    "preview": "num_stages = 3\nnum_proposals = 100\nconv_kernel_size = 1\nmodel = dict(\n    type='VideoKNet',\n    backbone=dict(\n        t"
  },
  {
    "path": "configs/det/_base_/schedules/schedule_10e.py",
    "chars": 476,
    "preview": "# optimizer\n# this is different from the original 1x schedule that use SGD\noptimizer = dict(\n    type='AdamW',\n    lr=0."
  },
  {
    "path": "configs/det/_base_/schedules/schedule_1x.py",
    "chars": 479,
    "preview": "# optimizer\n# this is different from the original 1x schedule that use SGD\noptimizer = dict(\n    type='AdamW',\n    lr=0."
  },
  {
    "path": "configs/det/coco/knet_s3_r50_deformable_fpn_ms-3x_coco.py",
    "chars": 353,
    "preview": "_base_ = [\n    '../_base_/models/knet_s3_r50_deformable_fpn.py',\n    '../common/mstrain_3x_coco_instance.py'\n]\n\nmodel = "
  },
  {
    "path": "configs/det/coco/knet_s3_r50_fpn_ms-3x_coco-panoptic.py",
    "chars": 4534,
    "preview": "_base_ = [\n    '../_base_/models/knet_s3_r50_fpn_panoptic.py',\n    '../common/mstrain_3x_coco_panoptic.py'\n]\nnum_stages "
  },
  {
    "path": "configs/det/coco/knet_s3_r50_fpn_ms-3x_coco.py",
    "chars": 100,
    "preview": "_base_ = [\n    '../_base_/models/knet_s3_r50_fpn.py',\n    '../common/mstrain_3x_coco_instance.py'\n]\n"
  },
  {
    "path": "configs/det/coco/knet_s3_swin-b_deformable_fpn_ms-3x_coco.py",
    "chars": 720,
    "preview": "_base_ = [\n    '../_base_/models/knet_s3_r50_deformable_fpn.py',\n    '../common/mstrain_3x_coco_instance.py'\n]\n\nmodel = "
  },
  {
    "path": "configs/det/common/lsj_coco_panoptic_50e.py",
    "chars": 3450,
    "preview": "_base_ = '../_base_/default_runtime.py'\n# dataset settings\ndataset_type = 'CocoPanopticDatasetCustom'\ndata_root = 'data/"
  },
  {
    "path": "configs/det/common/mstrain_3x_coco_instance.py",
    "chars": 2618,
    "preview": "_base_ = '../_base_/default_runtime.py'\n# dataset settings\ndataset_type = 'CocoDataset'\ndata_root = 'data/coco/'\nimg_nor"
  },
  {
    "path": "configs/det/common/mstrain_3x_coco_panoptic_inst_anno.py",
    "chars": 3189,
    "preview": "_base_ = '../_base_/default_runtime.py'\n# dataset settings\ndataset_type = 'CocoPanopticDatasetCustom'\ndata_root = 'data/"
  },
  {
    "path": "configs/det/common/mstrain_3x_coco_panoptic_inst_anno_detr_aug.py",
    "chars": 4302,
    "preview": "_base_ = '../_base_/default_runtime.py'\n# dataset settings\ndataset_type = 'CocoPanopticDatasetCustom'\ndata_root = 'data/"
  },
  {
    "path": "configs/det/common/mstrain_64e_city_panoptic.py",
    "chars": 3260,
    "preview": "_base_ = '../_base_/default_runtime.py'\n# dataset settings\ndataset_type = 'CityscapesPanopticDataset'\ndata_root = 'data/"
  },
  {
    "path": "configs/det/knet_cityscapes_step/knet_s3_r50_fpn.py",
    "chars": 1328,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_city"
  },
  {
    "path": "configs/det/knet_cityscapes_step/knet_s3_swin_b_fpn.py",
    "chars": 1499,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_city"
  },
  {
    "path": "configs/det/knet_cityscapes_step/knet_s3_swin_l_fpn.py",
    "chars": 1500,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_city"
  },
  {
    "path": "configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train.py",
    "chars": 6464,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_kitt"
  },
  {
    "path": "configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_train_8e.py",
    "chars": 6459,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_kitt"
  },
  {
    "path": "configs/det/video_knet_kitti_step/video_knet_s3_swinb_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py",
    "chars": 6833,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_kitt"
  },
  {
    "path": "configs/det/video_knet_kitti_step/video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_joint_update.py",
    "chars": 6831,
    "preview": "_base_ = [\n    '../../_base_/schedules/schedule_1x.py',\n    '../../_base_/default_runtime.py',\n    '../../_base_/models/"
  },
  {
    "path": "configs/det/video_knet_kitti_step/video_knet_s3_swinl_rpn_1x_kitti_step_sigmoid_stride2_mask_embed_link_ffn_update_conv_short_track_fc.py",
    "chars": 6748,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_kitt"
  },
  {
    "path": "configs/det/video_knet_vipseg/video_knet_s3_r50_rpn_vipseg_mask_embed_link_ffn_joint_train.py",
    "chars": 4802,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_vips"
  },
  {
    "path": "configs/det/video_knet_vipseg/video_knet_s3_swin_b_rpn_vipseg_mask_embed_link_ffn_joint_train_8e.py",
    "chars": 5060,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_vips"
  },
  {
    "path": "configs/video_knet_vis/_base_/datasets/coco_instance.py",
    "chars": 1782,
    "preview": "dataset_type = 'CocoDataset'\ndata_root = 'data/coco/'\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103.53], std=[58.3"
  },
  {
    "path": "configs/video_knet_vis/_base_/datasets/youtubevis_2019.py",
    "chars": 2827,
    "preview": "# dataset settings\nimg_norm_cfg = dict(\n    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375],\n    to_rgb=True"
  },
  {
    "path": "configs/video_knet_vis/_base_/default_runtime.py",
    "chars": 328,
    "preview": "checkpoint_config = dict(interval=1)\nlog_config = dict(\n    interval=50,\n    hooks=[\n        dict(type='TextLoggerHook')"
  },
  {
    "path": "configs/video_knet_vis/_base_/models/knet_track_r50.py",
    "chars": 8286,
    "preview": "num_stages = 3\nnum_proposals = 100\nconv_kernel_size = 1\nmodel = dict(\n    type='KNetTrack',\n    backbone=dict(\n        t"
  },
  {
    "path": "configs/video_knet_vis/_base_/models/knet_track_r50_deformablefpn.py",
    "chars": 9415,
    "preview": "num_stages = 3\nnum_proposals = 100\nconv_kernel_size = 1\nmodel = dict(\n    type='KNetTrack',\n    backbone=dict(\n        t"
  },
  {
    "path": "configs/video_knet_vis/_base_/schedules/schedule_0.75x.py",
    "chars": 452,
    "preview": "# optimizer\noptimizer = dict(\n    type='AdamW',\n    lr=0.0001,\n    weight_decay=0.05,\n    paramwise_cfg=dict(\n        cu"
  },
  {
    "path": "configs/video_knet_vis/_base_/schedules/schedule_1x.py",
    "chars": 454,
    "preview": "# optimizer\noptimizer = dict(\n    type='AdamW',\n    lr=0.0001,\n    weight_decay=0.05,\n    paramwise_cfg=dict(\n        cu"
  },
  {
    "path": "configs/video_knet_vis/_base_/schedules/schedule_8e.py",
    "chars": 451,
    "preview": "# optimizer\noptimizer = dict(\n    type='AdamW',\n    lr=0.0001,\n    weight_decay=0.05,\n    paramwise_cfg=dict(\n        cu"
  },
  {
    "path": "configs/video_knet_vis/common/mstrain_3x_coco_instance.py",
    "chars": 2618,
    "preview": "_base_ = '../_base_/default_runtime.py'\n# dataset settings\ndataset_type = 'CocoDataset'\ndata_root = 'data/coco/'\nimg_nor"
  },
  {
    "path": "configs/video_knet_vis/video_knet_vis/knet_track_r50_1x_youtubevis.py",
    "chars": 177,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_trac"
  },
  {
    "path": "configs/video_knet_vis/video_knet_vis/knet_track_r50_deformable_fpn_1x_youtubevis.py",
    "chars": 254,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_trac"
  },
  {
    "path": "configs/video_knet_vis/video_knet_vis/knet_track_swinb_1x_youtubevis_8e.py",
    "chars": 759,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_8e.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_trac"
  },
  {
    "path": "configs/video_knet_vis/video_knet_vis/knet_track_swinb_deformable_1x_youtubevis.py",
    "chars": 2313,
    "preview": "_base_ = [\n    '../_base_/schedules/schedule_1x.py',\n    '../_base_/default_runtime.py',\n    '../_base_/models/knet_trac"
  },
  {
    "path": "external/cityscape_panoptic.py",
    "chars": 26733,
    "preview": "import contextlib\nimport io\nimport itertools\nimport os\nimport glob\nimport tempfile\nimport logging\nimport os.path as osp\n"
  },
  {
    "path": "external/cityscapes_step.py",
    "chars": 12694,
    "preview": "import os\n\nimport numpy as np\n\nfrom mmdet.datasets.builder import DATASETS\nfrom mmdet.datasets.pipelines.compose import "
  },
  {
    "path": "external/cityscapes_vps.py",
    "chars": 32564,
    "preview": "import contextlib\nimport io\nimport itertools\nimport os\nimport glob\nimport tempfile\nimport logging\nimport os.path as osp\n"
  },
  {
    "path": "external/coco_panoptic.py",
    "chars": 20704,
    "preview": "import contextlib\nimport io\nimport itertools\nimport logging\nimport tempfile\nimport os.path as osp\nfrom collections impor"
  },
  {
    "path": "external/dataset/dvps_pipelines/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "external/dataset/dvps_pipelines/loading.py",
    "chars": 8781,
    "preview": "import mmcv\nimport numpy as np\nfrom mmdet.core import BitmapMasks\nfrom mmdet.datasets.builder import PIPELINES\n\n\ndef bit"
  },
  {
    "path": "external/dataset/dvps_pipelines/transforms.py",
    "chars": 15104,
    "preview": "import mmcv\nimport numpy as np\nfrom mmdet.datasets.builder import PIPELINES\nfrom mmdet.datasets.pipelines import Resize,"
  },
  {
    "path": "external/dataset/dvps_pipelines/tricks.py",
    "chars": 529,
    "preview": "import numpy as np\nfrom mmdet.datasets.builder import PIPELINES\nfrom mmdet.datasets.pipelines import AutoAugment\n\n\n@PIPE"
  },
  {
    "path": "external/dataset/forecasting_pipelines/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "external/dataset/forecasting_pipelines/loading.py",
    "chars": 6380,
    "preview": "import mmcv\nimport numpy as np\nfrom mmdet.core import BitmapMasks\n\nfrom mmdet.datasets.builder import PIPELINES\n\n\ndef bi"
  },
  {
    "path": "external/dataset/forecasting_pipelines/transforms.py",
    "chars": 6345,
    "preview": "import mmcv\nimport numpy as np\nimport warnings\nfrom mmdet.datasets import PIPELINES\n\n\n@PIPELINES.register_module()\nclass"
  },
  {
    "path": "external/dataset/mIoU.py",
    "chars": 1392,
    "preview": "import numpy as np\n\n\ndef eval_miou(results, targets, num_classes, ignore_index=255):\n    total_area_intersect = np.zeros"
  },
  {
    "path": "external/dataset/pipelines/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "external/dataset/pipelines/formatting.py",
    "chars": 21852,
    "preview": "import numpy as np\nimport torch\nfrom mmcv.parallel import DataContainer as DC\nfrom mmdet.datasets.builder import PIPELIN"
  },
  {
    "path": "external/dataset/pipelines/loading.py",
    "chars": 8179,
    "preview": "import os.path as osp\nimport numpy as np\n\nimport mmcv\nfrom mmdet.core import BitmapMasks\n\nfrom mmdet.datasets.builder im"
  },
  {
    "path": "external/dataset/pipelines/test_time_aug.py",
    "chars": 4671,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\n\nimport mmcv\n\nfrom mmdet.datasets.builder import PIPELIN"
  },
  {
    "path": "external/dataset/pipelines/transforms.py",
    "chars": 41207,
    "preview": "import cv2\nimport mmcv\nimport numpy as np\nimport warnings\nfrom mmdet.datasets.builder import PIPELINES\nfrom mmdet.datase"
  },
  {
    "path": "external/evalhooks.py",
    "chars": 12357,
    "preview": "import os.path as osp\nimport warnings\nfrom math import inf\n\nimport mmcv\nimport torch.distributed as dist\nfrom mmcv.runne"
  },
  {
    "path": "external/ext/mask.py",
    "chars": 4591,
    "preview": "__author__ = 'tsungyi'\n\nimport pycocotools._mask as _mask\n\n# Interface for manipulating masks stored in RLE format.\n#\n# "
  },
  {
    "path": "external/ext/ytvos.py",
    "chars": 11439,
    "preview": "__author__ = 'ychfan'\n# Interface for accessing the YouTubeVIS dataset.\n\n# The following API functions are defined:\n#  Y"
  },
  {
    "path": "external/fcn_mask_head.py",
    "chars": 6369,
    "preview": "import numpy as np\nimport torch\nfrom mmdet.models.builder import HEADS\nfrom mmdet.models.roi_heads.mask_heads.fcn_mask_h"
  },
  {
    "path": "external/kitti_step_dvps.py",
    "chars": 16288,
    "preview": "import os\nimport random\nfrom typing import Dict, List\n\nimport copy\n\nimport mmcv\nimport numpy as np\nimport torch\n\nfrom mm"
  },
  {
    "path": "external/panoptic_fpn.py",
    "chars": 1022,
    "preview": "from mmdet.models.builder import DETECTORS\nfrom mmdet.models.detectors.two_stage import TwoStageDetector\n\n\n@DETECTORS.re"
  },
  {
    "path": "external/panoptic_head.py",
    "chars": 10906,
    "preview": "import torch\nfrom mmdet.core import bbox2result\nfrom mmdet.models.builder import HEADS, build_head\nfrom mmdet.models.roi"
  },
  {
    "path": "external/semantic_seg_head.py",
    "chars": 4250,
    "preview": "import torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import kaiming_init\nfrom mmcv.runner import auto_fp1"
  },
  {
    "path": "external/semkitti_dvps.py",
    "chars": 14872,
    "preview": "import os\nfrom typing import Dict, List\n\nimport copy\n\nimport mmcv\nimport numpy as np\nimport random\nimport torch\n\nfrom mm"
  },
  {
    "path": "external/test.py",
    "chars": 5265,
    "preview": "import os.path as osp\nimport time\n\nimport mmcv\nimport torch\nfrom mmcv.image import tensor2imgs\nfrom mmcv.runner import g"
  },
  {
    "path": "external/train.py",
    "chars": 5707,
    "preview": "import warnings\n\nimport torch\nfrom mmcv.parallel import MMDataParallel, MMDistributedDataParallel\nfrom mmcv.runner impor"
  },
  {
    "path": "external/utils.py",
    "chars": 307,
    "preview": "import io\n\nfrom panopticapi.utils import id2rgb\nfrom PIL import Image\n\n\ndef encode_panoptic(panoptic_results):\n    panop"
  },
  {
    "path": "external/vipseg_dvps.py",
    "chars": 29241,
    "preview": "import os\nimport random\nfrom typing import Dict, List\n\nimport copy\n\nimport mmcv\nimport numpy as np\nimport torch\n\nfrom mm"
  },
  {
    "path": "knet/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "knet/cross_entropy_loss.py",
    "chars": 8045,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmdet.models.builder import LOSSES\nfrom mmdet.mo"
  },
  {
    "path": "knet/det/dice_loss.py",
    "chars": 1861,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmdet.models.builder import LOSSES, build_loss\nf"
  },
  {
    "path": "knet/det/kernel_head.py",
    "chars": 20860,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import (ConvModule, bias_init_with_prob"
  },
  {
    "path": "knet/det/kernel_iter_head.py",
    "chars": 21416,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmdet.core import build_assigner, build_sampler\n"
  },
  {
    "path": "knet/det/kernel_update_head.py",
    "chars": 19986,
    "preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import (ConvModule, "
  },
  {
    "path": "knet/det/knet.py",
    "chars": 8658,
    "preview": "import torch\nimport torch.nn.functional as F\nfrom mmdet.models.builder import DETECTORS\nfrom mmdet.models.detectors impo"
  },
  {
    "path": "knet/det/mask_hungarian_assigner.py",
    "chars": 11353,
    "preview": "import numpy as np\nimport torch\n\nfrom mmdet.core import AssignResult, BaseAssigner, reduce_mean\nfrom mmdet.core.bbox.bui"
  },
  {
    "path": "knet/det/mask_pseudo_sampler.py",
    "chars": 8147,
    "preview": "import torch\n\nfrom mmdet.core.bbox import BaseSampler, SamplingResult\nfrom mmdet.core.bbox.builder import BBOX_SAMPLERS\n"
  },
  {
    "path": "knet/det/msdeformattn_decoder.py",
    "chars": 11635,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom "
  },
  {
    "path": "knet/det/semantic_fpn_wrapper.py",
    "chars": 28010,
    "preview": "import math\n\n\nimport torch\nimport torch.nn as nn\nfrom torch.nn import init\nfrom mmcv.cnn import ConvModule, normal_init\n"
  },
  {
    "path": "knet/det/utils.py",
    "chars": 3605,
    "preview": "from typing import List\n\nimport torch\nimport torch.nn.functional as F\nfrom mmdet.utils import get_root_logger\n\n\ndef sem2"
  },
  {
    "path": "knet/kernel_updator.py",
    "chars": 3995,
    "preview": "import torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import build_activation_layer, build_norm_layer\nfrom"
  },
  {
    "path": "knet/video/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "knet/video/dice_loss.py",
    "chars": 1829,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmdet.models.builder import LOSSES, build_loss\nfrom mmdet.models.losses.utils im"
  },
  {
    "path": "knet/video/kernel_head.py",
    "chars": 21491,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import (ConvModule, bias_init_with_prob"
  },
  {
    "path": "knet/video/kernel_iter_head.py",
    "chars": 37645,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmdet.core import build_assigner, build_sampler\n"
  },
  {
    "path": "knet/video/kernel_update_head.py",
    "chars": 34056,
    "preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import (ConvModule, "
  },
  {
    "path": "knet/video/knet.py",
    "chars": 10103,
    "preview": "import torch\nimport torch.nn.functional as F\nfrom mmdet.models.builder import DETECTORS\nfrom mmdet.models.detectors impo"
  },
  {
    "path": "knet/video/knet_quansi_dense.py",
    "chars": 27754,
    "preview": "import warnings\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule\nfrom mmd"
  },
  {
    "path": "knet/video/knet_quansi_dense_embed_fc.py",
    "chars": 31498,
    "preview": "import warnings\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nimport torch.nn as nn\nfrom mmcv.cnn impo"
  },
  {
    "path": "knet/video/knet_quansi_dense_embed_fc_joint_train.py",
    "chars": 31961,
    "preview": "import warnings\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nimport torch.nn as nn\nfrom mmcv.cnn impo"
  },
  {
    "path": "knet/video/knet_quansi_dense_embed_fc_toy_exp.py",
    "chars": 30783,
    "preview": "import warnings\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom mmdet.models.builder import DETECTO"
  },
  {
    "path": "knet/video/knet_quansi_dense_roi_gt_box.py",
    "chars": 29818,
    "preview": "import warnings\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule\nfrom mmd"
  },
  {
    "path": "knet/video/knet_quansi_dense_roi_gt_box_joint_train.py",
    "chars": 30407,
    "preview": "import warnings\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule\nfrom mmd"
  },
  {
    "path": "knet/video/knet_track_head.py",
    "chars": 20037,
    "preview": "import warnings\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom mmdet.models.builder import DETECTO"
  },
  {
    "path": "knet/video/knet_track_head_roi_align.py",
    "chars": 20452,
    "preview": "import warnings\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom mmdet.models.builder import DETECTO"
  },
  {
    "path": "knet/video/knet_uni_track.py",
    "chars": 19068,
    "preview": "import warnings\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom mmdet.models.builder import DETECTO"
  },
  {
    "path": "knet/video/mask_hungarian_assigner.py",
    "chars": 18234,
    "preview": "import numpy as np\nimport torch\nfrom mmdet.core import AssignResult, BaseAssigner, reduce_mean\nfrom mmdet.core.bbox.buil"
  },
  {
    "path": "knet/video/mask_pseudo_sampler.py",
    "chars": 2587,
    "preview": "import torch\n\nfrom mmdet.core.bbox import BaseSampler, SamplingResult\nfrom mmdet.core.bbox.builder import BBOX_SAMPLERS\n"
  },
  {
    "path": "knet/video/qdtrack/builder.py",
    "chars": 198,
    "preview": "from mmcv.utils import Registry\nfrom mmcv.cnn import build_model_from_cfg as build\n\nTRACKERS = Registry('tracker')\n\n\ndef"
  },
  {
    "path": "knet/video/qdtrack/losses/__init__.py",
    "chars": 143,
    "preview": "from .l2_loss import L2Loss\nfrom .multipos_cross_entropy_loss import MultiPosCrossEntropyLoss\n\n__all__ = ['L2Loss', 'Mul"
  },
  {
    "path": "knet/video/qdtrack/losses/l2_loss.py",
    "chars": 4451,
    "preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nfrom mmdet.models import LOSSES, weighted_loss\n\n\n@weighted_loss\nde"
  },
  {
    "path": "knet/video/qdtrack/losses/multipos_cross_entropy_loss.py",
    "chars": 2303,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmdet.models import LOSSES, weight_reduce_loss\n\n\ndef multi_pos_cross_entropy(pre"
  },
  {
    "path": "knet/video/qdtrack/track/__init__.py",
    "chars": 156,
    "preview": "from .similarity import cal_similarity\nfrom .transforms import track2result, restore_result\n\n__all__ = ['cal_similarity'"
  },
  {
    "path": "knet/video/qdtrack/track/similarity.py",
    "chars": 875,
    "preview": "import torch\nimport torch.nn.functional as F\n\n\ndef cal_similarity(key_embeds,\n                   ref_embeds,\n           "
  },
  {
    "path": "knet/video/qdtrack/track/transforms.py",
    "chars": 1077,
    "preview": "import numpy as np\nimport torch\n\n\ndef track2result(bboxes, labels, ids, num_classes):\n    valid_inds = ids > -1\n    bbox"
  },
  {
    "path": "knet/video/qdtrack/trackers/__init__.py",
    "chars": 149,
    "preview": "from .quasi_dense_embed_tracker import QuasiDenseEmbedTracker\nfrom .tao_tracker import TaoTracker\n\n__all__ = ['QuasiDens"
  },
  {
    "path": "knet/video/qdtrack/trackers/quasi_dense_embed_tracker.py",
    "chars": 8225,
    "preview": "import torch\nimport torch.nn.functional as F\nfrom mmdet.core import bbox_overlaps\n\nfrom ..builder import TRACKERS\n\n\n@TRA"
  },
  {
    "path": "knet/video/qdtrack/trackers/tao_tracker.py",
    "chars": 15073,
    "preview": "import os\nimport random\nfrom collections import defaultdict\n\nimport cv2\nimport mmcv\nimport numpy as np\nimport seaborn as"
  },
  {
    "path": "knet/video/track_heads.py",
    "chars": 28199,
    "preview": "\"\"\"\n    This file implements several tracking heads\n\"\"\"\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom mmcv."
  },
  {
    "path": "knet/video/tracker.py",
    "chars": 4542,
    "preview": "\"\"\"\nThis is a simple mask based tracker\nCopyright (c) https://github.com/xingyizhou/CenterTrack\nModified by Xiangtai Li\n"
  },
  {
    "path": "knet/video/util.py",
    "chars": 2561,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\"\"\"\nUtilities for bounding box manipulation and G"
  },
  {
    "path": "knet_vis/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "knet_vis/det/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "knet_vis/det/kernel_head.py",
    "chars": 20906,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import (ConvModule, bias_init_with_prob"
  },
  {
    "path": "knet_vis/det/kernel_iter_head.py",
    "chars": 14691,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom mmdet.core import build_assigner, build_sampler"
  },
  {
    "path": "knet_vis/det/kernel_update_head.py",
    "chars": 20807,
    "preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import (ConvModule, "
  },
  {
    "path": "knet_vis/det/knet.py",
    "chars": 5560,
    "preview": "import torch\nimport torch.nn.functional as F\n\nfrom mmdet.models.builder import DETECTORS\nfrom mmdet.models.detectors imp"
  },
  {
    "path": "knet_vis/det/mask_hungarian_assigner.py",
    "chars": 10836,
    "preview": "import numpy as np\nimport torch\n\nfrom mmdet.core import AssignResult, BaseAssigner\nfrom mmdet.core.bbox.builder import B"
  },
  {
    "path": "knet_vis/det/mask_pseudo_sampler.py",
    "chars": 3875,
    "preview": "import torch\n\nfrom mmdet.core.bbox import BaseSampler, SamplingResult\nfrom mmdet.core.bbox.builder import BBOX_SAMPLERS\n"
  },
  {
    "path": "knet_vis/det/semantic_fpn_wrapper.py",
    "chars": 8789,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, normal_init\nfrom mmdet.models.builder import NECKS\nf"
  },
  {
    "path": "knet_vis/det/utils.py",
    "chars": 1498,
    "preview": "import torch\n\n\ndef sem2ins_masks(gt_sem_seg,\n                  num_thing_classes=80):\n    \"\"\"Convert semantic segmentati"
  },
  {
    "path": "knet_vis/kernel_updator.py",
    "chars": 3996,
    "preview": "import torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import build_activation_layer, build_norm_layer\nfrom"
  },
  {
    "path": "knet_vis/tracker/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "knet_vis/tracker/kernel_frame_head.py",
    "chars": 21897,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import (ConvModule, bias_init_with_prob"
  },
  {
    "path": "knet_vis/tracker/kernel_frame_iter_head.py",
    "chars": 17158,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import build_norm_layer\nfrom mmcv.cnn.b"
  },
  {
    "path": "knet_vis/tracker/kernel_head.py",
    "chars": 21222,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import (ConvModule, bias_init_with_prob"
  },
  {
    "path": "knet_vis/tracker/kernel_iter_head.py",
    "chars": 16487,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom mmdet.core import build_assigner, build_sampler"
  },
  {
    "path": "knet_vis/tracker/kernel_update_head.py",
    "chars": 26141,
    "preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import (ConvModule, "
  },
  {
    "path": "knet_vis/tracker/mask_hungarian_assigner.py",
    "chars": 9009,
    "preview": "import numpy as np\nimport torch\n\nfrom mmdet.core import AssignResult, BaseAssigner\nfrom mmdet.core.bbox.builder import B"
  },
  {
    "path": "knet_vis/tracker/positional_encoding.py",
    "chars": 2789,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# # Modified by Bowen Cheng from: https://github.com/facebookresearch"
  },
  {
    "path": "knet_vis/tracker/semantic_fpn_wrapper3D.py",
    "chars": 8809,
    "preview": "import torch\nimport torch.nn as nn\nfrom mmcv.cnn import ConvModule, normal_init\nfrom mmdet.models.builder import NECKS\nf"
  },
  {
    "path": "knet_vis/tracker/track.py",
    "chars": 16210,
    "preview": "import copy\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom mmdet.models.builder import DETECT"
  },
  {
    "path": "mmtrack/datasets/coco_video_dataset.py",
    "chars": 20758,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport random\n\nimport numpy as np\nfrom mmcv.utils import print_log\nfrom "
  },
  {
    "path": "mmtrack/datasets/parsers/__init__.py",
    "chars": 110,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .coco_video_parser import CocoVID\n\n__all__ = ['CocoVID']\n"
  },
  {
    "path": "mmtrack/datasets/parsers/coco_video_parser.py",
    "chars": 5427,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom collections import defaultdict\n\nimport numpy as np\nfrom mmdet.datas"
  },
  {
    "path": "mmtrack/datasets/youtube_vis_dataset.py",
    "chars": 8622,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os.path\nimport os.path as osp\nimport tempfile\nimport zipfile\n\nimp"
  },
  {
    "path": "mmtrack/pipelines/__init__.py",
    "chars": 103,
    "preview": "from .formatting import *\nfrom .loading import *\nfrom .test_time_aug import *\nfrom .transforms import *"
  },
  {
    "path": "mmtrack/pipelines/formatting.py",
    "chars": 21776,
    "preview": "import numpy as np\nimport torch\nfrom mmcv.parallel import DataContainer as DC\nfrom mmdet.datasets.builder import PIPELIN"
  },
  {
    "path": "mmtrack/pipelines/loading.py",
    "chars": 8179,
    "preview": "import os.path as osp\nimport numpy as np\n\nimport mmcv\nfrom mmdet.core import BitmapMasks\n\nfrom mmdet.datasets.builder im"
  },
  {
    "path": "mmtrack/pipelines/test_time_aug.py",
    "chars": 4671,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\n\nimport mmcv\n\nfrom mmdet.datasets.builder import PIPELIN"
  },
  {
    "path": "mmtrack/pipelines/transforms.py",
    "chars": 41207,
    "preview": "import cv2\nimport mmcv\nimport numpy as np\nimport warnings\nfrom mmdet.datasets.builder import PIPELINES\nfrom mmdet.datase"
  },
  {
    "path": "mmtrack/transform.py",
    "chars": 2603,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport torch\nfrom mmdet.core import bbox2result\n\ndef "
  },
  {
    "path": "scripts/kitti_step_prepare.py",
    "chars": 2547,
    "preview": "import os\nimport shutil\n\ntrain_seqs = [0, 1, 3, 4, 5, 9, 11, 12, 15, 17, 19, 20]\nval_seqs = [2, 6, 7, 8, 10, 13, 14, 16,"
  },
  {
    "path": "scripts/visualizer.py",
    "chars": 2267,
    "preview": "import hashlib\nimport numpy as np\nimport cv2\n\ncity_labels = [\n    ('road', 0, (128, 64, 128)),\n    ('sidewalk', 1, (244,"
  },
  {
    "path": "swin/DetectRS.py",
    "chars": 12309,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch.nn as nn\nimport torch.utils.checkpoint as cp\nfrom mmcv.cnn "
  },
  {
    "path": "swin/ckpt_convert.py",
    "chars": 4950,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\n\n# This script consists of several convert functions which\n# can modify "
  },
  {
    "path": "swin/mix_transformer.py",
    "chars": 17579,
    "preview": "# ---------------------------------------------------------------\n# Copyright (c) 2021, NVIDIA Corporation. All rights r"
  },
  {
    "path": "swin/swin_checkpoint.py",
    "chars": 18801,
    "preview": "# Copyright (c) Open-MMLab. All rights reserved.\nimport io\nimport os\nimport os.path as osp\nimport pkgutil\nimport time\nim"
  },
  {
    "path": "swin/swin_transformer.py",
    "chars": 26619,
    "preview": "# --------------------------------------------------------\n# Swin Transformer\n# Copyright (c) 2021 Microsoft\n# Licensed "
  },
  {
    "path": "swin/swin_transformer_rfp.py",
    "chars": 38736,
    "preview": "import warnings\nfrom collections import OrderedDict\nfrom copy import deepcopy\n\nimport torch\nimport torch.nn as nn\nimport"
  },
  {
    "path": "swin/transformer.py",
    "chars": 14466,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport math\nimport warnings\nfrom typing import Sequence\n\nimport torch\nim"
  },
  {
    "path": "tools/dataset/cityscapes_instance_idmap.py",
    "chars": 1572,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport argparse\nimport os.path as osp\n\nimport mmcv\nfrom cityscapesscript"
  },
  {
    "path": "tools/dataset/youtubevis2coco.py",
    "chars": 5431,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport argparse\nimport copy\nimport os\nimport os.path as osp\nfrom collect"
  },
  {
    "path": "tools/dist_step_test.sh",
    "chars": 276,
    "preview": "#!/usr/bin/env bash\n\nCONFIG=$1\nCHECKPOINT=$2\nGPUS=$3\nPORT=${PORT:-29500}\n\nPYTHONPATH=\"$(dirname $0)/..\":$PYTHONPATH \\\npy"
  },
  {
    "path": "tools/dist_test.sh",
    "chars": 272,
    "preview": "#!/usr/bin/env bash\n\nCONFIG=$1\nCHECKPOINT=$2\nGPUS=$3\nPORT=${PORT:-29500}\n\nPYTHONPATH=\"$(dirname $0)/..\":$PYTHONPATH \\\npy"
  },
  {
    "path": "tools/dist_train.sh",
    "chars": 268,
    "preview": "#!/usr/bin/env bash\n\nCONFIG=$1\nGPUS=$2\nPORT=${PORT:-$((29500 + $RANDOM % 29))}\n\nPYTHONPATH=\"$(dirname $0)/..\":$PYTHONPAT"
  },
  {
    "path": "tools/dist_train_new.sh",
    "chars": 269,
    "preview": "#!/usr/bin/env bash\n\nCONFIG=$1\nGPUS=$2\nPORT=${PORT:-$((29500 + $RANDOM % 29))}\n\nPYTHONPATH=\"$(dirname $0)/..\":$PYTHONPAT"
  },
  {
    "path": "tools/dist_vps_test.sh",
    "chars": 275,
    "preview": "#!/usr/bin/env bash\n\nCONFIG=$1\nCHECKPOINT=$2\nGPUS=$3\nPORT=${PORT:-29500}\n\nPYTHONPATH=\"$(dirname $0)/..\":$PYTHONPATH \\\npy"
  },
  {
    "path": "tools/docker.sh",
    "chars": 232,
    "preview": "#!/bin/bash\n\nDATALOC=${DATALOC:-~/datasets}\nLOGLOC=${LOGLOC:-~/logger}\nIMG=${IMG:-\"harbory/openmmlab:latest\"}\n\ndocker ru"
  },
  {
    "path": "tools/eval_dstq.py",
    "chars": 5165,
    "preview": "import argparse\nimport os\n\nimport mmcv\nimport numpy as np\nimport torch\nfrom mmcv import ProgressBar\n\nimport torch.nn.fun"
  },
  {
    "path": "tools/eval_dstq_step.py",
    "chars": 4947,
    "preview": "import argparse\nimport os\n\nimport mmcv\nimport numpy as np\nimport torch\nfrom mmcv import ProgressBar\n\nimport torch.nn.fun"
  },
  {
    "path": "tools/eval_dstq_vipseg.py",
    "chars": 24577,
    "preview": "import argparse\nimport os\n\nimport mmcv\nimport numpy as np\nimport torch\nfrom mmcv import ProgressBar\n\nimport torch.nn.fun"
  },
  {
    "path": "tools/eval_dvpq_step.py",
    "chars": 9329,
    "preview": "import numpy as np\nfrom PIL import Image\nimport six\nimport os\nimport multiprocessing as mp\nimport argparse\n\nparser = arg"
  },
  {
    "path": "tools/eval_dvpq_vipseg.py",
    "chars": 27049,
    "preview": "import argparse\nimport os\n\nimport mmcv\nimport numpy as np\nimport six\nimport multiprocessing as mp\n\nCLASSES = [\n    {\"id\""
  },
  {
    "path": "tools/flops_counter.py",
    "chars": 20261,
    "preview": "'''\nCopyright (C) 2019 Sovrasov V. - All Rights Reserved\n * You may use, distribute and modify this code under the\n * te"
  },
  {
    "path": "tools/get_flops.py",
    "chars": 3258,
    "preview": "# Copyright (c) OpenMMLab. All rights reserved.\nimport argparse\n\nimport numpy as np\nimport torch\nfrom mmcv import Config"
  },
  {
    "path": "tools/inference_kitti_step.sh",
    "chars": 871,
    "preview": "#!/usr/bin/env bash\n\nCONFIG=$1\nCHECKPOINT=$2\nLOG=$3\n\n# configs/det/video_knet_kitti_step/video_knet_s3_r50_rpn_1x_kitti_"
  },
  {
    "path": "tools/slurm_test.sh",
    "chars": 566,
    "preview": "#!/usr/bin/env bash\n\nset -x\n\nPARTITION=$1\nJOB_NAME=$2\nCONFIG=$3\nCHECKPOINT=$4\nGPUS=${GPUS:-8}\nGPUS_PER_NODE=${GPUS_PER_N"
  },
  {
    "path": "tools/slurm_test_dvps.sh",
    "chars": 570,
    "preview": "#!/usr/bin/env bash\n\nset -x\n\nPARTITION=$1\nJOB_NAME=$2\nCONFIG=$3\nCHECKPOINT=$4\nGPUS=${GPUS:-1}\nGPUS_PER_NODE=${GPUS_PER_N"
  },
  {
    "path": "tools/slurm_test_step.sh",
    "chars": 570,
    "preview": "#!/usr/bin/env bash\n\nset -x\n\nPARTITION=$1\nJOB_NAME=$2\nCONFIG=$3\nCHECKPOINT=$4\nGPUS=${GPUS:-1}\nGPUS_PER_NODE=${GPUS_PER_N"
  },
  {
    "path": "tools/slurm_test_vis.sh",
    "chars": 569,
    "preview": "#!/usr/bin/env bash\n\nset -x\n\nPARTITION=$1\nJOB_NAME=$2\nCONFIG=$3\nCHECKPOINT=$4\nGPUS=${GPUS:-1}\nGPUS_PER_NODE=${GPUS_PER_N"
  },
  {
    "path": "tools/slurm_test_vps.sh",
    "chars": 580,
    "preview": "#!/usr/bin/env bash\n\nset -x\n\nPARTITION=$1\nJOB_NAME=$2\nCONFIG=$3\nCHECKPOINT=$4\nGPUS=${GPUS:-1}\nGPUS_PER_NODE=${GPUS_PER_N"
  },
  {
    "path": "tools/slurm_train.sh",
    "chars": 574,
    "preview": "#!/usr/bin/env bash\n\nset -x\n\nPARTITION=$1\nJOB_NAME=$2\nCONFIG=$3\nWORK_DIR=$4\nGPUS=${GPUS:-8}\nGPUS_PER_NODE=${GPUS_PER_NOD"
  },
  {
    "path": "tools/test.py",
    "chars": 8672,
    "preview": "import argparse\nimport os\nimport warnings\n\nimport mmcv\nimport torch\nfrom mmcv import Config, DictAction\nfrom mmcv.cnn im"
  },
  {
    "path": "tools/test_dvps.py",
    "chars": 11118,
    "preview": "import argparse\nimport os\nimport os.path as osp\nimport warnings\nimport numpy as np\nimport pickle\nimport json\nimport mmcv"
  },
  {
    "path": "tools/test_step.py",
    "chars": 10616,
    "preview": "import argparse\nimport os\nimport os.path as osp\nimport warnings\nimport numpy as np\nimport pickle\nimport json\nimport cv2\n"
  },
  {
    "path": "tools/test_vps.py",
    "chars": 10408,
    "preview": "import argparse\nimport os\nimport os.path as osp\nimport warnings\nimport numpy as np\nimport pickle\nimport json\nimport mmcv"
  },
  {
    "path": "tools/train.py",
    "chars": 8061,
    "preview": "import argparse\nimport copy\nimport os\nimport os.path as osp\nimport time\nimport warnings\n\nimport mmcv\nimport torch\nimport"
  },
  {
    "path": "tools/utils/DSTQ.py",
    "chars": 7239,
    "preview": "from typing import Sequence, Tuple\nimport collections\n\nimport numpy as np\n\nfrom .STQ import STQuality\n\n\nclass DSTQuality"
  },
  {
    "path": "tools/utils/STQ.py",
    "chars": 13650,
    "preview": "# This file is copied from deeplab2, please refer to https://github.com/google-research/deeplab2/\n# for details. Please "
  }
]

// ... and 37 more files (download for full content)

About this extraction

This page contains the full source code of the lxtGH/Video-K-Net GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 237 files (1.8 MB), approximately 460.2k tokens, and a symbol index with 1528 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!