gitextract_tlc1nw96/

├── DATASET_prepare.md
├── LICENSE
├── README.md
├── configs/
│   ├── coco/
│   │   └── instance-segmentation/
│   │       ├── Base-COCO-InstanceSegmentation.yaml
│   │       └── maskformer2_R50_bs16_50ep.yaml
│   └── youtubevis_2019/
│       ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
│       ├── Base-YouTubeVIS-VideoInstanceSegmentation_long.yaml
│       ├── Base-YouTubeVIS-VideoInstanceSegmentation_long_bs16.yaml
│       ├── swin/
│       │   └── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml
│       ├── video_maskformer2_R101_bs16_8ep.yaml
│       ├── video_maskformer2_R50_bs16_8ep.yaml
│       └── video_maskformer2_R50_bs16_8ep_swin.yaml
├── demo/
│   ├── README.md
│   ├── demo.py
│   └── predictor.py
├── demo_video/
│   ├── README.md
│   ├── demo.py
│   ├── predictor.py
│   └── visualizer.py
├── mask2former/
│   ├── __init__.py
│   ├── config.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── dataset_mappers/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   ├── coco_instance_new_baseline_dataset_mapper.py
│   │   │   ├── coco_panoptic_new_baseline_dataset_mapper.py
│   │   │   ├── mask_former_instance_dataset_mapper.py
│   │   │   ├── mask_former_panoptic_dataset_mapper.py
│   │   │   └── mask_former_semantic_dataset_mapper.py
│   │   └── datasets/
│   │       ├── __init__.py
│   │       ├── register_ade20k_full.py
│   │       ├── register_ade20k_instance.py
│   │       ├── register_ade20k_panoptic.py
│   │       ├── register_coco_panoptic_annos_semseg.py
│   │       ├── register_coco_stuff_10k.py
│   │       ├── register_mapillary_vistas.py
│   │       └── register_mapillary_vistas_panoptic.py
│   ├── evaluation/
│   │   ├── __init__.py
│   │   ├── __init__.py.new
│   │   └── instance_evaluation.py
│   ├── maskformer_model.py
│   ├── modeling/
│   │   ├── __init__.py
│   │   ├── backbone/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   └── swin.py
│   │   ├── criterion.py
│   │   ├── matcher.py
│   │   ├── meta_arch/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   ├── mask_former_head.py
│   │   │   └── per_pixel_baseline.py
│   │   ├── pixel_decoder/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   ├── fpn.py
│   │   │   ├── msdeformattn.py
│   │   │   └── ops/
│   │   │       ├── functions/
│   │   │       │   ├── __init__.py
│   │   │       │   └── ms_deform_attn_func.py
│   │   │       ├── make.sh
│   │   │       ├── modules/
│   │   │       │   ├── __init__.py
│   │   │       │   └── ms_deform_attn.py
│   │   │       ├── setup.py
│   │   │       ├── src/
│   │   │       │   ├── cpu/
│   │   │       │   │   ├── ms_deform_attn_cpu.cpp
│   │   │       │   │   └── ms_deform_attn_cpu.h
│   │   │       │   ├── cuda/
│   │   │       │   │   ├── ms_deform_attn_cuda.cu
│   │   │       │   │   ├── ms_deform_attn_cuda.h
│   │   │       │   │   └── ms_deform_im2col_cuda.cuh
│   │   │       │   ├── ms_deform_attn.h
│   │   │       │   └── vision.cpp
│   │   │       └── test.py
│   │   └── transformer_decoder/
│   │       ├── __init__.py
│   │       ├── mask2former_transformer_decoder.py
│   │       ├── maskformer_transformer_decoder.py
│   │       ├── position_encoding.py
│   │       └── transformer.py
│   ├── test_time_augmentation.py
│   └── utils/
│       ├── __init__.py
│       ├── __init__.py.new
│       └── misc.py
├── mask2former_video/
│   ├── __init__.py
│   ├── config.py
│   ├── data_video/
│   │   ├── __init__.py
│   │   ├── augmentation.py
│   │   ├── build.py
│   │   ├── combined_loader.py
│   │   ├── dataset_mapper.py
│   │   ├── datasets/
│   │   │   ├── __init__.py
│   │   │   ├── builtin.py
│   │   │   ├── ytvis.py
│   │   │   └── ytvis_api/
│   │   │       ├── __init__.py
│   │   │       ├── ytvos.py
│   │   │       └── ytvoseval.py
│   │   └── ytvis_eval.py
│   ├── modeling/
│   │   ├── __init__.py
│   │   ├── criterion.py
│   │   ├── matcher.py
│   │   └── transformer_decoder/
│   │       ├── __init__.py
│   │       ├── position_encoding.py
│   │       └── video_mask2former_transformer_decoder.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── __init__.py.new
│   │   └── memory.py
│   └── video_maskformer_model.py
├── mfvis_nococo/
│   ├── __init__.py
│   ├── configs/
│   │   └── youtubevis_2019/
│   │       ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml
│   │       ├── video_maskformer2_R101_bs16_8ep_coco.yaml
│   │       ├── video_maskformer2_R50_bs16_8ep.yaml
│   │       └── video_maskformer2_R50_bs16_8ep_coco.yaml
│   ├── mask2former/
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── dataset_mappers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   ├── coco_instance_new_baseline_dataset_mapper.py
│   │   │   │   ├── coco_panoptic_new_baseline_dataset_mapper.py
│   │   │   │   ├── mask_former_instance_dataset_mapper.py
│   │   │   │   ├── mask_former_panoptic_dataset_mapper.py
│   │   │   │   └── mask_former_semantic_dataset_mapper.py
│   │   │   └── datasets/
│   │   │       ├── __init__.py
│   │   │       ├── register_ade20k_full.py
│   │   │       ├── register_ade20k_instance.py
│   │   │       ├── register_ade20k_panoptic.py
│   │   │       ├── register_coco_panoptic_annos_semseg.py
│   │   │       ├── register_coco_stuff_10k.py
│   │   │       ├── register_mapillary_vistas.py
│   │   │       └── register_mapillary_vistas_panoptic.py
│   │   ├── evaluation/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.py.new
│   │   │   └── instance_evaluation.py
│   │   ├── maskformer_model.py
│   │   ├── modeling/
│   │   │   ├── __init__.py
│   │   │   ├── backbone/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   └── swin.py
│   │   │   ├── criterion.py
│   │   │   ├── matcher.py
│   │   │   ├── meta_arch/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   ├── mask_former_head.py
│   │   │   │   └── per_pixel_baseline.py
│   │   │   ├── pixel_decoder/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __init__.py.new
│   │   │   │   ├── fpn.py
│   │   │   │   ├── msdeformattn.py
│   │   │   │   └── ops/
│   │   │   │       ├── functions/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── ms_deform_attn_func.py
│   │   │   │       ├── make.sh
│   │   │   │       ├── modules/
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── ms_deform_attn.py
│   │   │   │       ├── setup.py
│   │   │   │       ├── src/
│   │   │   │       │   ├── cpu/
│   │   │   │       │   │   ├── ms_deform_attn_cpu.cpp
│   │   │   │       │   │   └── ms_deform_attn_cpu.h
│   │   │   │       │   ├── cuda/
│   │   │   │       │   │   ├── ms_deform_attn_cuda.cu
│   │   │   │       │   │   ├── ms_deform_attn_cuda.h
│   │   │   │       │   │   └── ms_deform_im2col_cuda.cuh
│   │   │   │       │   ├── ms_deform_attn.h
│   │   │   │       │   └── vision.cpp
│   │   │   │       └── test.py
│   │   │   └── transformer_decoder/
│   │   │       ├── __init__.py
│   │   │       ├── mask2former_transformer_decoder.py
│   │   │       ├── maskformer_transformer_decoder.py
│   │   │       ├── position_encoding.py
│   │   │       └── transformer.py
│   │   ├── test_time_augmentation.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── __init__.py.new
│   │       └── misc.py
│   ├── mask2former_video/
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── data_video/
│   │   │   ├── __init__.py
│   │   │   ├── augmentation.py
│   │   │   ├── build.py
│   │   │   ├── dataset_mapper.py
│   │   │   ├── datasets/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── builtin.py
│   │   │   │   ├── ytvis.py
│   │   │   │   └── ytvis_api/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── ytvos.py
│   │   │   │       └── ytvoseval.py
│   │   │   └── ytvis_eval.py
│   │   ├── modeling/
│   │   │   ├── __init__.py
│   │   │   ├── criterion.py
│   │   │   ├── matcher.py
│   │   │   └── transformer_decoder/
│   │   │       ├── __init__.py
│   │   │       ├── position_encoding.py
│   │   │       └── video_mask2former_transformer_decoder.py
│   │   ├── utils/
│   │   │   ├── __init__.py
│   │   │   └── memory.py
│   │   └── video_maskformer_model.py
│   ├── scripts/
│   │   ├── eval_8gpu_mask2former_r101_video.sh
│   │   ├── train_8gpu_mask2former_r101_video_coco.sh
│   │   ├── train_8gpu_mask2former_r50_video.sh
│   │   ├── train_8gpu_mask2former_r50_video_coco.sh
│   │   ├── visual_video_r101.sh
│   │   └── visual_video_r50.sh
│   └── train_net_video.py
├── requirements.txt
├── scripts/
│   ├── eval_8gpu_mask2former_r101_video.sh
│   ├── eval_8gpu_mask2former_r50_video.sh
│   ├── eval_8gpu_mask2former_swinl_video.sh
│   ├── train_8gpu_mask2former_r101_video.sh
│   ├── train_8gpu_mask2former_r50_video.sh
│   ├── train_8gpu_mask2former_swinl_video.sh
│   └── visual_video.sh
├── tools/
│   ├── README.md
│   ├── analyze_model.py
│   ├── convert-pretrained-swin-model-to-d2.py
│   ├── convert-torchvision-to-d2.py
│   ├── evaluate_coco_boundary_ap.py
│   └── evaluate_pq_for_semantic_segmentation.py
├── train_net.py
├── train_net_video.py
└── util/
    ├── __init__.py
    ├── box_ops.py
    ├── misc.py
    └── plot_utils.py